In [None]:
# https://towardsdatascience.com/the-complete-guide-to-time-series-forecasting-using-sklearn-pandas-and-numpy-7694c90e45c1

In [3]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

In [4]:
# Load the CO2 dataset bundled with statsmodels and keep its pandas DataFrame.
co2_dataset = sm.datasets.co2.load_pandas()
data = co2_dataset.data

In [None]:
# Plot the `co2` column against the DataFrame index on one large axes.
fig, ax = plt.subplots(figsize=(16, 11))
ax.plot(data['co2'])
ax.set(xlabel='Time', ylabel='CO2 concentration (ppmw)')
fig.autofmt_xdate()  # rotate/right-align the date tick labels so they do not overlap
fig.tight_layout()

In [6]:
# Fill any NaN values by interpolating between neighbouring rows.
# 'linear' is pandas' default method; it is spelled out here for clarity.
data = data.interpolate(method='linear')

In [7]:
# Supervised framing: pair every observation with the one that follows it.
# `y` is `co2` shifted up by one row; assign() returns a new frame, so `data`
# itself is left untouched.
df = data.assign(y=data['co2'].shift(periods=-1))
df

Unnamed: 0,co2,y
1958-03-29,316.1,317.3
1958-04-05,317.3,317.6
1958-04-12,317.6,317.5
1958-04-19,317.5,316.4
1958-04-26,316.4,316.9
...,...,...
2001-12-01,370.3,370.8
2001-12-08,370.8,371.2
2001-12-15,371.2,371.3
2001-12-22,371.3,371.5


In [8]:
# Everything except the final 104 rows is training data.
train = df.iloc[:-104]
# Hold out the final 104 rows, minus the very last one, leaving 103 test rows.
# (The last row is presumably dropped because shift(-1) leaves its `y` target
# empty -- confirm.)  .copy() keeps `test` an independent frame.
test = df.iloc[-104:-1].copy()

In [9]:
# Naive baseline: forecast the next value as the current observation.
# assign() returns a fresh frame with the extra column appended.
test = test.assign(baseline_pred=test['co2'])
test

Unnamed: 0,co2,y,baseline_pred
2000-01-08,368.5,369.0,368.5
2000-01-15,369.0,369.8,369.0
2000-01-22,369.8,369.2,369.8
2000-01-29,369.2,369.1,369.2
2000-02-05,369.1,369.6,369.1
...,...,...,...
2001-11-24,370.3,370.3,370.3
2001-12-01,370.3,370.8,370.3
2001-12-08,370.8,371.2,370.8
2001-12-15,371.2,371.3,371.2


In [12]:
# Peek at the training inputs as a flat 1-D NumPy array.
train['co2'].to_numpy()

array([316.1, 317.3, 317.6, ..., 368. , 368.2, 368.6])

In [10]:
# Selecting with a list of columns keeps the data 2-D, giving an
# (n_rows, 1) column vector without a separate reshape.
X_train = train[['co2']].to_numpy()
X_train

array([[316.1],
       [317.3],
       [317.6],
       ...,
       [368. ],
       [368.2],
       [368.6]])

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Build (n_rows, 1) arrays by selecting each column with a list.
# X must be 2-D for scikit-learn; y is kept as a column vector as well.
X_train = train[['co2']].to_numpy()
y_train = train[['y']].to_numpy()
X_test = test[['co2']].to_numpy()

# Fixed random_state so the fitted tree is reproducible; fit() returns the
# estimator itself, so construction and fitting can be chained.
dt_reg = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict for every test row and store the result beside the other columns.
dt_pred = dt_reg.predict(X_test)
test['dt_pred'] = dt_pred

In [None]:
#############################################################

In [None]:
import sys
import pandas as pd
sys.path.append("C:/Users/ping/MyDrive/py_files/python/py379/")
from myUtils import pickle_load, pickle_dump
# Base directory handed to the pickle_load / pickle_dump helpers
# (machine-specific absolute path).
path_pickle_dump = 'C:/Users/ping/OneDrive/Documents/jenn_bb_sales/'

# Notebook-wide pandas display limits, one call per fully qualified option name.
pd.set_option('display.max_colwidth', 12)
pd.set_option('display.max_columns', 18)
pd.set_option('display.width', 1200)
pd.set_option('display.max_rows', 100)

In [None]:
# # read Square csv and pickle df
# df1 = pd.read_csv('C:/Users/ping/OneDrive/Documents/jenn_bb_sales/items-2021-02-01-2021-10-02.csv')
# df2 = pd.read_csv('C:/Users/ping/OneDrive/Documents/jenn_bb_sales/items-2021-09-06-2022-09-07.csv')
# df = pd.concat([df1, df2])
# df = df.drop_duplicates()  # drop overlap between the 2 csv files
# # df.sort_values(by=['Date', 'Time'])
# # df.reset_index(drop=True)  # create new index
# df = df.sort_values(by=['Date', 'Time'])
# df = df.reset_index(drop=True)  # create new index
# pickle_dump(df, path_pickle_dump, 'df_sq_download')
# df.shape

In [None]:
# # clean data
# df = pickle_load(path_pickle_dump, 'df_sq_download')
# df_shape_before_clean = df.shape
# # select only these columns
# df = df[['Date', 'Time', 'Category', 'Item', 'Qty',
#        'Price Point Name', 'Gross Sales',
#        'Discounts', 'Net Sales', 'Tax', 
#        'Device Name', 'Notes', 'Event Type', 
#        'Dining Option', 'Customer Name']]
# # strip leading $, convert from object to float
# cols_to_clean = ['Gross Sales', 'Discounts', 'Net Sales', 'Tax']
# for col in cols_to_clean:
#   df[col] = df[col].str.split('$').str[-1]  # strip leading $
#   df[col] = pd.to_numeric(df[col])  # convert from object to float
# # remove wholesale customer "Canyon Coffee"
# print(f'df.shape before removing whole sale customer: {df.shape}')
# df = df[df['Customer Name'] != "Canyon Coffee"]
# print(f'df.shape after removing whole sale customer:  {df.shape}')
# df_shape_after_clean = df.shape
# pickle_dump(df, path_pickle_dump, 'df_clean')
# print(f'df.shape before clean:  {df_shape_before_clean}')
# print(f'df.shape after clean:   {df_shape_after_clean}')

In [None]:
# df = pickle_load(path_pickle_dump, 'df_clean')
# print(f'df.shape: {df.shape}')

In [None]:
# # sum daily sales into a series
# gross = df.groupby('Date')['Gross Sales'].sum()
# df_gross = pd.DataFrame(gross)
# df_gross.index = pd.to_datetime(df_gross.index)  # change index to datetime before concat

In [None]:
# # Los Angeles temperature and precipitation data
# # https://www.ncei.noaa.gov/cdo-web/datatools/findstation
# df_temp = pd.read_csv('C:/Users/ping/OneDrive/Documents/jenn_bb_sales/la_temp.csv')
# df_temp = df_temp.set_index("Date")
# df_temp.index = pd.to_datetime(df_temp.index)  # change index to datetime before concat

In [None]:
# df_gross_temp = pd.concat([df_gross, df_temp], axis=1, join='inner')
# pickle_dump(df_gross_temp, path_pickle_dump, 'df_gross_temp')

In [None]:
# Load the object stored under the name 'df_gross_temp' via the project's
# pickle_load helper, passing path_pickle_dump as its first argument.
# NOTE(review): presumably this is the frame the commented-out cells above build
# by inner-joining daily gross sales with the LA temperature data -- confirm.
# NOTE(review): if pickle_load wraps pickle.load, only use it on trusted files.
df_gross_temp = pickle_load(path_pickle_dump, 'df_gross_temp')
df_gross_temp  # last expression in the cell, so the notebook displays it