In [None]:
# Make the auto_forecast package importable when the notebook is run from inside it.
# NOTE(review): this is an absolute path on one developer's machine; it has to be
# edited (or replaced by a relative path / an installed package) to run elsewhere.
import sys
sys.path.append('/Users/mollyliebeskind/Documents/auto_forecast/auto_forecast')

# Standard data-manipulation imports.
import pandas as pd

# Internal package functions.
# NOTE(review): the wildcard imports hide where the helpers used below
# (aggregate_by_time, difference_data, create_lag_data, create_train_test,
# DataScaler, the plot_* functions, ...) are defined -- presumably
# src.data_processing and src.plotting; confirm and consider explicit imports.
from src.plotting import *
from src.data_processing import *
from src.modeling import SalesForecasting

: 

# Run the package

In [None]:
# --- Run configuration ------------------------------------------------------

# Column names used throughout the notebook.
date_col = 'date'
value_col = 'sales'
# Name of the column that difference_data is asked to create.
diffed_value_col = f"{value_col}_differenced"

# Frequency code handed to plot_time_series as its last argument.
mean_freq = 'Y'
# Passed to create_train_test as its second argument.
forecast_horizon = 12

# Model names handed to SalesForecasting(model_list=...).
model_list = [
    'LinearRegression',
    'RandomForest',
    'XGBoost',
    'LSTM',
    'ARIMA',
]

# End-to-end run: load -> aggregate -> difference -> lag features -> split ->
# scale -> fit -> predict. The names daily_data, monthly_data, supervised_data,
# train, test and model are reused by the plotting cells further down.

# Load the raw data; the file is resolved relative to the working directory.
daily_data = pd.read_csv('train.csv')

# Aggregate with resample_freq='M' and aggregate='sum'
# (presumably month-level totals -- confirm in aggregate_by_time).
monthly_data = aggregate_by_time(
    daily_data,
    date_col,
    resample_freq='M',
    aggregate='sum',
)

# Ask difference_data for a differenced version of value_col, stored under
# diffed_value_col. monthly_data is expected to keep the original value_col too,
# because a later cell plots both columns from this frame.
monthly_data = difference_data(
    data=monthly_data,
    date_col=date_col,
    value_col=value_col,
    diff_value_col_name=diffed_value_col,
)

# Build lag features from the differenced column.
# NOTE(review): 13 lags is a magic number -- presumably 12 months plus one; confirm.
supervised_data = create_lag_data(
    data=monthly_data,
    date_col=date_col,
    value_col=diffed_value_col,
    lags=13,
)

# Every column that is not an identifier, the date, or a target is a feature.
non_feature_cols = ['store', 'item', date_col, value_col, diffed_value_col]
x_cols = list(supervised_data.drop(columns=non_feature_cols).columns)

# Keep only date, features and the differenced target, in that order.
supervised_data = supervised_data[[date_col] + x_cols + [diffed_value_col]]
train, test = create_train_test(supervised_data, forecast_horizon)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = DataScaler()
print('Train data shape: ', train.shape)
train_scaled = scaler.fit_transform(train).set_index(date_col)
test_scaled = scaler.transform(test).set_index(date_col)
# Distinct label so the two printed shapes can be told apart in the output.
print('Scaled train data shape: ', train_scaled.shape)

# Fit every model in model_list, then predict on the held-out split. The scaler
# is handed to predict (presumably to undo the scaling -- confirm in src.modeling).
model = SalesForecasting(model_list=model_list)
model.fit(train_scaled[x_cols], train_scaled[[diffed_value_col]])
output = model.predict(
    test_scaled[x_cols],
    y_values=test_scaled[[diffed_value_col]],
    scaler=scaler,
)

: 

In [None]:
# Call the fitted forecaster's plot_results(); requires the pipeline cell to have
# been run so that `model` exists. Assigning the return value keeps the notebook
# from echoing its repr under the figure.
results_plot = model.plot_results()

: 

In [None]:
# Call the fitted forecaster's plot_errs() (presumably per-model error metrics --
# confirm in src.modeling). Requires `model` from the pipeline cell.
errors_plot = model.plot_errs()

: 

In [None]:
# Tabulate whatever the forecaster kept in `stored_models` and flip it so the
# frame's original columns become rows (presumably one row per model -- confirm).
output_df = pd.DataFrame(model.stored_models).transpose()
# Bare last expression: rendered by the notebook as a rich table.
output_df

: 

# Visualize EDA

In [None]:
# Histogram helper applied to the value column of the raw frame loaded from
# train.csv (what "periodic" means is defined by the helper -- confirm there).
plot_periodic_values_hist(daily_data, value_col)

: 

In [None]:
# Plot the value column of the raw frame with 'store' as the only grouping column
# (the aggregation used per group is decided inside the helper).
plot_values_per_group(daily_data, value_col, ['store'])

: 

In [None]:
# Time-series plot of the original (undifferenced) value column from monthly_data.
# mean_freq ('Y') is the last argument -- presumably the frequency of an overlaid
# mean line; confirm in the helper.
plot_time_series(monthly_data, date_col, value_col, mean_freq)

: 

# Visualize Data Processing

In [None]:
# Same time-series helper, this time on the differenced column of monthly_data,
# to show the effect of the differencing step.
plot_time_series(monthly_data, date_col, diffed_value_col, mean_freq)

: 

In [None]:
# Correlation diagnostics on the differenced column (presumably autocorrelation
# and partial-autocorrelation plots, going by the helper's name -- confirm).
plt_acf_pcf(monthly_data, date_col, diffed_value_col)

: 

In [None]:

# Plot the differenced target together with lag columns from supervised_data.
# 'lag' is presumably the name prefix used to find the lag columns and num_lags=3
# the number of them to draw -- confirm against the helper's signature.
plot_lag_cols(supervised_data, date_col, diffed_value_col, 'lag', num_lags=3)

: 

In [None]:
# Show the (unscaled) train and test splits of the differenced column on one
# 12x4 figure, to check where create_train_test placed the split.
visualize_train_test(train, test, date_col, diffed_value_col, figsize=(12,4))

: 