# Retail Sales Forecast Project by Shizheng Hou, Chuke Xu and Lei

# Preprocessing

## Import needed packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

from sklearn import linear_model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_validate

## Read and Import data

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the three raw tables; pandas is asked to parse the "Date" column of
# the sales and features files.
# NOTE(review): no `dayfirst`/`format` is given, so pandas guesses the date
# layout — confirm the files are not dd/mm/yyyy; if they are, pass dayfirst=True.
date_columns = ["Date"]
df_sales = pd.read_csv(
    '/kaggle/input/retaildataset/sales data-set.csv',
    parse_dates=date_columns,
)
df_stores = pd.read_csv(
    '/kaggle/input/retaildataset/stores data-set.csv',
)
df_features = pd.read_csv(
    '/kaggle/input/retaildataset/Features data set.csv',
    parse_dates=date_columns,
)

In [None]:
df_sales.head()

In [None]:
df_stores.head()

In [None]:
df_features.tail()

## Merge data

In [None]:
# Join sales with the store attributes, then with the features table.
# No `on=` is passed, so each merge joins on the columns both frames share.
sales_with_stores = df_sales.merge(df_stores)
df = sales_with_stores.merge(df_features)
df.tail()

In [None]:
# Draw a reproducible random 80% of the merged rows as the training set.
train_data = df.sample(random_state=123, frac=0.8)
train_data

In [None]:
# Hold out every row that was not sampled into the training set.
# .copy() makes the hold-out an independent frame, so the in-place column
# additions performed on it later do not act on a slice of `df`
# (which pandas reports as SettingWithCopyWarning).
test_data = df.loc[~df.index.isin(train_data.index)].copy()
test_data

In [None]:
train_data.describe()

## Transform date into Month, Year, Week and Day columns

In [None]:
# Derive calendar features from the timestamp, then drop the raw column.
train_dates = train_data["Date"]
train_data["Month"] = train_dates.dt.month
train_data["Year"] = train_dates.dt.year
# `Series.dt.weekofyear` was removed in pandas 2.0; `isocalendar().week` is
# its replacement. Cast to int64 so the values (and the later one-hot column
# names such as "Week5") are the same as the old accessor produced.
train_data["Week"] = train_dates.dt.isocalendar().week.astype("int64")
train_data["Day"] = train_dates.dt.dayofyear
train_data.drop(['Date'], axis=1, inplace=True)

In [None]:
train_data.describe()

## Data frame information

In [None]:
train_data.info()

## Number of Nulls for each Feature

In [None]:
train_data.isnull().sum()

## Replace all missing values with zero

In [None]:
# Fill missing markdown values with 0, as the section heading states and as
# the test-set preprocessing does. The previous code filled with the column
# mean here, so train and test were imputed differently.
# The filled columns are assigned back instead of calling
# `fillna(..., inplace=True)` on a column selection, which can end up
# modifying a temporary copy rather than `train_data`.
lst = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
train_data[lst] = train_data[lst].fillna(0)
train_data.head()

## Encoding

In [None]:
# One-hot encode the categorical columns one after another. For each column:
# build the indicator columns, rename them, append them to the frame and
# drop the source column. The order below fixes the final column order.
encoders = [
    ("Type", {"A": "TypeA", "B": "TypeB", "C": "TypeC"}),
    ("IsHoliday", {False: "Not Holiday", True: "Holiday"}),
    ("Store", lambda x: 'Store' + str(x)),
    ("Dept", lambda x: 'Dept' + str(x)),
    ("Week", lambda x: 'Week' + str(x)),
]
for source_column, renamer in encoders:
    m = pd.get_dummies(train_data[source_column]).rename(columns=renamer)
    train_data = pd.concat([train_data, m], axis=1).drop(source_column, axis=1)

In [None]:
train_data

In [None]:
# Apply the training-set feature engineering to the hold-out set.

# Work on an independent frame so the column assignments below do not act on
# a slice of `df`.
test_data = test_data.copy()

# Calendar features. `Series.dt.weekofyear` was removed in pandas 2.0;
# `isocalendar().week` is its replacement (cast to int64 so the values and
# the resulting "Week<N>" column names are unchanged).
test_dates = test_data["Date"]
test_data["Month"] = test_dates.dt.month
test_data["Year"] = test_dates.dt.year
test_data["Week"] = test_dates.dt.isocalendar().week.astype("int64")
test_data["Day"] = test_dates.dt.dayofyear
test_data = test_data.drop(['Date'], axis=1)

# Missing markdowns become 0. Assigning back avoids `fillna(inplace=True)`
# on a column selection, which can modify a temporary copy.
lst = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
test_data[lst] = test_data[lst].fillna(0)

# One-hot encode the categorical columns in the same order as for training.
encoders = [
    ("Type", {"A": "TypeA", "B": "TypeB", "C": "TypeC"}),
    ("IsHoliday", {False: "Not Holiday", True: "Holiday"}),
    ("Store", lambda x: 'Store' + str(x)),
    ("Dept", lambda x: 'Dept' + str(x)),
    ("Week", lambda x: 'Week' + str(x)),
]
for source_column, renamer in encoders:
    m = pd.get_dummies(test_data[source_column]).rename(columns=renamer)
    test_data = pd.concat([test_data, m], axis=1).drop(source_column, axis=1)

# The dummies above are derived from the test rows alone, so a category that
# occurs in only one of the two splits would give train and test different
# columns. Align to the training layout: categories unseen in the test rows
# become all-zero columns, and categories unseen in training are dropped
# because the fitted models have no input for them.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [None]:
test_data

## Normalization

In [None]:
df_weekly_sales = train_data['Weekly_Sales']
train_data = train_data / train_data.max()
train_data['Weekly_Sales'] = df_weekly_sales

In [None]:
train_data

In [None]:
test_weekly_sales = test_data['Weekly_Sales']
test_data = test_data / test_data.max()
test_data['Weekly_Sales'] = test_weekly_sales

In [None]:
test_data

## Split X and y

In [None]:
# Separate the predictors from the target for the training set.
X = train_data.drop(columns=['Weekly_Sales'])
y = train_data.loc[:, 'Weekly_Sales']

In [None]:
# Separate the predictors from the target for the hold-out set.
X_test = test_data.drop(columns=['Weekly_Sales'])
y_test = test_data.loc[:, 'Weekly_Sales']

## (PCA)

In [None]:
# from sklearn.decomposition import PCA

# train_data_drop = train_data.drop('Weekly_Sales', axis=1)

# pca = PCA(100)
# pca_train_data = pca.fit_transform(train_data_drop)

# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance')

## Heat Map

In [None]:
# sns.set(rc={'figure.figsize':(20,18)})
# sns.heatmap(df.corr(), center = 0, annot = True)

## (Standardization)

In [None]:
# from sklearn import preprocessing
# scaler = preprocessing.StandardScaler()

# scaler_list = ['Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Month', 'Year', 'Day']

# scaler_data = train_data[scaler_list]
# scaler_data = pd.DataFrame(scaler.fit_transform(scaler_data))

# train_data.drop(scaler_list, axis=1, inplace=True).reset_index(inplace=True)

# # train_data = pd.concat([scaler_data, train_data], ignore_index=True, axis=1)

# # train_data = pd.DataFrame(scaler.fit_transform(X))
# # train_data['Weekly_Sales'] = df_weekly_sales

## Cross Validation

In [None]:
# from sklearn import linear_model
# from sklearn.model_selection import cross_validate
# from sklearn.metrics import make_scorer
# from sklearn.metrics import confusion_matrix
# import lightgbm as lgb

In [None]:
# X = np.array(train_data.drop('Weekly_Sales', axis=1))
# y = np.array(train_data["Weekly_Sales"])
# my_model = lgb.LGBMRegressor(objective='regression', num_leaves=150, max_depth = 14, learning_rate=0.5, n_estimators=2000, reg_alpha=0.5)
# cv_results = cross_validate(my_model, X, y, scoring = "r2", cv = 10)
# sorted(cv_results.keys())
# cv_results['test_score']

# Submission 0: Linear regression

## OLS Regression

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Ordinary least squares of Weekly_Sales on all predictors. No constant
# column is added; the design matrix is used exactly as built above.
reg_ols = sm.OLS(endog=y, exog=X)
est = reg_ols.fit()
est.summary()

In [None]:
# In-sample metrics for the OLS model.
y_pred = est.predict(X)

r_2 = r2_score(y, y_pred)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Hold-out metrics for the OLS model.
y_test_pred = est.predict(X_test)

r_2 = r2_score(y_test, y_test_pred)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae = mean_absolute_error(y_test, y_test_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

# Submission 1: Lasso, Ridge and Polynomial regression

## Lasso regression

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Fit a Lasso model with default settings and report in-sample metrics.
reg_lasso = Lasso().fit(X, y)

y_pred = reg_lasso.predict(X)

r_2 = reg_lasso.score(X, y)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Hold-out metrics for the Lasso model.
y_test_pred = reg_lasso.predict(X_test)

r2_test = reg_lasso.score(X_test, y_test)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)

print("R squared:", r2_test)
print("RMSE:", rmse_test)
print("MAE:", mae_test)

## Ridge regression

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Fit a Ridge model with default settings and report in-sample metrics.
reg_ridge = Ridge().fit(X, y)

y_pred = reg_ridge.predict(X)

r_2 = reg_ridge.score(X, y)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Hold-out metrics for the Ridge model.
y_test_pred = reg_ridge.predict(X_test)

r_2 = reg_ridge.score(X_test, y_test)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae = mean_absolute_error(y_test, y_test_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

## Polynomial regression

In [None]:
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn import linear_model

In [None]:
# poly = PolynomialFeatures(degree=2)
# X = poly.fit_transform(X)

# clf = linear_model.LinearRegression().fit(X, y)
# clf.score(X, y)

# Submission 2: GBDT, XGBoost and LightGBM

## Gradient Boosting Decision Tree

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Fit a gradient-boosted tree model with default settings and report
# in-sample metrics.
reg_gbdt = GradientBoostingRegressor().fit(X, y)

y_pred = reg_gbdt.predict(X)

r_2 = reg_gbdt.score(X, y)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Hold-out metrics for the gradient-boosted tree model.
y_test_pred = reg_gbdt.predict(X_test)

r_2 = reg_gbdt.score(X_test, y_test)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae = mean_absolute_error(y_test, y_test_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

## XGBoost

In [None]:
from xgboost import XGBRegressor

In [None]:
# Fit an XGBoost regressor (2000 trees, squared-error objective) and report
# in-sample metrics.
reg_xgb = XGBRegressor(objective='reg:squarederror', n_estimators=2000)
reg_xgb = reg_xgb.fit(X, y)

y_pred = reg_xgb.predict(X)

# Bug fix: the score used to be stored in `r2` while the print below showed
# `r_2`, i.e. the stale value left over from the previous model's cell.
r_2 = reg_xgb.score(X, y)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Hold-out metrics for the XGBoost model.
y_test_pred = reg_xgb.predict(X_test)

r2_test = reg_xgb.score(X_test, y_test)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)

print("R squared:", r2_test)
print("RMSE:", rmse_test)
print("MAE:", mae_test)

## LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM regressor on the full training set.
# - `verbose` is no longer accepted by `fit()` (removed in LightGBM 4.0);
#   logging is silenced through the constructor's `verbose=-1` instead.
# - `scale_pos_weight` was dropped: LightGBM documents it as used only by
#   the binary / multiclassova objectives, so it had no role in regression.
reg_lgb = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=60,
    max_depth=9,
    learning_rate=0.5,
    n_estimators=2000,
    reg_alpha=0.6,
    subsample=0.6,
    colsample_bytree=0.8,
    verbose=-1,
)
reg_lgb.fit(X, y)

In [None]:
# In-sample metrics for the LightGBM model.
y_pred = reg_lgb.predict(X)

r_2 = reg_lgb.score(X, y)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Hold-out metrics for the LightGBM model.
y_test_pred = reg_lgb.predict(X_test)

r2_test = reg_lgb.score(X_test, y_test)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)

print("R squared:", r2_test)
print("RMSE:", rmse_test)
print("MAE:", mae_test)

# Submission 3: Neural Network

In [None]:
def build_model(n_features=None, learning_rate=0.001):
    """Build and compile the fully connected regression network.

    Four ReLU hidden layers (256, 128, 64, 32 units) feed a single linear
    output unit. The model is compiled with MSE loss, an RMSprop optimizer,
    and reports MAE and MSE as metrics.

    Args:
        n_features: Width of the input layer. When omitted, the number of
            columns of the notebook-level ``X`` is used (the previous,
            hard-wired behaviour).
        learning_rate: Learning rate passed to RMSprop.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    if n_features is None:
        # Backward-compatible default: size the input from the global X.
        n_features = len(X.keys())

    model = keras.Sequential([
        layers.Dense(256, activation='relu', input_shape=[n_features]),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])

    optimizer = tf.keras.optimizers.RMSprop(learning_rate)

    model.compile(loss='mse',
                  optimizer=optimizer,
                  metrics=['mae', 'mse'])
    return model

model = build_model()

In [None]:
model.summary()

In [None]:
# Per-feature summary statistics of the training set (target excluded),
# laid out with one row per feature.
train_stats = train_data.describe().drop(columns="Weekly_Sales").transpose()
train_stats

In [None]:
def norm(x):
    """Standardise *x* column-wise with the training mean and std.

    Args:
        x: DataFrame whose columns match the rows of ``train_stats``.

    Returns:
        ``(x - mean) / std`` using the ``mean`` and ``std`` columns of
        ``train_stats``.
    """
    # A feature that is constant in the training set has std == 0, which
    # would turn the whole column into NaN/inf and poison the network's
    # loss. Dividing by 1 instead leaves such a column as (x - mean).
    safe_std = train_stats['std'].replace(0, 1)
    return (x - train_stats['mean']) / safe_std


norm_X = norm(X)
norm_X_test = norm(X_test)

In [None]:
class PrintDot(keras.callbacks.Callback):
    """Progress indicator: one dot per epoch, a new line every 100 epochs."""

    def on_epoch_end(self, epoch, logs):
        # Start a fresh line on epochs 0, 100, 200, ... before the dot.
        line_break = '' if epoch % 100 else '\n'
        print(line_break + '.', end='')

# Train on the standardised features for a fixed number of epochs, holding
# out the last 20% of the rows for validation; progress is shown as dots.
EPOCHS = 200

history = model.fit(
    x=norm_X,
    y=y,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

In [None]:
# Tabulate the per-epoch metrics and show the last few epochs.
hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
hist.tail()

In [None]:
def plot_history(history):
    """Plot training and validation error per epoch.

    Draws two figures from a Keras ``History`` object: mean absolute error
    (y-axis clipped to 0..3000) and mean squared error (auto-scaled).

    Args:
        history: Object exposing ``history`` (dict with 'mae', 'val_mae',
            'mse', 'val_mse') and ``epoch``.
    """
    hist = pd.DataFrame(history.history).assign(epoch=history.epoch)
    epochs = hist['epoch']

    # (metric key, y-axis label, optional y-axis limits)
    panels = (
        ('mae', 'Mean Abs Error [Weekly_Sales]', [0, 3000]),
        ('mse', 'Mean Square Error [Weekly_Sales]', None),
    )
    for metric, y_label, y_limits in panels:
        plt.figure()
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.plot(epochs, hist[metric], label='Train Error')
        plt.plot(epochs, hist['val_' + metric], label='Val Error')
        if y_limits is not None:
            plt.ylim(y_limits)
        plt.legend()
    plt.show()


plot_history(history)

In [None]:
# In-sample metrics for the neural network.
y_pred = model.predict(norm_X)

r_2 = r2_score(y, y_pred)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

In [None]:
# Hold-out metrics for the neural network.
y_test_pred = model.predict(norm_X_test)

r2_test = r2_score(y_test, y_test_pred)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)

print("R squared:", r2_test)
print("RMSE:", rmse_test)
print("MAE:", mae_test)

# Submission 4: Split train dataset and train two models (using LightGBM)

In [None]:
# Partition the training rows at a weekly-sales threshold of 40000:
# rows at or above it are "outliers", rows below it are "normal".
weekly_sales = train_data["Weekly_Sales"]
outliers_data = train_data.loc[weekly_sales >= 40000]
normal_data = train_data.loc[weekly_sales < 40000]

## Outliers data

In [None]:
# Train a dedicated LightGBM model on the high-sales ("outlier") rows.
# NOTE: this rebinds the notebook-level X and y to the outlier subset.
X = outliers_data.drop('Weekly_Sales', axis=1)
y = outliers_data['Weekly_Sales']
# `fit()` no longer accepts `verbose` (removed in LightGBM 4.0); logging is
# silenced through the constructor's `verbose=-1` instead.
model_outliers = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=140,
    max_depth=15,
    learning_rate=0.5,
    n_estimators=2000,
    reg_alpha=0.6,
    verbose=-1,
)
model_outliers.fit(X, y)

In [None]:
# In-sample metrics for the outlier model.
y_pred = model_outliers.predict(X)

r_2 = model_outliers.score(X, y)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)
print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

## Normal data

In [None]:
# Train a dedicated LightGBM model on the remaining ("normal") rows.
# NOTE: this rebinds the notebook-level X and y to the normal subset.
X = normal_data.drop('Weekly_Sales', axis=1)
y = normal_data['Weekly_Sales']
# `fit()` no longer accepts `verbose` (removed in LightGBM 4.0); logging is
# silenced through the constructor's `verbose=-1` instead.
model_normal = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=140,
    max_depth=15,
    learning_rate=0.5,
    n_estimators=2000,
    reg_alpha=0.6,
    verbose=-1,
)
model_normal.fit(X, y)

In [None]:
# In-sample metrics for the normal model.
y_pred = model_normal.predict(X)

r_2 = model_normal.score(X, y)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is the same RMSE on every version.
rmse = np.sqrt(mean_squared_error(y, y_pred))
mae = mean_absolute_error(y, y_pred)

print("R squared:", r_2)
print("RMSE:", rmse)
print("MAE:", mae)

## KNN to split test data

In [None]:
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

# Route each test row to the "normal" or the "outlier" model by looking at
# its nearest training rows in feature space.
# NOTE: this rebinds `model`, which previously held the Keras network.
model = NearestNeighbors(algorithm="brute", n_neighbors=5)
model.fit(train_data.drop('Weekly_Sales', axis=1))

X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Row positions (into train_data) of the 3 nearest training rows per test row.
indices = model.kneighbors(X_test, 3, return_distance=False)

# A test row is labelled 1 (outlier) when more than one of its 3 neighbours
# has Weekly_Sales >= 40000, otherwise 0. The labels are computed in one
# vectorised step and assigned as whole columns; the previous per-row
# `df['label'].iloc[i] = 1` was a chained assignment, which pandas does not
# guarantee to write through (and which is a no-op under copy-on-write).
train_sales = train_data['Weekly_Sales'].to_numpy()
neighbour_is_outlier = train_sales[indices] >= 40000
labels = (neighbour_is_outlier.sum(axis=1) > 1).astype(int)

y_test = pd.DataFrame(y_test)
X_test['label'] = labels
y_test['label'] = labels

X_test_normal = X_test[X_test['label'] == 0]
X_test_outlier = X_test[X_test['label'] == 1]
y_test_normal = y_test[y_test['label'] == 0]
y_test_outlier = y_test[y_test['label'] == 1]

y_test_normal = y_test_normal.drop('label', axis=1)
y_test_outlier = y_test_outlier.drop('label', axis=1)

## Predict on the two test datasets using the two models

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the "normal" model on the test rows that were routed to it.
normal_features = X_test_normal.drop('label', axis=1)
y_test_normal_pred = model_normal.predict(normal_features)

r2_test = r2_score(y_test_normal, y_test_normal_pred)
rmse_test = sqrt(mean_squared_error(y_test_normal, y_test_normal_pred))
mae_test = mean_absolute_error(y_test_normal, y_test_normal_pred)

for metric_label, metric_value in (
    ("R squared:", r2_test),
    ("RMSE:", rmse_test),
    ("MAE:", mae_test),
):
    print(metric_label, metric_value)

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the "outlier" model on the test rows that were routed to it.
outlier_features = X_test_outlier.drop('label', axis=1)
y_test_outlier_pred = model_outliers.predict(outlier_features)

r2_test = r2_score(y_test_outlier, y_test_outlier_pred)
rmse_test = sqrt(mean_squared_error(y_test_outlier, y_test_outlier_pred))
mae_test = mean_absolute_error(y_test_outlier, y_test_outlier_pred)

for metric_label, metric_value in (
    ("R squared:", r2_test),
    ("RMSE:", rmse_test),
    ("MAE:", mae_test),
):
    print(metric_label, metric_value)

## Overall performance on the whole test dataset

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Stitch the two partial results back together (normal rows first, then
# outlier rows, in both lists) and score the combined predictions.
overall_y_test_pred = [
    *y_test_normal_pred.tolist(),
    *y_test_outlier_pred.tolist(),
]
overall_y_test = [
    *y_test_normal['Weekly_Sales'].tolist(),
    *y_test_outlier['Weekly_Sales'].tolist(),
]

r2_test = r2_score(overall_y_test, overall_y_test_pred)
rmse_test = sqrt(mean_squared_error(overall_y_test, overall_y_test_pred))
mae_test = mean_absolute_error(overall_y_test, overall_y_test_pred)

for metric_label, metric_value in (
    ("R squared:", r2_test),
    ("RMSE:", rmse_test),
    ("MAE:", mae_test),
):
    print(metric_label, metric_value)