In [317]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [318]:
df = pd.read_csv("./training.csv")

In [319]:
#Add weeks
df["dayOfWeek"] = df["dayOfTheYear"]%7 + 1

In [320]:
#Remove outlier days: 184, 185
df = df[df["dayOfTheYear"] != 184]
df = df[df["dayOfTheYear"] != 185]

In [321]:
# Drop columns bc it is the same for all of our data 
x_data = df.drop(columns=['Food Service', 'State', 'Loyalty Site', 'ExtraMile Site', 'Cash/Credit Site', 'CoBrand', 'City', 'EBT Site', 'Alcohol', 'Carwash'])

df = df[df['GrossSoldQuantity'] != None]
# print(df.shape)
# print(df['GrossSoldQuantity'].shape)

In [322]:
x_data[x_data['dayOfTheYear'] == 125].head(16)

Unnamed: 0,StoreNumber,dayOfTheYear,3HourBucket,GrossSoldQuantity,dayOfWeek
483,1000,125,1,7,7
484,1000,125,2,9,7
485,1000,125,3,6,7
486,1000,125,4,7,7
1900,2000,125,1,8,7
1901,2000,125,2,9,7
1902,2000,125,3,9,7
1903,2000,125,4,7,7
3313,3000,125,1,8,7
3314,3000,125,2,10,7


In [323]:
y_data = df['GrossSoldQuantity'].to_frame()

In [324]:
x_data = x_data.drop(columns=['GrossSoldQuantity'])

In [325]:
def determine_season(x):
    if(79<= x and x <= 171):
        #spring
        return 1;
    elif(172 <= x and x <= 265):
        return 2;
    elif(265 <= x and x <= 358):
        return 3;
    else:
        return 4;

#Add seasons
x_data["season"] = x_data["dayOfTheYear"].apply(lambda x: determine_season(x))

In [326]:
x_data.dtypes

StoreNumber     int64
dayOfTheYear    int64
3HourBucket     int64
dayOfWeek       int64
season          int64
dtype: object

In [327]:
y_data.shape

(5628, 1)

In [328]:
from sklearn.model_selection import train_test_split
x_train, x_validation, y_train, y_validation = train_test_split(x_data, y_data, 
                                                    test_size=0.2, random_state=101)

In [329]:
from sklearn.ensemble import RandomForestRegressor

In [330]:
x_train.dtypes

StoreNumber     int64
dayOfTheYear    int64
3HourBucket     int64
dayOfWeek       int64
season          int64
dtype: object

In [331]:
rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=10, n_jobs=None,
                           bootstrap=True, oob_score=True, random_state=101)
model = rf.fit(x_train, y_train) 
print('R^2 Training Score: {:.2f}'.format(rf.score(x_train, y_train)))
print('OOB Score: {:.2f}'.format(rf.oob_score_))
print('Validation Score: {:.2f}'.format(rf.score(x_validation, y_validation)))

  This is separate from the ipykernel package so we can avoid doing imports until


R^2 Training Score: 0.86
OOB Score: 0.81
Validation Score: 0.80


In [332]:
# Official Predict 
y_pred = model.predict(x_validation)




In [342]:
def rootMSE(actual, pred):
    return np.sqrt(((pred - actual) ** 2).mean())

In [343]:
y_validation.head()

Unnamed: 0,GrossSoldQuantity
70,7
5246,19
5051,38
4655,17
391,7


In [349]:
rounded_pred = np.rint(y_pred).reshape(-1, 1)

In [347]:
print(rootMSE(y_validation, rounded_pred))

GrossSoldQuantity    9.856745
dtype: float64


In [348]:
print(rootMSE(y_validation, rounded_pred))

GrossSoldQuantity    9.856745
dtype: float64


In [309]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(x_train, y_train).predict(x_validation)



  y = column_or_1d(y, warn=True)


In [284]:
print(type(y_pred))
print(type(y_validation))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [300]:
print(rootMSE(y_validation.to_numpy(), y_pred))

28.616759923397804


In [286]:
import tensorflow as tf

In [355]:
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance

In [356]:
pca = PCA(n_components=2)

In [357]:
pca_model = pca.fit(x_data)

In [358]:
print(pca_model.singular_values_)

[83874.89018475  7863.46262828]


In [353]:
r = permutation_importance(model, x_validation, y_validation, n_repeats=30, random_state=42)

In [354]:


permutation_df = pd.DataFrame(columns=['Feature', 'Importance Mean', 'Importance'])

for i in r.importances_mean.argsort()[::-1]:
    #Checking if it is within two standard deviations of the mean
    if (r.importances_mean[i] - 2 * r.importances_std[i]) > 0:
        importance_val = str(r.importances_mean[i]) + " +/- " + str(r.importances_std[i])
        permutation_df = permutation_df.append({'Feature': x_train.columns[i], 'Importance Mean': r.importances_mean[i],
                                                'Importance': importance_val}, ignore_index=True)

#Sorts the features in permutation_df from largest to smallest importance
permutation_df.sort_values(by='Importance Mean', ascending=False)

Unnamed: 0,Feature,Importance Mean,Importance
0,dayOfWeek,0.73146,0.7314601825108763 +/- 0.032008169029340956
1,StoreNumber,0.702423,0.7024232725761181 +/- 0.03537576692646416
2,3HourBucket,0.280023,0.2800230154950196 +/- 0.020212323456993544
3,dayOfTheYear,0.022468,0.022467955884575765 +/- 0.003600623927210976
