In [400]:
import pandas as pd
import numpy as np
import scipy
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

In [401]:
df = pd.read_csv("./training.csv")

In [402]:
#Add weeks
df["dayOfWeek"] = df["dayOfTheYear"]%7 + 1

In [403]:
#Remove outlier days: 184, 185
df = df[df["dayOfTheYear"] != 184]
df = df[df["dayOfTheYear"] != 185]

In [404]:
# Drop columns bc it is the same for all of our data 
x_data = df.drop(columns=['Food Service', 'State', 'Loyalty Site', 'ExtraMile Site', 'Cash/Credit Site', 'CoBrand', 'City', 'EBT Site', 'Alcohol', 'Carwash'])

df = df[df['GrossSoldQuantity'] != None]
# print(df.shape)
# print(df['GrossSoldQuantity'].shape)

In [405]:
x_data[x_data['dayOfTheYear'] == 125].head(16)

Unnamed: 0,StoreNumber,dayOfTheYear,3HourBucket,GrossSoldQuantity,dayOfWeek
483,1000,125,1,7,7
484,1000,125,2,9,7
485,1000,125,3,6,7
486,1000,125,4,7,7
1900,2000,125,1,8,7
1901,2000,125,2,9,7
1902,2000,125,3,9,7
1903,2000,125,4,7,7
3313,3000,125,1,8,7
3314,3000,125,2,10,7


In [406]:
y_data = df['GrossSoldQuantity'].to_frame()

In [407]:
x_data = x_data.drop(columns=['GrossSoldQuantity'])

In [408]:
def determine_season(x):
    if(79<= x and x <= 171):
        #spring
        return 1;
    elif(172 <= x and x <= 265):
        return 2;
    elif(265 <= x and x <= 358):
        return 3;
    else:
        return 4;

#Add seasons
x_data["season"] = x_data["dayOfTheYear"].apply(lambda x: determine_season(x))

In [409]:
x_data["StoreNumber"] = x_data["StoreNumber"].astype("category")
x_data["3HourBucket"] = x_data["3HourBucket"].astype("category")
x_data['random'] = np.random.rand(x_data.shape[0], 1)

In [410]:
y_data.shape

(5628, 1)

In [411]:
from sklearn.model_selection import train_test_split
x_train, x_validation, y_train, y_validation = train_test_split(x_data, y_data, 
                                                    test_size=0.2, random_state=101)

In [412]:
from sklearn.ensemble import RandomForestRegressor

In [413]:
x_train.dtypes

StoreNumber     category
dayOfTheYear       int64
3HourBucket     category
dayOfWeek          int64
season             int64
random           float64
dtype: object

In [414]:
rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=10, n_jobs=None,
                           bootstrap=True, oob_score=True, random_state=101)
model = rf.fit(x_train, y_train) 
print('R^2 Training Score: {:.2f}'.format(rf.score(x_train, y_train)))
print('OOB Score: {:.2f}'.format(rf.oob_score_))
print('Validation Score: {:.2f}'.format(rf.score(x_validation, y_validation)))

  This is separate from the ipykernel package so we can avoid doing imports until


R^2 Training Score: 0.86
OOB Score: 0.80
Validation Score: 0.79


In [415]:
# Official Predict 
y_pred = model.predict(x_validation)




In [416]:
def rootMSE(actual, pred):
    return np.sqrt(((pred - actual) ** 2).mean())

In [417]:
y_validation.head()

Unnamed: 0,GrossSoldQuantity
70,7
5246,19
5051,38
4655,17
391,7


In [418]:
rounded_pred = np.rint(y_pred).reshape(-1, 1)

In [419]:
print(rootMSE(y_validation, rounded_pred))

GrossSoldQuantity    9.900437
dtype: float64


In [420]:
print(rootMSE(y_validation, rounded_pred))

GrossSoldQuantity    9.900437
dtype: float64


In [392]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(x_train, y_train).predict(x_validation)



  y = column_or_1d(y, warn=True)


In [393]:
print(type(y_pred))
print(type(y_validation))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [394]:
print(rootMSE(y_validation.to_numpy(), y_pred))

41.8266114152953


In [286]:
import tensorflow as tf

In [355]:
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance

In [423]:
# pca = PCA(n_components=2)

In [424]:
# pca_model = pca.fit(x_data)

In [425]:
# print(pca_model.singular_values_)

In [421]:
r = permutation_importance(model, x_validation, y_validation, n_repeats=30, random_state=42)

In [422]:


permutation_df = pd.DataFrame(columns=['Feature', 'Importance Mean', 'Importance'])

for i in r.importances_mean.argsort()[::-1]:
    #Checking if it is within two standard deviations of the mean
    if (r.importances_mean[i] - 2 * r.importances_std[i]) > 0:
        importance_val = str(r.importances_mean[i]) + " +/- " + str(r.importances_std[i])
        permutation_df = permutation_df.append({'Feature': x_train.columns[i], 'Importance Mean': r.importances_mean[i],
                                                'Importance': importance_val}, ignore_index=True)

#Sorts the features in permutation_df from largest to smallest importance
permutation_df.sort_values(by='Importance Mean', ascending=False)

Unnamed: 0,Feature,Importance Mean,Importance
0,dayOfWeek,0.700363,0.7003628586759241 +/- 0.0313354731247596
1,StoreNumber,0.674502,0.6745016112909391 +/- 0.03329376371024005
2,3HourBucket,0.263259,0.26325937354220524 +/- 0.0203095298503114
3,dayOfTheYear,0.017331,0.01733132537176749 +/- 0.002206075216880444
4,season,0.00872,0.008719934560932074 +/- 0.002868334634141871
