In [9]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [10]:
# Load the training data; the path is resolved relative to the current working directory.
training_csv_path = "./training.csv"
df = pd.read_csv(training_csv_path)

In [11]:
# Add a cyclic 1-7 index derived from the day of the year (remainder mod 7, shifted by one).
# NOTE(review): which calendar weekday each value corresponds to depends on the
# weekday of day 1 in the data's year -- not established in this notebook.
df["dayOfWeek"] = df["dayOfTheYear"].mod(7).add(1)

In [12]:
def determine_season(x):
    """Translate a day-of-year number into a season code.

    Returns 1 for days 79-171 (spring), 2 for days 172-265, 3 for days
    265-358, and 4 for any other value. Codes 2, 3 and 4 are presumably
    summer, autumn and winter. The ranges are tested in order and the first
    match wins, so day 265 yields 2.
    """
    # (first day, last day, season code), inclusive on both ends.
    season_ranges = (
        (79, 171, 1),
        (172, 265, 2),
        (265, 358, 3),
    )
    for first_day, last_day, code in season_ranges:
        if first_day <= x <= last_day:
            return code
    return 4

# Attach an astronomical-season code to every row, computed from its day of the year.
df["season"] = df["dayOfTheYear"].apply(determine_season)

In [13]:
# Filter out the rows belonging to the outlier days 184 and 185, one day at a time.
for outlier_day in (184, 185):
    df = df[df["dayOfTheYear"] != outlier_day]

In [14]:
# Two groups of columns are discarded in sequence:
#   1. columns noted as holding the same value in every row (no signal);
#   2. columns noted as mapping exactly to the store number (redundant).
df = (
    df
    .drop(columns=['Food Service', 'State', 'Loyalty Site', 'ExtraMile Site', 'Cash/Credit Site', 'CoBrand'])
    .drop(columns=['EBT Site', 'Alcohol', 'Carwash', 'City'])
)

In [15]:
def rootMSE(actual, pred):
    """Return the root-mean-squared error between `actual` and `pred`.

    Both inputs are converted to float NumPy arrays before subtracting, so the
    comparison is strictly positional. Subtracting two pandas Series directly
    would align them by index label instead, which silently yields NaN (or
    pairs up the wrong rows) when the indexes differ, e.g. a shuffled
    validation split against freshly created predictions.

    Parameters
    ----------
    actual : array-like
        Observed values (list, ndarray, or Series).
    pred : array-like
        Predicted values, in the same order and length as `actual`.

    Returns
    -------
    numpy.float64
        sqrt(mean((pred - actual) ** 2)). NaN values in either input
        propagate to the result.
    """
    errors = np.asarray(pred, dtype=float) - np.asarray(actual, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

In [16]:
#Model Training

# Features are every remaining column except the target.
x = df.drop(columns=['GrossSoldQuantity'])
y = df["GrossSoldQuantity"]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.2, random_state=101)

rf = RandomForestRegressor(n_estimators=100, max_features='sqrt', max_depth=10, n_jobs=None,
                           bootstrap=True, oob_score=True, random_state=101)
# fit() returns the fitted estimator, so `model` and `rf` refer to the same object.
model = rf.fit(x_train, y_train)
print('R^2 Training Score: {:.2f}'.format(rf.score(x_train, y_train)))
print('OOB Score: {:.2f}'.format(rf.oob_score_))
print('Validation Score: {:.2f}'.format(rf.score(x_validation, y_validation)))

# Predictions are rounded to the nearest integer before the RMSE is computed.
y_pred = model.predict(x_validation)
rounded_pred = np.rint(y_pred)

print(rootMSE(y_validation, rounded_pred))

R^2 Training Score: 0.86
OOB Score: 0.81
Validation Score: 0.80
9.856744767251993


In [17]:
#test results
# The scoring file location can be overridden with the SCORING_CSV environment
# variable, so the notebook also runs on other machines. The original absolute
# path is kept as the fallback, so behaviour is unchanged when it is unset.
scoring_csv_path = os.environ.get(
    "SCORING_CSV",
    "/Users/rhuck/Documents/Datathon2021/Chevron_2021_Datathon_Challenge/filesFor30MinBeforeJudging/scoring.csv",
)
test = pd.read_csv(scoring_csv_path)

# The preprocessing below repeats the steps applied to the training frame
# (outlier-day removal is not applied here).

#remove unhelpful data - all data has same value
test = test.drop(columns=['Food Service', 'State', 'Loyalty Site', 'ExtraMile Site', 'Cash/Credit Site', 'CoBrand'])

#remove data - map exactly to store number
test = test.drop(columns=['EBT Site', 'Alcohol', 'Carwash', 'City'])

#Add weeks (cyclic 1-7 index derived from the day of the year)
test["dayOfWeek"] = test["dayOfTheYear"]%7 + 1

#Add Astronomical Seasons
test["season"] = test["dayOfTheYear"].apply(determine_season)

x_test = test.drop(columns=["GrossSoldQuantity"])
y_test = test["GrossSoldQuantity"]

# Predictions are rounded to the nearest integer before scoring.
y_test_pred = model.predict(x_test)
rounded_test_pred = np.rint(y_test_pred)

print(y_test)
print(rounded_test_pred)

print(rootMSE(y_test, rounded_test_pred))

0      0
1      3
2      4
3      1
4      9
5      9
6      3
7      6
8      1
9      2
10     1
11     2
12     1
13     1
14     7
15     3
16     0
17     1
18     1
19     2
20     1
21     2
22     4
23     7
24     6
25    11
26    30
27    24
28    12
29    16
30    18
31    37
Name: GrossSoldQuantity, dtype: int64
[ 2.  5.  4.  6.  5.  7.  6.  8.  3.  4.  4.  5.  5.  6.  6.  7.  3.  5.
  4.  4.  4.  6.  5.  5. 14. 21. 25. 34. 15. 22. 25. 40.]
4.366062299143245
