## Zip Code GGR Estimate Modeling

### Imputation of Missing Data with Median

In [3]:
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_excel(r'C:\Users\sheridan.payne\Documents\Python Scripts\GGR_Estimate_Modeling_Template.xlsx')
df['Cost of Living Index'] = df['Cost of Living Index'].fillna(df['Cost of Living Index'].median())

# Splitting Dataset: sample_df = CRC & DCG; (2) test_df = OGG
sample_df = df[df['Business Location'] != 'Oak Grove']
X = sample_df[sample_df.columns.difference(['Business Location', 'Slot Gross Theo'])]
Y = sample_df['Slot Gross Theo']

test_df = df[df['Business Location'] == 'Oak Grove']
X_ogg = test_df[test_df.columns.difference(['Business Location', 'Slot Gross Theo'])]
Y_ogg = test_df['Slot Gross Theo']

#Splitting Train/Test Data using CRC & DCG
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=117)

### Linear Regression

In [4]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train,Y_train)
lr_score = lr.score(X_test, Y_test)
print('Linear Regression score:', lr_score)

# Predicting CRC & DCG
pred = lr.predict(X_test)
performance_test = pd.DataFrame({'True Value': Y_test,
                            'Prediction': pred,
                            'Error' : Y_test-pred,
                            'Error %': abs((pred-Y_test)/Y_test)*100})
performance_test

# Predicting OGG
pred = lr.predict(X_ogg)
performance_ogg = pd.DataFrame({'True Value': Y_ogg,
                            'Prediction': pred,
                            'Error': Y_ogg-pred,
                            'Error %': abs((pred-Y_ogg)/Y_ogg)*100})
performance_ogg

Linear Regression score: 0.9725258033964109


Unnamed: 0,True Value,Prediction,Error,Error %
468,9.030030e+05,4.933997e+05,409603.290523,45.360127
469,1.705553e+06,1.476920e+06,228632.839367,13.405204
470,6.323701e+03,2.447587e+04,-18152.168099,287.049767
471,1.537480e+06,1.250726e+06,286754.816057,18.650956
472,4.755381e+04,9.214763e+04,-44593.819491,93.775494
...,...,...,...,...
665,2.961561e+02,-5.280170e+04,53097.860543,17929.011269
666,2.816461e+04,4.078636e+04,-12621.750964,44.814227
667,2.013335e+03,-1.389571e+04,15909.041084,790.183584
668,3.214907e+02,-8.046884e+03,8368.375035,2602.991326


### KNN Model

In [5]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Finding best # of neighbors for model
parameters = {"n_neighbors": range(1, 50)}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, Y_train)
best_k = gridsearch.best_params_["n_neighbors"]

# Train / Test Model using CRC & DCG
knn = KNeighborsRegressor(n_neighbors=best_k).fit(X_train,Y_train)
print('KNN score:', knn.score(X_test,Y_test))

pred = knn.predict(X_test)
performance_test = pd.DataFrame({'True Value' : Y_test,
                            'Prediction' : pred,
                            'Error' : Y_test-pred,
                            'Error %': abs((pred-Y_test)/Y_test)*100
                           })

# Testing Model using OGG
pred = knn.predict(X_ogg)
performance_ogg = pd.DataFrame({'True Value': Y_ogg, 
                            'Prediction': pred, 
                            'Error': Y_ogg-pred, 
                            'Error %': abs((pred-Y_ogg)/Y_ogg)*100
                           })
performance_ogg

KNN score: 0.9651990454686487


Unnamed: 0,True Value,Prediction,Error,Error %
468,9.030030e+05,5.214846e+04,850854.50035,94.224995
469,1.705553e+06,1.385591e+06,319962.04130,18.760019
470,6.323701e+03,3.567538e+05,-350430.06995,5541.534721
471,1.537480e+06,1.120561e+06,416919.88910,27.117085
472,4.755381e+04,5.864301e+04,-11089.20380,23.319275
...,...,...,...,...
665,2.961561e+02,1.710650e+02,125.09110,42.238232
666,2.816461e+04,6.103719e+03,22060.88755,78.328405
667,2.013335e+03,9.381565e+02,1075.17830,53.402857
668,3.214907e+02,9.279077e+02,-606.41700,188.626607


### Ridge Regression

In [6]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=117)
rr = RidgeCV(alphas=np.arange(0.001,1,0.01), cv=cv, scoring='neg_mean_absolute_error').fit(X_train, Y_train)

print('Ridge Regression score:', rr.score(X_test, Y_test))

# Testing model with CRC & DCG
pred = rr.predict(X_test)
performance_test = pd.DataFrame({'True Value': Y_test,
                            'Prediction': pred,
                            'Error': Y_test-pred,
                            'Error %': abs((pred-Y_test)/Y_test)*100})
performance_test

# Testing model with OGG
pred = rr.predict(X_ogg)
performance_ogg = pd.DataFrame({'True Value': Y_ogg,
                            'Prediction': pred,
                            'Error': Y_ogg-pred,
                            'Error %': abs((pred-Y_ogg)/Y_ogg)*100})
performance_ogg

Ridge Regression score: 0.9726281402017997


Unnamed: 0,True Value,Prediction,Error,Error %
468,9.030030e+05,4.932237e+05,409779.237734,45.379612
469,1.705553e+06,1.480312e+06,225240.956944,13.206331
470,6.323701e+03,2.411304e+04,-17789.340491,281.312183
471,1.537480e+06,1.250123e+06,287357.187532,18.690136
472,4.755381e+04,9.201383e+04,-44460.025119,93.494140
...,...,...,...,...
665,2.961561e+02,-5.302392e+04,53320.079637,18004.045717
666,2.816461e+04,4.065881e+04,-12494.202034,44.361358
667,2.013335e+03,-1.324773e+04,15261.060252,757.999129
668,3.214907e+02,-7.494171e+03,7815.661587,2431.069262
