In [1]:
import pandas as pd

In [2]:
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

## Read the CSV and Perform Basic Data Cleaning

AVG TPS = average ticket price sold 

In [3]:
# import historic venue data
df = pd.read_csv("working_model_AMP_2.csv")
df.head()

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male
0,5/14/19,BALLROOM,504,7560.0,15,504,4785.22,Tuesday,5,0.173646,30.078358,0.758037
1,12/2/18,BALLROOM,502,11044.0,22,502,4493.17,Sunday,12,0.299224,30.724574,0.693456
2,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351
3,6/15/19,BALLROOM,501,10020.0,20,501,4535.97,Saturday,6,0.141881,30.819497,0.462539
4,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208


In [4]:
# convert months to seasons
df['Season']= np.where((df['Month']<3) | (df['Month']>11),'Winter',\
     np.where((df['Month']<9) & (df['Month']>5),'Summer',\
     np.where((df['Month']>2) & (df['Month']<6),'Spring','Fall')))
df.head()

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male,Season
0,5/14/19,BALLROOM,504,7560.0,15,504,4785.22,Tuesday,5,0.173646,30.078358,0.758037,Spring
1,12/2/18,BALLROOM,502,11044.0,22,502,4493.17,Sunday,12,0.299224,30.724574,0.693456,Winter
2,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351,Spring
3,6/15/19,BALLROOM,501,10020.0,20,501,4535.97,Saturday,6,0.141881,30.819497,0.462539,Summer
4,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208,Spring


In [5]:
# delete redundant or unneeded columns for this model

df =df.drop(["FINAL","DATE","Month","AVG TPS","BAR RINGS","TOTAL ADV","average_age"],axis=1)

In [6]:

columns = df.columns.values
columns

array(['ROOM', 'TOTAL COUNT', 'Day of Week', 'streams_transformed',
       'percent_male', 'Season'], dtype=object)

In [7]:
# one hot encoding

df_d = pd.get_dummies(df)
df_d.head()

Unnamed: 0,TOTAL COUNT,streams_transformed,percent_male,ROOM_BALLROOM,ROOM_TAVERN,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,504,0.173646,0.758037,1,0,0,0,0,0,0,1,0,0,1,0,0
1,502,0.299224,0.693456,1,0,0,0,0,1,0,0,0,0,0,0,1
2,501,0.269953,0.741351,1,0,0,0,0,0,0,1,0,0,1,0,0
3,501,0.141881,0.462539,1,0,0,0,1,0,0,0,0,0,0,1,0
4,500,1.0,0.755208,1,0,0,0,0,0,1,0,0,0,1,0,0


## Initial model for ticket count

In [8]:
# df_tix_count = df_d.drop(["INDEX","BAR RINGS","FINAL","AVG TPS"],axis=1)

In [9]:
# determine the predictor
X = df_d.drop("TOTAL COUNT", axis=1)
y = df_d["TOTAL COUNT"]
print(X.shape, y.shape)

(190, 15) (190,)


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import chi2
model = LinearRegression()

In [11]:
X.columns.values

array(['streams_transformed', 'percent_male', 'ROOM_BALLROOM',
       'ROOM_TAVERN', 'Day of Week_Friday', 'Day of Week_Monday',
       'Day of Week_Saturday', 'Day of Week_Sunday',
       'Day of Week_Thursday', 'Day of Week_Tuesday',
       'Day of Week_Wednesday', 'Season_Fall', 'Season_Spring',
       'Season_Summer', 'Season_Winter'], dtype=object)

## MODELS

### random state =42, test size = .2

In [12]:
random_state= 42
test_size= 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_A = LinearRegression()


# Fitting our model with all of our features in X
model_A.fit(X_train, y_train)

score_train = model_A.score(X_train, y_train)
score_test = model_A.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_A = pd.DataFrame(list(zip(model_A.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_A.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_A = coef_feat_df_A.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_A.intercept_)
coef_feat_df_A.head(20)


random state = 42, test size = 0.2
training Score: 0.6220747431879621
testing Score: 0.5996058714357075
y-axis intercept:  73.82425195391595


Unnamed: 0,coef,p-values train,p-values test,feature
7,-2.937157,0.381988,0.249028,Day of Week_Sunday
5,15.085311,0.457326,0.606442,Day of Week_Monday
9,7.64874,0.674163,0.859671,Day of Week_Tuesday
4,9.853549,0.750903,0.837087,Day of Week_Friday
14,23.287172,0.804399,0.60899,Season_Winter
6,4.296459,0.869889,0.738526,Day of Week_Saturday
13,1.356224,0.943297,0.723028,Season_Summer
2,75.400563,0.963325,0.889244,ROOM_BALLROOM
10,-1.405581,0.975588,0.60899,Day of Week_Wednesday
11,-24.608841,0.987295,0.8193,Season_Fall


In [13]:
predictions = model_A.predict(X_test)

pred_df = pd.DataFrame({"Predicted Ticket COUNT": predictions , "Actual Ticket COUNT": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Ticket COUNT,Actual Ticket COUNT
175,47.63786,15
180,7.713205,10
111,41.195456,53
65,220.692521,139
101,36.598573,66
15,211.764078,407
9,317.856507,500
16,409.656883,383
141,64.945354,26
124,61.722794,41


### random state =7, test size = .2

In [14]:
random_state= 7
test_size= 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_B = LinearRegression()

# Fitting our model with all of our features in X
model_B.fit(X_train, y_train)

score_train = model_B.score(X_train, y_train)
score_test = model_B.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_B = pd.DataFrame(list(zip(model_B.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_B.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_B = coef_feat_df_B.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_B.intercept_)
coef_feat_df_B.head(20)


random state = 7, test size = 0.2
training Score: 0.6384049613398243
testing Score: 0.5048329095313645
y-axis intercept:  84.16031117925768


Unnamed: 0,coef,p-values train,p-values test,feature
5,5.541183,0.346496,0.883251,Day of Week_Monday
7,10.555864,0.594059,0.75941,Day of Week_Sunday
14,2.942172,0.718434,0.661681,Season_Winter
6,-1.982478,0.747916,0.516233,Day of Week_Saturday
9,-5.17508,0.796741,0.782108,Day of Week_Tuesday
4,18.337362,0.833335,0.421521,Day of Week_Friday
13,5.522275,0.854966,0.865027,Season_Summer
2,77.086788,0.901118,0.961152,ROOM_BALLROOM
10,-9.744864,0.938858,0.752315,Day of Week_Wednesday
11,-13.496832,0.986123,0.835225,Season_Fall


In [15]:
predictions = model_B.predict(X_test)

pred_df = pd.DataFrame({"Predicted Ticket COUNT": predictions , "Actual Ticket COUNT": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Ticket COUNT,Actual Ticket COUNT
22,231.846351,338
177,38.303344,11
46,205.790734,192
63,203.932719,112
24,212.069255,315
185,41.271957,29
40,174.254206,191
188,67.145442,67
78,199.744822,125
96,187.87172,57


### random state =0, test size = .2

In [16]:
random_state= 0
test_size= 0.2
feature_list = []
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_C = LinearRegression()

# Fitting our model with all of our features in X
model_C.fit(X_train, y_train)

score_train = model_C.score(X_train, y_train)
score_test = model_C.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_C = pd.DataFrame(list(zip(model_C.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_C.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_C = coef_feat_df_C.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_C.intercept_)
coef_feat_df_C.head(20)


random state = 0, test size = 0.2
training Score: 0.6089493901526369
testing Score: 0.6605159056727758
y-axis intercept:  101.22682585819248


Unnamed: 0,coef,p-values train,p-values test,feature
5,3.951555,0.686361,0.606442,Day of Week_Monday
7,-7.90735,0.694467,0.327542,Day of Week_Sunday
14,12.930686,0.727008,0.517011,Season_Winter
4,26.388726,0.747289,0.655159,Day of Week_Friday
6,-1.845779,0.825849,0.627614,Day of Week_Saturday
9,3.285074,0.869262,0.56809,Day of Week_Tuesday
13,1.889618,0.955517,0.778159,Season_Summer
8,-34.801928,0.963613,0.92147,Day of Week_Thursday
10,10.929702,0.966836,0.880905,Day of Week_Wednesday
2,86.178458,0.979058,0.844416,ROOM_BALLROOM


In [17]:
predictions = model_C.predict(X_test)

pred_df = pd.DataFrame({"Predicted Ticket COUNT": predictions , "Actual Ticket COUNT": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Ticket COUNT,Actual Ticket COUNT
108,32.623388,78
74,240.498224,105
161,69.115181,39
95,83.047268,64
123,34.676447,52
71,52.400189,97
18,315.345172,366
124,78.846683,41
143,212.384656,68
7,372.65169,500


### random state =0, test size = .3

In [18]:
random_state= 0
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_D = LinearRegression()

# Fitting our model with all of our features in X
model_D.fit(X_train, y_train)

score_train = model_D.score(X_train, y_train)
score_test = model_D.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_D = pd.DataFrame(list(zip(model_D.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_D.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_D = coef_feat_df_D.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_D.intercept_)
coef_feat_df_D.head(20)




random state = 0, test size = 0.3
training Score: 0.5886693640252461
testing Score: 0.6922211119159175
y-axis intercept:  96.98258337497141


Unnamed: 0,coef,p-values train,p-values test,feature
4,25.199778,0.677504,0.812197,Day of Week_Friday
5,3.43305,0.68187,0.646358,Day of Week_Monday
14,15.069507,0.741897,0.645621,Season_Winter
7,-6.742189,0.749124,0.19525,Day of Week_Sunday
6,-0.193316,0.801772,0.495707,Day of Week_Saturday
9,4.269496,0.824258,0.775225,Day of Week_Tuesday
10,12.839621,0.957579,0.896392,Day of Week_Wednesday
13,3.492823,0.961768,0.865059,Season_Summer
8,-38.806441,0.971319,0.970743,Day of Week_Thursday
2,86.079116,0.988277,0.840713,ROOM_BALLROOM


In [19]:
predictions = model_D.predict(X_test)

pred_df = pd.DataFrame({"Predicted Ticket COUNT": predictions , "Actual Ticket COUNT": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Ticket COUNT,Actual Ticket COUNT
108,34.510215,78
74,239.801009,105
161,68.31875,39
95,80.197414,64
123,36.046936,52
71,49.47515,97
18,309.988444,366
124,76.72979,41
143,211.012405,68
7,368.877489,500


### random state =42, test size = .3

In [20]:
random_state= 42
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)

feature_list = list(X_train.columns.values)
model_E = LinearRegression()

# Fitting our model with all of our features in X
# Fitting our model with all of our features in X
model_E.fit(X_train, y_train)

score_train = model_E.score(X_train, y_train)
score_test = model_E.score(X_test, y_test)






scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_E = pd.DataFrame(list(zip(model_E.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_E.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_E = coef_feat_df_E.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_E.intercept_)
coef_feat_df_E.head(20)



random state = 42, test size = 0.3
training Score: 0.6172581398417268
testing Score: 0.6078710380439473
y-axis intercept:  88.1183866366348


Unnamed: 0,coef,p-values train,p-values test,feature
7,3.476942,0.470294,0.697207,Day of Week_Sunday
5,14.478274,0.485445,0.842814,Day of Week_Monday
4,17.27423,0.523567,0.846556,Day of Week_Friday
14,27.726053,0.711876,0.565789,Season_Winter
9,11.282653,0.723262,0.822983,Day of Week_Tuesday
6,1.618744,0.932773,0.667803,Day of Week_Saturday
13,-1.866478,0.94241,0.828884,Season_Summer
8,-38.278245,0.946976,0.936039,Day of Week_Thursday
2,79.159788,0.961559,0.94884,ROOM_BALLROOM
10,-9.852599,0.9853,0.545785,Day of Week_Wednesday


In [21]:
predictions = model_E.predict(X_test)

pred_df = pd.DataFrame({"Predicted Ticket COUNT": predictions , "Actual Ticket COUNT": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Ticket COUNT,Actual Ticket COUNT
175,47.413702,15
180,2.308433,10
111,31.88069,53
65,213.421774,139
101,40.34635,66
15,222.304293,407
9,327.903072,500
16,410.113718,383
141,66.026518,26
124,69.607247,41


### random state =7, test size = .3

In [22]:
random_state= 7
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_F = LinearRegression()

# Fitting our model with all of our features in X
model_F.fit(X_train, y_train)

score_train = model_F.score(X_train, y_train)
score_test = model_F.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_F = pd.DataFrame(list(zip(model_F.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_F.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_F = coef_feat_df_F.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_F.intercept_)
coef_feat_df_F.head(20)



random state = 7, test size = 0.3
training Score: 0.6410682133619923
testing Score: 0.48940760942116696
y-axis intercept:  91.33453801123133


Unnamed: 0,coef,p-values train,p-values test,feature
5,5.676798,0.541863,0.815604,Day of Week_Monday
7,16.171961,0.659235,0.69339,Day of Week_Sunday
13,4.018127,0.758319,0.934995,Season_Summer
6,2.599812,0.813748,0.513358,Day of Week_Saturday
14,0.544591,0.821649,0.715776,Season_Winter
9,-7.038469,0.824321,0.791758,Day of Week_Tuesday
4,12.199594,0.881124,0.67684,Day of Week_Friday
2,84.901266,0.901758,0.98541,ROOM_BALLROOM
10,-8.487379,0.926755,0.633224,Day of Week_Wednesday
11,-10.411882,0.934124,0.93106,Season_Fall


In [23]:
model_F.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'normalize': False}

In [24]:
predictions = model_F.predict(X_test)

pred_df = pd.DataFrame({"Predicted Ticket COUNT": predictions , "Actual Ticket COUNT": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Ticket COUNT,Actual Ticket COUNT
22,253.043748,338
177,47.456545,11
46,214.747127,192
63,215.244169,112
24,225.799278,315
185,39.322178,29
40,188.659259,191
188,71.229952,67
78,217.724335,125
96,212.398003,57


## Pickling the prefered model


In [25]:
import pickle
# Dump the trained linear regression model with Pickle
lin_reg_pkl_filename = 'lin_reg_tix_count.pkl'

# Open the file to save as pkl file
lin_reg_model_pkl = open(lin_reg_pkl_filename, 'wb')
pickle.dump(model_E, lin_reg_model_pkl)

# Close the pickle instances
lin_reg_model_pkl.close()

In [26]:
# Loading the saved model pickle
lin_reg_model_pkl = open('lin_reg_tix_count.pkl', 'rb')
lin_reg_model = pickle.load(lin_reg_model_pkl)
print("Loaded Linear Regression model :: ", lin_reg_model)

Loaded Linear Regression model ::  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
