In [1]:
import pandas as pd

In [2]:
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

## Read the CSV and Perform Basic Data Cleaning

AVG TPS = average ticket price sold 

In [3]:
# import historic venue data
df = pd.read_csv("working_model_AMP_2.csv")
df.head()

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male
0,5/14/19,BALLROOM,504,7560.0,15,504,4785.22,Tuesday,5,0.173646,30.078358,0.758037
1,12/2/18,BALLROOM,502,11044.0,22,502,4493.17,Sunday,12,0.299224,30.724574,0.693456
2,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351
3,6/15/19,BALLROOM,501,10020.0,20,501,4535.97,Saturday,6,0.141881,30.819497,0.462539
4,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208


In [4]:
# convert months to seasons
df['Season']= np.where((df['Month']<3) | (df['Month']>11),'Winter',\
     np.where((df['Month']<9) & (df['Month']>5),'Summer',\
     np.where((df['Month']>2) & (df['Month']<6),'Spring','Fall')))
df.head()

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male,Season
0,5/14/19,BALLROOM,504,7560.0,15,504,4785.22,Tuesday,5,0.173646,30.078358,0.758037,Spring
1,12/2/18,BALLROOM,502,11044.0,22,502,4493.17,Sunday,12,0.299224,30.724574,0.693456,Winter
2,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351,Spring
3,6/15/19,BALLROOM,501,10020.0,20,501,4535.97,Saturday,6,0.141881,30.819497,0.462539,Summer
4,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208,Spring


In [5]:
# delete redundant or unneeded columns for this model

# it would be worth trying this one with and without room

df =df.drop(["FINAL","DATE","Month","BAR RINGS","AVG TPS"],axis=1)

In [6]:

columns = df.columns.values
columns

array(['ROOM', 'TOTAL ADV', 'TOTAL COUNT', 'Day of Week',
       'streams_transformed', 'average_age', 'percent_male', 'Season'],
      dtype=object)

In [7]:
# one hot encoding
df_d = pd.get_dummies(df)
df_d.head()

Unnamed: 0,TOTAL ADV,TOTAL COUNT,streams_transformed,average_age,percent_male,ROOM_BALLROOM,ROOM_TAVERN,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,504,504,0.173646,30.078358,0.758037,1,0,0,0,0,0,0,1,0,0,1,0,0
1,502,502,0.299224,30.724574,0.693456,1,0,0,0,0,1,0,0,0,0,0,0,1
2,501,501,0.269953,35.29802,0.741351,1,0,0,0,0,0,0,1,0,0,1,0,0
3,501,501,0.141881,30.819497,0.462539,1,0,0,0,1,0,0,0,0,0,0,1,0
4,500,500,1.0,29.599578,0.755208,1,0,0,0,0,0,1,0,0,0,1,0,0


## Initial model for  advance ticket count

In [8]:
# df_tix_count = df_d.drop(["INDEX","BAR RINGS","FINAL","AVG TPS"],axis=1)

In [9]:
# determine the predictor
X = df_d.drop("TOTAL ADV", axis=1)
y = df_d["TOTAL ADV"]
print(X.shape, y.shape)

(190, 17) (190,)


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import chi2
model = LinearRegression()

In [11]:
X.columns.values

array(['TOTAL COUNT', 'streams_transformed', 'average_age',
       'percent_male', 'ROOM_BALLROOM', 'ROOM_TAVERN',
       'Day of Week_Friday', 'Day of Week_Monday', 'Day of Week_Saturday',
       'Day of Week_Sunday', 'Day of Week_Thursday',
       'Day of Week_Tuesday', 'Day of Week_Wednesday', 'Season_Fall',
       'Season_Spring', 'Season_Summer', 'Season_Winter'], dtype=object)

## MODELS

### random state =42, test size = .2

In [12]:
random_state= 42
test_size= 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_A = LinearRegression()


# Fitting our model with all of our features in X
model_A.fit(X_train, y_train)

score_train = model_A.score(X_train, y_train)
score_test = model_A.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_A = pd.DataFrame(list(zip(model_A.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_A.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_A = coef_feat_df_A.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_A.intercept_)
coef_feat_df_A.head(20)


random state = 42, test size = 0.2
training Score: 0.9797034093187783
testing Score: 0.9839066346275186
y-axis intercept:  -52.553895267109496


Unnamed: 0,coef,p-values train,p-values test,feature
0,0.969002,0.0,0.0,TOTAL COUNT
10,0.716376,0.64919,0.789512,Day of Week_Thursday
11,8.152193,0.731032,0.759166,Day of Week_Tuesday
16,-5.139076,0.777605,0.41802,Season_Winter
6,-0.936899,0.840716,0.899175,Day of Week_Friday
12,-2.078572,0.844339,0.60899,Day of Week_Wednesday
9,3.952343,0.863906,0.249028,Day of Week_Sunday
15,-2.533212,0.868818,0.723028,Season_Summer
4,-6.349216,0.868993,0.844416,ROOM_BALLROOM
7,-3.745122,0.86978,0.606442,Day of Week_Monday


In [13]:
predictions = model_A.predict(X_test)

pred_df = pd.DataFrame({"Predicted Adv Ticket Count": predictions , "Actual Adv Ticket Count": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Adv Ticket Count,Actual Adv Ticket Count
175,-13.089003,3
180,-5.423877,2
111,34.459688,37
65,93.656435,92
101,53.548941,45
15,358.143072,375
9,468.341701,500
16,379.386114,374
141,14.463614,15
124,19.819793,28


### random state =7, test size = .2

In [14]:
random_state= 7
test_size= 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_B = LinearRegression()

# Fitting our model with all of our features in X
model_B.fit(X_train, y_train)

score_train = model_B.score(X_train, y_train)
score_test = model_B.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_B = pd.DataFrame(list(zip(model_B.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_B.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_B = coef_feat_df_B.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_B.intercept_)
coef_feat_df_B.head(20)


random state = 7, test size = 0.2
training Score: 0.9811441892893942
testing Score: 0.9761949260884176
y-axis intercept:  -48.09282903619405


Unnamed: 0,coef,p-values train,p-values test,feature
0,0.994841,0.0,0.0,TOTAL COUNT
9,2.026506,0.651337,0.718845,Day of Week_Sunday
16,-3.016458,0.734058,0.61544,Season_Winter
4,-7.508696,0.812049,0.923601,ROOM_BALLROOM
11,6.473494,0.859851,0.743429,Day of Week_Tuesday
7,-2.45332,0.878046,0.701957,Day of Week_Monday
6,-2.996103,0.879937,0.37505,Day of Week_Friday
10,1.338754,0.932073,0.516481,Day of Week_Thursday
12,-0.193506,0.946588,0.801953,Day of Week_Wednesday
15,-3.180097,0.957173,0.887005,Season_Summer


In [15]:
predictions = model_B.predict(X_test)
# pd.options.display.float_format = '{:,.0f}'.format
pred_df = pd.DataFrame({"Predicted Adv Ticket Count": predictions , "Actual Adv Ticket Count": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Adv Ticket Count,Actual Adv Ticket Count
22,312.299734,299
177,-5.28451,3
46,149.679716,140
63,82.79136,97
24,284.487299,270
185,0.651394,1
40,150.241778,166
188,42.620074,0
78,80.682087,74
96,24.01259,50


### random state =0, test size = .2

In [16]:
random_state= 0
test_size= 0.2
feature_list = []
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_C = LinearRegression()

# Fitting our model with all of our features in X
model_C.fit(X_train, y_train)

score_train = model_C.score(X_train, y_train)
score_test = model_C.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_C = pd.DataFrame(list(zip(model_C.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_C.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_C = coef_feat_df_C.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_C.intercept_)
coef_feat_df_C.head(20)


random state = 0, test size = 0.2
training Score: 0.9782578028247635
testing Score: 0.9901255447331987
y-axis intercept:  -39.916938860713614


Unnamed: 0,coef,p-values train,p-values test,feature
0,0.978822,0.0,0.0,TOTAL COUNT
11,8.395874,0.68466,0.56809,Day of Week_Tuesday
9,4.147762,0.737304,0.327542,Day of Week_Sunday
6,-2.867415,0.808873,0.782244,Day of Week_Friday
16,-7.945306,0.818214,0.782244,Season_Winter
7,-2.859172,0.846603,0.606442,Day of Week_Monday
4,-6.415468,0.859522,0.889244,ROOM_BALLROOM
10,-0.47737,0.866798,0.821467,Day of Week_Thursday
12,-1.882848,0.929495,0.685744,Day of Week_Wednesday
15,-1.150826,0.980982,0.466745,Season_Summer


In [17]:
predictions = model_C.predict(X_test)

pred_df = pd.DataFrame({"Predicted Adv Ticket Count": predictions , "Actual Adv Ticket Count": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Adv Ticket Count,Actual Adv Ticket Count
108,55.368947,41
74,64.693993,81
161,7.825508,7
95,35.64113,53
123,25.99403,29
71,78.626399,86
18,346.93874,347
124,21.99647,28
143,32.779344,14
7,463.931617,500


### random state =0, test size = .3

In [18]:
random_state= 0
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_D = LinearRegression()

# Fitting our model with all of our features in X
model_D.fit(X_train, y_train)

score_train = model_D.score(X_train, y_train)
score_test = model_D.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_D = pd.DataFrame(list(zip(model_D.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_D.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_D = coef_feat_df_D.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_D.intercept_)
coef_feat_df_D.head(20)




random state = 0, test size = 0.3
training Score: 0.9768446221802812
testing Score: 0.9890026733373886
y-axis intercept:  -36.46319950672819


Unnamed: 0,coef,p-values train,p-values test,feature
0,0.978615,0.0,0.0,TOTAL COUNT
7,-1.418725,0.701574,0.902952,Day of Week_Monday
6,-2.400249,0.702036,0.955527,Day of Week_Friday
11,8.5147,0.778591,0.704896,Day of Week_Tuesday
9,3.974759,0.78926,0.143574,Day of Week_Sunday
16,-8.634241,0.870853,0.873396,Season_Winter
12,-1.706751,0.87874,0.743222,Day of Week_Wednesday
15,-1.910254,0.923786,0.749981,Season_Summer
4,-6.146928,0.945583,0.781673,ROOM_BALLROOM
10,-0.172916,0.953625,0.73384,Day of Week_Thursday


In [19]:
predictions = model_D.predict(X_test)

pred_df = pd.DataFrame({"Predicted Adv Ticket Count": predictions , "Actual Adv Ticket Count": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Adv Ticket Count,Actual Adv Ticket Count
108,52.95904,41
74,64.688642,81
161,8.669691,7
95,34.952264,53
123,26.341333,29
71,76.84607,86
18,346.788398,347
124,22.643742,28
143,30.827736,14
7,462.167578,500


### random state =42, test size = .3

In [20]:
random_state= 42
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_E = LinearRegression()

# Fitting our model with all of our features in X
# Fitting our model with all of our features in X
model_E.fit(X_train, y_train)

score_train = model_E.score(X_train, y_train)
score_test = model_E.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_E = pd.DataFrame(list(zip(model_E.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_E.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_E = coef_feat_df_E.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_E.intercept_)
coef_feat_df_E.head(20)



random state = 42, test size = 0.3
training Score: 0.9815740082507486
testing Score: 0.9754103599480847
y-axis intercept:  -53.01298874872444


Unnamed: 0,coef,p-values train,p-values test,feature
0,0.975935,0.0,0.0,TOTAL COUNT
10,0.68636,0.610964,0.891552,Day of Week_Thursday
11,9.605897,0.708676,0.673104,Day of Week_Tuesday
7,-1.455051,0.773766,0.662829,Day of Week_Monday
16,-6.71684,0.788001,0.780703,Season_Winter
4,-8.332319,0.825159,0.93597,ROOM_BALLROOM
6,-4.577845,0.868593,0.950173,Day of Week_Friday
12,-1.779067,0.878148,0.749198,Day of Week_Wednesday
15,-0.209921,0.883786,0.880144,Season_Summer
9,3.289923,0.91898,0.418429,Day of Week_Sunday


In [21]:
predictions = model_E.predict(X_test)

pred_df = pd.DataFrame({"Predicted Adv Ticket Count": predictions , "Actual Adv Ticket Count": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Adv Ticket Count,Actual Adv Ticket Count
175,-9.019631,3
180,-5.531733,2
111,35.776768,37
65,95.375051,92
101,57.069519,45
15,357.086892,375
9,465.188534,500
16,382.200651,374
141,15.790067,15
124,18.0197,28


### random state =7, test size = .3

In [22]:
random_state= 7
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_F = LinearRegression()

# Fitting our model with all of our features in X
model_F.fit(X_train, y_train)

score_train = model_F.score(X_train, y_train)
score_test = model_F.score(X_test, y_test)



scores, pvalues_train = chi2(X_train, y_train)
pvalues_train
scores, pvalues_test = chi2(X_test, y_test)
pvalues_test

coef_feat_df_F = pd.DataFrame(list(zip(model_F.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_F.columns = ['coef','p-values train','p-values test','feature']
coef_feat_df_F = coef_feat_df_F.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_F.intercept_)
coef_feat_df_F.head(20)



random state = 7, test size = 0.3
training Score: 0.9810508410960369
testing Score: 0.9769447632520529
y-axis intercept:  -43.86879789673584


Unnamed: 0,coef,p-values train,p-values test,feature
0,0.994081,0.0,0.0,TOTAL COUNT
16,-3.273496,0.621919,0.69248,Season_Winter
9,2.186522,0.718848,0.537429,Day of Week_Sunday
8,-3.856248,0.729542,0.684974,Day of Week_Saturday
11,6.687693,0.777594,0.890508,Day of Week_Tuesday
4,-7.618274,0.783515,0.948892,ROOM_BALLROOM
6,-2.246719,0.905953,0.8689,Day of Week_Friday
7,-4.44012,0.90947,0.737102,Day of Week_Monday
12,1.004926,0.929165,0.742649,Day of Week_Wednesday
10,0.663947,0.959914,0.69248,Day of Week_Thursday


In [23]:
predictions = model_F.predict(X_test)

pred_df = pd.DataFrame({"Predicted Adv Ticket Count": predictions , "Actual Adv Ticket Count": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Adv Ticket Count,Actual Adv Ticket Count
22,311.761393,299
177,-5.184756,3
46,148.471715,140
63,79.060237,97
24,284.56892,270
185,0.8248,1
40,151.266596,166
188,41.764286,0
78,80.934529,74
96,24.376815,50


## Pickling the prefered model


In [24]:
import pickle
# Dump the trained linear regression model with Pickle
lin_reg_pkl_filename = 'lin_reg_tix_adv_count.pkl'

# Open the file to save as pkl file
lin_reg_model_pkl = open(lin_reg_pkl_filename, 'wb')
pickle.dump(model_D, lin_reg_model_pkl)

# Close the pickle instances
lin_reg_model_pkl.close()

In [25]:
# Loading the saved model pickle
lin_reg_model_pkl = open(lin_reg_pkl_filename, 'rb')
lin_reg_model = pickle.load(lin_reg_model_pkl)
print("Loaded Linear Regression model :: ", lin_reg_model)

Loaded Linear Regression model ::  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
