In [1]:
import pandas as pd

In [2]:
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

## Read the CSV and Perform Basic Data Cleaning

AVG TPS = average ticket price sold (i.e., the average price per ticket sold)

In [3]:
# Load the historic venue data from the working CSV into `df`,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="working_model_AMP_2.csv")
df.head(n=5)

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male
0,5/14/19,BALLROOM,504,7560.0,15,504,4785.22,Tuesday,5,0.173646,30.078358,0.758037
1,12/2/18,BALLROOM,502,11044.0,22,502,4493.17,Sunday,12,0.299224,30.724574,0.693456
2,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351
3,6/15/19,BALLROOM,501,10020.0,20,501,4535.97,Saturday,6,0.141881,30.819497,0.462539
4,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208


In [4]:
# Derive a Season label from the numeric Month column.
# The masks are checked in order and the first match wins:
#   Winter -> months 12, 1, 2
#   Summer -> months 6, 7, 8
#   Spring -> months 3, 4, 5
# Anything that matches none of the masks falls through to 'Fall'.
df['Season'] = np.select(
    [
        (df['Month'] < 3) | (df['Month'] > 11),
        (df['Month'] > 5) & (df['Month'] < 9),
        (df['Month'] > 2) & (df['Month'] < 6),
    ],
    ['Winter', 'Summer', 'Spring'],
    default='Fall',
)
df.head(n=5)

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male,Season
0,5/14/19,BALLROOM,504,7560.0,15,504,4785.22,Tuesday,5,0.173646,30.078358,0.758037,Spring
1,12/2/18,BALLROOM,502,11044.0,22,502,4493.17,Sunday,12,0.299224,30.724574,0.693456,Winter
2,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351,Spring
3,6/15/19,BALLROOM,501,10020.0,20,501,4535.97,Saturday,6,0.141881,30.819497,0.462539,Summer
4,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208,Spring


In [5]:
# Remove the columns that are redundant or not needed as inputs to this model.
# Note: `df` is rebound here, so re-running this cell without re-running the
# cells above raises a KeyError (the columns are already gone).
# TODO: it would be worth trying this model both with and without ROOM.
df = df.drop(columns=["FINAL", "DATE", "Month", "BAR RINGS", "TOTAL ADV"])

In [6]:

# Capture the remaining column labels as a NumPy array and display them,
# which confirms what is left after the column drop above.
columns = df.columns.values
columns

array(['ROOM', 'AVG TPS', 'TOTAL COUNT', 'Day of Week',
       'streams_transformed', 'average_age', 'percent_male', 'Season'],
      dtype=object)

In [7]:
# df['AVG TPS'] = df['AVG TPS'].astype('int32').dtypes
# df.dtypes

In [8]:
# One-hot encode the three categorical columns into indicator columns.
# Every level is kept (no reference level is dropped), so the indicators of
# each group always sum to 1 -- this is why the fitted ROOM_BALLROOM and
# ROOM_TAVERN coefficients below are exact negatives of one another.
df_d = pd.get_dummies(
    data=df,
    columns=['ROOM', 'Day of Week', 'Season'],
)
df_d.head(n=5)

Unnamed: 0,AVG TPS,TOTAL COUNT,streams_transformed,average_age,percent_male,ROOM_BALLROOM,ROOM_TAVERN,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,15,504,0.173646,30.078358,0.758037,1,0,0,0,0,0,0,1,0,0,1,0,0
1,22,502,0.299224,30.724574,0.693456,1,0,0,0,0,1,0,0,0,0,0,0,1
2,17,501,0.269953,35.29802,0.741351,1,0,0,0,0,0,0,1,0,0,1,0,0
3,20,501,0.141881,30.819497,0.462539,1,0,0,0,1,0,0,0,0,0,0,1,0
4,20,500,1.0,29.599578,0.755208,1,0,0,0,0,0,1,0,0,0,1,0,0


## Initial model for ticket price

In [9]:
# df_tix_count = df_d.drop(["INDEX","BAR RINGS","FINAL","AVG TPS"],axis=1)

In [10]:
# determine the predictor
X = df_d.drop("AVG TPS", axis=1)
y = df_d["AVG TPS"]
#y = np.asarray(df_d["AVG TPS"],dtype=np.float64)
print(X.shape, y.shape)

(190, 17) (190,)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# chi2 is not called by any active code in this notebook.
from sklearn.feature_selection import chi2
# NOTE(review): `model` is not referenced again in the visible notebook; the
# fits below each create their own estimator (model_A ... model_F).
model = LinearRegression()

In [12]:
# Display the feature column labels of X, i.e. the inputs the models below are fitted on.
X.columns.values

array(['TOTAL COUNT', 'streams_transformed', 'average_age',
       'percent_male', 'ROOM_BALLROOM', 'ROOM_TAVERN',
       'Day of Week_Friday', 'Day of Week_Monday', 'Day of Week_Saturday',
       'Day of Week_Sunday', 'Day of Week_Thursday',
       'Day of Week_Tuesday', 'Day of Week_Wednesday', 'Season_Fall',
       'Season_Spring', 'Season_Summer', 'Season_Winter'], dtype=object)

## MODELS

### random state =42, test size = .2

In [13]:
# Split configuration for this run.
random_state, test_size = 42, 0.2

# Hold out `test_size` of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
feature_list = X_train.columns.tolist()

# Fit a linear regression on every feature column of X.
model_A = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the training rows and on the held-out rows.
score_train = model_A.score(X_train, y_train)
score_test = model_A.score(X_test, y_test)

# Pair each fitted coefficient with its feature name.
# (A chi2-based p-value column was tried here previously and is disabled.)
coef_feat_df_A = pd.DataFrame({'coef': model_A.coef_, 'feature': feature_list})

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_A.intercept_)
coef_feat_df_A.head(n=20)


random state = 42, test size = 0.2
training Score: 0.6010091643402901
testing Score: 0.5468708321049158
y-axis intercept:  8.602827149312112


Unnamed: 0,coef,feature
0,0.008802,TOTAL COUNT
1,0.024755,streams_transformed
2,0.263526,average_age
3,-4.952771,percent_male
4,2.550423,ROOM_BALLROOM
5,-2.550423,ROOM_TAVERN
6,0.010854,Day of Week_Friday
7,0.152378,Day of Week_Monday
8,-0.390504,Day of Week_Saturday
9,-0.323919,Day of Week_Sunday


In [14]:
# Predict ticket prices for the X_test left by the most recently run split cell.
predictions = model_A.predict(X_test)

# Side-by-side comparison; the row labels come from y_test's index.
pred_df = pd.DataFrame(
    {
        "Predicted Ticket Price": predictions,
        "Actual Ticket Price": y_test,
    }
)
pred_df.head(n=10)

Unnamed: 0,Predicted Ticket Price,Actual Ticket Price
175,11.303599,12
180,12.728877,10
111,13.408005,15
65,16.310372,15
101,11.677333,15
15,18.5099,25
9,20.883224,22
16,20.237651,30
141,12.23941,12
124,11.600592,10


### random state =7, test size = .2

In [15]:
# Split configuration for this run.
random_state, test_size = 7, 0.2

# Hold out `test_size` of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
feature_list = X_train.columns.tolist()

# Fit a linear regression on every feature column of X.
model_B = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the training rows and on the held-out rows.
score_train = model_B.score(X_train, y_train)
score_test = model_B.score(X_test, y_test)

# Pair each fitted coefficient with its feature name.
# (A chi2-based p-value column was tried here previously and is disabled.)
coef_feat_df_B = pd.DataFrame({'coef': model_B.coef_, 'feature': feature_list})

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_B.intercept_)
coef_feat_df_B.head(n=20)


random state = 7, test size = 0.2
training Score: 0.6082638662454593
testing Score: 0.4786001122994793
y-axis intercept:  8.554595477926513


Unnamed: 0,coef,feature
0,0.012577,TOTAL COUNT
1,-1.075404,streams_transformed
2,0.230218,average_age
3,-3.792268,percent_male
4,2.367294,ROOM_BALLROOM
5,-2.367294,ROOM_TAVERN
6,0.285936,Day of Week_Friday
7,0.369694,Day of Week_Monday
8,-0.402699,Day of Week_Saturday
9,-0.883698,Day of Week_Sunday


In [16]:
# Predict ticket prices for the X_test left by the most recently run split cell.
predictions = model_B.predict(X_test)

# Side-by-side comparison; the row labels come from y_test's index.
pred_df = pd.DataFrame(
    {
        "Predicted Ticket Price": predictions,
        "Actual Ticket Price": y_test,
    }
)
pred_df.head(n=10)

Unnamed: 0,Predicted Ticket Price,Actual Ticket Price
22,19.570351,23
177,10.653986,11
46,17.970639,19
63,19.748507,20
24,20.273473,15
185,10.804011,10
40,17.89485,15
188,11.513857,10
78,16.002242,13
96,15.436813,20


### random state =0, test size = .2

In [17]:
# Split configuration for this run.
random_state, test_size = 0, 0.2

# Hold out `test_size` of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
# (The former `feature_list = []` pre-initialisation was a dead store: the
# name was overwritten here before anything read it, so it has been removed.)
feature_list = X_train.columns.tolist()

# Fit a linear regression on every feature column of X.
# model_C is the estimator that the pickling cell further down serializes.
model_C = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the training rows and on the held-out rows.
score_train = model_C.score(X_train, y_train)
score_test = model_C.score(X_test, y_test)

# Pair each fitted coefficient with its feature name.
# (A chi2-based p-value column was tried here previously and is disabled.)
coef_feat_df_C = pd.DataFrame({'coef': model_C.coef_, 'feature': feature_list})

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_C.intercept_)
coef_feat_df_C.head(n=20)


random state = 0, test size = 0.2
training Score: 0.6092614588124586
testing Score: 0.36824761571079945
y-axis intercept:  8.755529044965051


Unnamed: 0,coef,feature
0,0.009347,TOTAL COUNT
1,1.335218,streams_transformed
2,0.258991,average_age
3,-5.364557,percent_male
4,2.603226,ROOM_BALLROOM
5,-2.603226,ROOM_TAVERN
6,0.054316,Day of Week_Friday
7,0.112716,Day of Week_Monday
8,-0.296669,Day of Week_Saturday
9,-0.45267,Day of Week_Sunday


In [18]:
# Predict ticket prices for the X_test left by the most recently run split cell.
predictions = model_C.predict(X_test)

# Side-by-side comparison; the row labels come from y_test's index.
pred_df = pd.DataFrame(
    {
        "Predicted Ticket Price": predictions,
        "Actual Ticket Price": y_test,
    }
)
pred_df.head(n=10)

Unnamed: 0,Predicted Ticket Price,Actual Ticket Price
108,11.881866,13
74,16.843263,15
161,11.093269,12
95,11.407754,15
123,10.19462,10
71,12.820028,15
18,20.556838,18
124,11.372889,10
143,17.702167,12
7,20.851235,20


### random state =0, test size = .3

In [19]:
# Split configuration for this run.
random_state, test_size = 0, 0.3

# Hold out `test_size` of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
feature_list = X_train.columns.tolist()

# Fit a linear regression on every feature column of X.
model_D = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the training rows and on the held-out rows.
score_train = model_D.score(X_train, y_train)
score_test = model_D.score(X_test, y_test)

# Pair each fitted coefficient with its feature name.
# (A chi2-based p-value column was tried here previously and is disabled.)
coef_feat_df_D = pd.DataFrame({'coef': model_D.coef_, 'feature': feature_list})

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_D.intercept_)
coef_feat_df_D.head(n=20)




random state = 0, test size = 0.3
training Score: 0.5992296842956133
testing Score: 0.49911423329511395
y-axis intercept:  9.196384550797209


Unnamed: 0,coef,feature
0,0.00912,TOTAL COUNT
1,1.377795,streams_transformed
2,0.23819,average_age
3,-4.942058,percent_male
4,2.600191,ROOM_BALLROOM
5,-2.600191,ROOM_TAVERN
6,0.03127,Day of Week_Friday
7,-0.027863,Day of Week_Monday
8,-0.514791,Day of Week_Saturday
9,-0.45757,Day of Week_Sunday


In [20]:
# Predict ticket prices for the X_test left by the most recently run split cell.
predictions = model_D.predict(X_test)

# Side-by-side comparison; the row labels come from y_test's index.
pred_df = pd.DataFrame(
    {
        "Predicted Ticket Price": predictions,
        "Actual Ticket Price": y_test,
    }
)
pred_df.head(n=10)

Unnamed: 0,Predicted Ticket Price,Actual Ticket Price
108,11.379634,13
74,16.844677,15
161,11.133573,12
95,11.471185,15
123,10.292126,10
71,12.576236,15
18,20.922177,18
124,11.13094,10
143,17.44807,12
7,20.826261,20


### random state =42, test size = .3

In [21]:
# Split configuration for this run.
random_state, test_size = 42, 0.3

# Hold out `test_size` of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
feature_list = X_train.columns.tolist()

# Fit a linear regression on every feature column of X.
model_E = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the training rows and on the held-out rows.
score_train = model_E.score(X_train, y_train)
score_test = model_E.score(X_test, y_test)

# Pair each fitted coefficient with its feature name.
# (A chi2-based p-value column was tried here previously and is disabled.)
coef_feat_df_E = pd.DataFrame({'coef': model_E.coef_, 'feature': feature_list})

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_E.intercept_)
coef_feat_df_E.head(n=20)



random state = 42, test size = 0.3
training Score: 0.602670048011366
testing Score: 0.5469899560630385
y-axis intercept:  7.5943548105514305


Unnamed: 0,coef,feature
0,0.008472,TOTAL COUNT
1,0.170123,streams_transformed
2,0.281638,average_age
3,-4.259546,percent_male
4,2.383761,ROOM_BALLROOM
5,-2.383761,ROOM_TAVERN
6,-0.05162,Day of Week_Friday
7,0.064864,Day of Week_Monday
8,-0.335321,Day of Week_Saturday
9,-0.016273,Day of Week_Sunday


In [22]:
# Predict ticket prices for the X_test left by the most recently run split cell.
predictions = model_E.predict(X_test)

# Side-by-side comparison; the row labels come from y_test's index.
pred_df = pd.DataFrame(
    {
        "Predicted Ticket Price": predictions,
        "Actual Ticket Price": y_test,
    }
)
pred_df.head(n=10)

Unnamed: 0,Predicted Ticket Price,Actual Ticket Price
175,11.142228,12
180,12.395938,10
111,13.111773,15
65,15.787101,15
101,11.646389,15
15,18.304988,25
9,20.288297,22
16,19.705541,30
141,12.353754,12
124,11.376215,10


### random state =7, test size = .3

In [23]:
# Split configuration for this run.
random_state, test_size = 7, 0.3

# Hold out `test_size` of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
feature_list = X_train.columns.tolist()

# Fit a linear regression on every feature column of X.
model_F = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the training rows and on the held-out rows.
score_train = model_F.score(X_train, y_train)
score_test = model_F.score(X_test, y_test)

# Pair each fitted coefficient with its feature name.
# (A chi2-based p-value column was tried here previously and is disabled.)
coef_feat_df_F = pd.DataFrame({'coef': model_F.coef_, 'feature': feature_list})

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_F.intercept_)
coef_feat_df_F.head(n=20)



random state = 7, test size = 0.3
training Score: 0.6304850441031955
testing Score: 0.4479961727973836
y-axis intercept:  8.34891468388517


Unnamed: 0,coef,feature
0,0.011661,TOTAL COUNT
1,-0.939178,streams_transformed
2,0.237929,average_age
3,-3.414,percent_male
4,2.5414,ROOM_BALLROOM
5,-2.5414,ROOM_TAVERN
6,0.256802,Day of Week_Friday
7,-0.211116,Day of Week_Monday
8,-0.3397,Day of Week_Saturday
9,-0.468944,Day of Week_Sunday


In [24]:
# Predict ticket prices for the X_test left by the most recently run split cell.
predictions = model_F.predict(X_test)

# Side-by-side comparison; the row labels come from y_test's index.
pred_df = pd.DataFrame(
    {
        "Predicted Ticket Price": predictions,
        "Actual Ticket Price": y_test,
    }
)
pred_df.head(n=10)

Unnamed: 0,Predicted Ticket Price,Actual Ticket Price
22,20.419491,23
177,11.451777,11
46,18.329007,19
63,20.226885,20
24,20.07647,15
185,10.836464,10
40,18.201677,15
188,12.02734,10
78,16.117841,13
96,16.478129,20


## Pickling the preferred model


In [25]:
import pickle

# Destination file for the serialized ticket-price linear regression.
lin_reg_pkl_filename = 'lin_reg_tix_price.pkl'

# NOTE(review): model_C reported the lowest testing score of the six fits
# above (0.368) -- confirm it is the intended model to persist.
# The `with` block closes the file even if pickle.dump raises, so a failed
# dump cannot leak an open handle.
with open(lin_reg_pkl_filename, 'wb') as lin_reg_model_pkl:
    pickle.dump(model_C, lin_reg_model_pkl)

In [26]:
# Loading the saved model pickle
lin_reg_model_pkl = open(lin_reg_pkl_filename, 'rb')
lin_reg_model = pickle.load(lin_reg_model_pkl)
print("Loaded Linear Regression model :: ", lin_reg_model)

Loaded Linear Regression model ::  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
