In [1]:
import pandas as pd

In [2]:
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt

## Read the CSV and Perform Basic Data Cleaning

AVG TPS = average ticket price sold 

In [3]:
# import historic venue data
df = pd.read_csv("working_model_AMP_2-bar_ring_edit.csv")
df.head()

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male
0,3/2/19,BALLROOM,500,11000.0,22,500,7038.14,Saturday,3,0.44341,30.58885,0.672169
1,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208
2,3/27/19,BALLROOM,500,7500.0,15,500,5975.79,Wednesday,3,0.965554,30.861255,0.465789
3,12/18/18,BALLROOM,390,8790.0,20,435,5750.0,Tuesday,12,0.053904,34.068242,0.591354
4,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351


In [4]:
# convert months to seasons
df['Season']= np.where((df['Month']<3) | (df['Month']>11),'Winter',\
     np.where((df['Month']<9) & (df['Month']>5),'Summer',\
     np.where((df['Month']>2) & (df['Month']<6),'Spring','Fall')))
df.head()

Unnamed: 0,DATE,ROOM,TOTAL ADV,FINAL,AVG TPS,TOTAL COUNT,BAR RINGS,Day of Week,Month,streams_transformed,average_age,percent_male,Season
0,3/2/19,BALLROOM,500,11000.0,22,500,7038.14,Saturday,3,0.44341,30.58885,0.672169,Spring
1,5/2/19,BALLROOM,500,10000.0,20,500,6841.79,Thursday,5,1.0,29.599578,0.755208,Spring
2,3/27/19,BALLROOM,500,7500.0,15,500,5975.79,Wednesday,3,0.965554,30.861255,0.465789,Spring
3,12/18/18,BALLROOM,390,8790.0,20,435,5750.0,Tuesday,12,0.053904,34.068242,0.591354,Winter
4,5/22/18,BALLROOM,501,8517.0,17,501,5492.03,Tuesday,5,0.269953,35.29802,0.741351,Spring


In [5]:
# delete redundant or unneeded columns for this model

# it would be worth trying this one with and without AVG TPS, ROOM, TOTAL ADV

df =df.drop(["FINAL","DATE","Month","TOTAL COUNT"],axis=1)

In [6]:

columns = df.columns.values
columns

array(['ROOM', 'TOTAL ADV', 'AVG TPS', 'BAR RINGS', 'Day of Week',
       'streams_transformed', 'average_age', 'percent_male', 'Season'],
      dtype=object)

In [7]:
# one hot encoding
df_d = pd.get_dummies(df)
df_d.head()

Unnamed: 0,TOTAL ADV,AVG TPS,BAR RINGS,streams_transformed,average_age,percent_male,ROOM_BALLROOM,ROOM_TAVERN,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,500,22,7038.14,0.44341,30.58885,0.672169,1,0,0,0,1,0,0,0,0,0,1,0,0
1,500,20,6841.79,1.0,29.599578,0.755208,1,0,0,0,0,0,1,0,0,0,1,0,0
2,500,15,5975.79,0.965554,30.861255,0.465789,1,0,0,0,0,0,0,0,1,0,1,0,0
3,390,20,5750.0,0.053904,34.068242,0.591354,1,0,0,0,0,0,0,1,0,0,0,0,1
4,501,17,5492.03,0.269953,35.29802,0.741351,1,0,0,0,0,0,0,1,0,0,1,0,0


## Initial model for bar rings


In [8]:
# df_tix_count = df_d.drop(["INDEX","BAR RINGS","FINAL","AVG TPS"],axis=1)

In [9]:
# df_A = df_d.drop(['Day of Week_Monday','Day of Week_Sunday','Season_Summer','Season_Fall',"Day of Week_Tuesday"], axis=1)


In [10]:
# determine the predictor
X = df_d.drop("BAR RINGS", axis=1)

y = df_d["BAR RINGS"]
print(X.shape, y.shape)

(181, 18) (181,)


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import chi2
model = LinearRegression()

In [12]:
X.columns.values

array(['TOTAL ADV', 'AVG TPS', 'streams_transformed', 'average_age',
       'percent_male', 'ROOM_BALLROOM', 'ROOM_TAVERN',
       'Day of Week_Friday', 'Day of Week_Monday', 'Day of Week_Saturday',
       'Day of Week_Sunday', 'Day of Week_Thursday',
       'Day of Week_Tuesday', 'Day of Week_Wednesday', 'Season_Fall',
       'Season_Spring', 'Season_Summer', 'Season_Winter'], dtype=object)

In [13]:
X.head()

Unnamed: 0,TOTAL ADV,AVG TPS,streams_transformed,average_age,percent_male,ROOM_BALLROOM,ROOM_TAVERN,Day of Week_Friday,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday,Season_Fall,Season_Spring,Season_Summer,Season_Winter
0,500,22,0.44341,30.58885,0.672169,1,0,0,0,1,0,0,0,0,0,1,0,0
1,500,20,1.0,29.599578,0.755208,1,0,0,0,0,0,1,0,0,0,1,0,0
2,500,15,0.965554,30.861255,0.465789,1,0,0,0,0,0,0,0,1,0,1,0,0
3,390,20,0.053904,34.068242,0.591354,1,0,0,0,0,0,0,1,0,0,0,0,1
4,501,17,0.269953,35.29802,0.741351,1,0,0,0,0,0,0,1,0,0,1,0,0


## MODELS

### random state =42, test size = .2

In [14]:
# df_A =df_A.drop(["BAR RINGS",'Day of Week_Monday','Day of Week_Sunday','Season_Summer','Season_Fall',"Day of Week_Tuesday"], axis=1)


In [15]:
random_state= 42
test_size= 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_A = LinearRegression()


# Fitting our model with all of our features in X
model_A.fit(X_train, y_train)

score_train = model_A.score(X_train, y_train)
score_test = model_A.score(X_test, y_test)



# scores, pvalues_train = chi2(X_train, y_train)
# pvalues_train
# scores, pvalues_test = chi2(X_test, y_test)
# pvalues_test

# coef_feat_df_A = pd.DataFrame(list(zip(model_A.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_A = pd.DataFrame(list(zip(model_A.coef_,  feature_list)))

coef_feat_df_A.columns = ['coef','feature']
coef_feat_df_A = coef_feat_df_A.sort_values('coef',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_A.intercept_)
coef_feat_df_A.head(20)


random state = 42, test size = 0.2
training Score: 0.8605791508319651
testing Score: 0.7600638591695039
y-axis intercept:  33.191099628964594


Unnamed: 0,coef,feature
12,-202.747826,Day of Week_Tuesday
16,-107.88155,Season_Summer
6,-103.40988,ROOM_TAVERN
8,-98.773796,Day of Week_Monday
2,-37.950414,streams_transformed
14,-33.166999,Season_Fall
15,-26.979377,Season_Spring
10,-25.080828,Day of Week_Sunday
1,-22.17774,AVG TPS
7,-21.19879,Day of Week_Friday


In [16]:
predictions = model_A.predict(X_test)

pred_df = pd.DataFrame({"Predicted Bar Rings": predictions , "Actual Bar Rings": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Bar Rings,Actual Bar Rings
19,4672.181273,3546.19
42,1068.717168,1915.26
153,328.441725,331.86
78,993.150428,1119.41
145,680.771402,386.43
15,2457.88227,3719.49
24,2324.754306,3076.76
68,1432.658243,1225.19
113,768.094212,729.55
118,761.566848,696.89


### random state =7, test size = .2

In [17]:
random_state= 7
test_size= 0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_B = LinearRegression()

# Fitting our model with all of our features in X
model_B.fit(X_train, y_train)

score_train = model_B.score(X_train, y_train)
score_test = model_B.score(X_test, y_test)



# scores, pvalues_train = chi2(X_train, y_train)
# pvalues_train
# scores, pvalues_test = chi2(X_test, y_test)
# pvalues_test

# coef_feat_df_B = pd.DataFrame(list(zip(model_B.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_B = pd.DataFrame(list(zip(model_B.coef_,  feature_list)))

coef_feat_df_B.columns = ['coef','feature']
# coef_feat_df_B = coef_feat_df_B.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_B.intercept_)
coef_feat_df_B.head(20)


random state = 7, test size = 0.2
training Score: 0.8608530754850067
testing Score: 0.6700809100755679
y-axis intercept:  -223.56469667932993


Unnamed: 0,coef,feature
0,8.912718,TOTAL ADV
1,-10.744015,AVG TPS
2,-60.979678,streams_transformed
3,16.111192,average_age
4,796.617247,percent_male
5,101.904446,ROOM_BALLROOM
6,-101.904446,ROOM_TAVERN
7,-16.64432,Day of Week_Friday
8,-83.612686,Day of Week_Monday
9,230.423426,Day of Week_Saturday


In [18]:
predictions = model_B.predict(X_test)

pred_df = pd.DataFrame({"Predicted Bar Rings": predictions , "Actual Bar Rings": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Bar Rings,Actual Bar Rings
138,610.789951,480.23
36,949.326,2054.4
170,353.775031,253.41
143,287.084927,407.5
126,706.014839,585.36
133,520.542883,528.92
102,105.309946,881.55
32,1167.72231,2250.4
120,1256.299555,660.7
145,794.970151,386.43


### random state =0, test size = .2

In [19]:
random_state= 0
test_size= 0.2
feature_list = []
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_C = LinearRegression()

# Fitting our model with all of our features in X
model_C.fit(X_train, y_train)

score_train = model_C.score(X_train, y_train)
score_test = model_C.score(X_test, y_test)



# scores, pvalues_train = chi2(X_train, y_train)
# pvalues_train
# scores, pvalues_test = chi2(X_test, y_test)
# pvalues_test

# coef_feat_df_C = pd.DataFrame(list(zip(model_C.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_C = pd.DataFrame(list(zip(model_C.coef_,  feature_list)))

coef_feat_df_C.columns = ['coef','feature']
# coef_feat_df_C = coef_feat_df_C.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_C.intercept_)
coef_feat_df_C.head(20)


random state = 0, test size = 0.2
training Score: 0.8602526358866978
testing Score: 0.7967795038034743
y-axis intercept:  82.8333192109867


Unnamed: 0,coef,feature
0,9.246078,TOTAL ADV
1,-8.73897,AVG TPS
2,-294.592688,streams_transformed
3,7.038408,average_age
4,758.951606,percent_male
5,135.683654,ROOM_BALLROOM
6,-135.683654,ROOM_TAVERN
7,76.906735,Day of Week_Friday
8,-95.522085,Day of Week_Monday
9,169.490319,Day of Week_Saturday


In [20]:
predictions = model_C.predict(X_test)

pred_df = pd.DataFrame({"Predicted Bar Rings": predictions , "Actual Bar Rings": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Bar Rings,Actual Bar Rings
83,1048.612499,1034.24
7,4706.311277,4916.85
61,473.897763,1475.39
176,253.996819,190.32
137,2832.197452,485.64
60,1794.572248,1477.28
141,642.350643,427.24
126,603.111024,585.36
163,620.70417,282.58
111,873.937153,740.27


### random state =0, test size = .3

In [21]:
random_state= 0
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_D = LinearRegression()

# Fitting our model with all of our features in X
model_D.fit(X_train, y_train)

score_train = model_D.score(X_train, y_train)
score_test = model_D.score(X_test, y_test)



# scores, pvalues_train = chi2(X_train, y_train)
# pvalues_train
# scores, pvalues_test = chi2(X_test, y_test)
# pvalues_test

# coef_feat_df_D = pd.DataFrame(list(zip(model_D.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_D = pd.DataFrame(list(zip(model_D.coef_,  feature_list)))

coef_feat_df_D.columns = ['coef','feature']
# coef_feat_df_D = coef_feat_df_A.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_D.intercept_)
coef_feat_df_D.head(20)




random state = 0, test size = 0.3
training Score: 0.865419890178325
testing Score: 0.7809464233029962
y-axis intercept:  354.386017165338


Unnamed: 0,coef,feature
0,9.396285,TOTAL ADV
1,-19.401688,AVG TPS
2,-139.26145,streams_transformed
3,3.525416,average_age
4,740.122028,percent_male
5,147.470426,ROOM_BALLROOM
6,-147.470426,ROOM_TAVERN
7,75.838939,Day of Week_Friday
8,-128.389944,Day of Week_Monday
9,154.86321,Day of Week_Saturday


In [22]:
predictions = model_D.predict(X_test)

pred_df = pd.DataFrame({"Predicted Bar Rings": predictions , "Actual Bar Rings": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Bar Rings,Actual Bar Rings
83,1050.87518,1034.24
7,4629.101045,4916.85
61,516.331714,1475.39
176,285.856359,190.32
137,2836.170686,485.64
60,1864.54507,1477.28
141,659.883563,427.24
126,638.483916,585.36
163,623.47003,282.58
111,913.738835,740.27


### random state =42, test size = .3

In [23]:
random_state= 42
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_E = LinearRegression()

# Fitting our model with all of our features in X
# Fitting our model with all of our features in X
model_E.fit(X_train, y_train)

score_train = model_E.score(X_train, y_train)
score_test = model_E.score(X_test, y_test)



# scores, pvalues_train = chi2(X_train, y_train)
# pvalues_train
# scores, pvalues_test = chi2(X_test, y_test)
# pvalues_test

# coef_feat_df_E = pd.DataFrame(list(zip(model_E.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_E = pd.DataFrame(list(zip(model_E.coef_,  feature_list)))

coef_feat_df_E.columns = ['coef','feature']
# coef_feat_df_E = coef_feat_df_E.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_E.intercept_)
coef_feat_df_E.head(20)



random state = 42, test size = 0.3
training Score: 0.8563473269854593
testing Score: 0.7878709989640345
y-axis intercept:  -99.26681622115575


Unnamed: 0,coef,feature
0,9.7274,TOTAL ADV
1,-19.855206,AVG TPS
2,-13.471133,streams_transformed
3,16.775603,average_age
4,717.358264,percent_male
5,58.629338,ROOM_BALLROOM
6,-58.629338,ROOM_TAVERN
7,-95.163284,Day of Week_Friday
8,-67.273413,Day of Week_Monday
9,147.153032,Day of Week_Saturday


In [24]:
predictions = model_E.predict(X_test)

pred_df = pd.DataFrame({"Predicted Bar Rings": predictions , "Actual Bar Rings": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Bar Rings,Actual Bar Rings
19,4589.450362,3546.19
42,984.975011,1915.26
153,323.816307,331.86
78,946.097785,1119.41
145,744.226817,386.43
15,2346.294819,3719.49
24,2231.076918,3076.76
68,1408.132861,1225.19
113,696.00299,729.55
118,669.588148,696.89


### random state =7, test size = .3

In [25]:
random_state= 7
test_size= 0.3
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state,test_size=test_size)
feature_list = list(X_train.columns.values)
model_F = LinearRegression()

# Fitting our model with all of our features in X
model_F.fit(X_train, y_train)

score_train = model_F.score(X_train, y_train)
score_test = model_F.score(X_test, y_test)



# scores, pvalues_train = chi2(X_train, y_train)
# pvalues_train
# scores, pvalues_test = chi2(X_test, y_test)
# pvalues_test

# coef_feat_df_F = pd.DataFrame(list(zip(model_F.coef_, pvalues_train, pvalues_test, feature_list)))
coef_feat_df_F = pd.DataFrame(list(zip(model_F.coef_,  feature_list)))

coef_feat_df_F.columns = ['coef','feature']
# coef_feat_df_F = coef_feat_df_F.sort_values('p-values train',ascending=True)

print(f"random state = {random_state}, test size = {test_size}")
print(f"training Score: {score_train}")
print(f"testing Score: {score_test}")
print('y-axis intercept: ', model_F.intercept_)
coef_feat_df_F.head(20)



random state = 7, test size = 0.3
training Score: 0.8524676329241583
testing Score: 0.8130911178965116
y-axis intercept:  -182.14623574331085


Unnamed: 0,coef,feature
0,9.025723,TOTAL ADV
1,-13.969047,AVG TPS
2,-66.331224,streams_transformed
3,18.567424,average_age
4,695.385859,percent_male
5,144.274965,ROOM_BALLROOM
6,-144.274965,ROOM_TAVERN
7,-70.063117,Day of Week_Friday
8,-20.349305,Day of Week_Monday
9,274.669033,Day of Week_Saturday


In [26]:
predictions = model_F.predict(X_test)

pred_df = pd.DataFrame({"Predicted Bar Rings": predictions , "Actual Bar Rings": y_test})
pred_df.head(10)

Unnamed: 0,Predicted Bar Rings,Actual Bar Rings
138,461.168123,480.23
36,1033.2429,2054.4
170,267.932094,253.41
143,291.735332,407.5
126,652.55796,585.36
133,551.425415,528.92
102,32.186035,881.55
32,1133.272768,2250.4
120,1243.744707,660.7
145,636.517571,386.43


## Pickling the prefered model


In [27]:
import pickle
# Dump the trained linear regression model with Pickle
lin_reg_pkl_filename = 'lin_reg_bar_rings.pkl'

# Open the file to save as pkl file
lin_reg_model_pkl = open(lin_reg_pkl_filename, 'wb')
pickle.dump(model_D, lin_reg_model_pkl)

# Close the pickle instances
lin_reg_model_pkl.close()

### testing pickled model

In [28]:
# Loading the saved model pickle
lin_reg_model_pkl = open('lin_reg_bar_rings.pkl', 'rb')
lin_reg_model = pickle.load(lin_reg_model_pkl)
print("Loaded Linear Regression model :: ", lin_reg_model)

Loaded Linear Regression model ::  LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)


In [31]:
predictions = lin_reg_model.predict(X_test)
predictions

array([ 813.48469785, 1258.20535425,  574.05965302,  595.51138037,
        638.48391614,  714.81030536,  392.44376883, 1389.2046535 ,
       1269.02249792,  732.14124484, 3091.57943168,  820.64763094,
        623.47002963,  741.34483983,  878.12188243,  913.73883534,
       1340.73906964,  474.09214797, 2830.48525485, 1948.90993275,
       1403.30761819, 1185.2736464 , 1278.32425471, 1245.18654848,
       3206.20228249,  500.85546767,  587.92123905,  460.58155478,
        212.6723573 ,  663.40932131,  622.90042056, 2617.00850567,
        291.68067842,  614.82989521, 2312.06987706, 1161.35657806,
        237.97519459,  285.85635858, 5326.15498941,  650.20751475,
       1694.3584878 ,  759.67659213,  414.26895005,  594.94967638,
        994.24537275, 3258.62781742, 2867.97704299, 1584.75262012,
       1270.64872743, 1046.85598503,  598.47730252,  585.78017626,
       5283.93639267, 4026.0231205 , 1864.54507032])

In [32]:
# testing single sample

X_single_sample = X.iloc[75]

X_single_sample

TOTAL ADV                38.000000
AVG TPS                  14.000000
streams_transformed       0.000364
average_age              35.588781
percent_male              0.413008
ROOM_BALLROOM             0.000000
ROOM_TAVERN               1.000000
Day of Week_Friday        0.000000
Day of Week_Monday        0.000000
Day of Week_Saturday      1.000000
Day of Week_Sunday        0.000000
Day of Week_Thursday      0.000000
Day of Week_Tuesday       0.000000
Day of Week_Wednesday     0.000000
Season_Fall               0.000000
Season_Spring             0.000000
Season_Summer             0.000000
Season_Winter             1.000000
Name: 75, dtype: float64

In [33]:
X_sample = np.asarray((38,14,0.000364,35.588781,.413008,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0))
# len(X_sample)
X_sample = X_sample.reshape(1,-1)
# X_single_sample = X_single_sample.values.reshape(1, -1)
print(X_sample)

[[3.8000000e+01 1.4000000e+01 3.6400000e-04 3.5588781e+01 4.1300800e-01
  0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00 1.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 1.0000000e+00]]


In [34]:
prediction = lin_reg_model.predict(X_sample)
prediction

array([992.17135432])