### Load the necessary modules

In [1]:
import numpy as np
import pandas as pd

from fastparquet import ParquetFile,write
import matplotlib.pyplot as plt
from matplotlib import cm
from tqdm import tqdm

from cm2df import cm2df,precision_recall_fscore_support_metrics2df
from sklearn.metrics import confusion_matrix, classification_report,precision_recall_fscore_support

### Load the trainign data (cleaned and encoded)

In [2]:
pf=ParquetFile('subset_feature_4ML_110619.parq')
udf=pf.to_pandas()

In [3]:
udf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32880 entries, 11049447 to 9968570
Data columns (total 35 columns):
category                 32880 non-null int64
past                     32880 non-null int32
votes                    32880 non-null int64
is_eventbrite            32880 non-null int32
is_free                  32880 non-null int32
doors                    32880 non-null int32
sold_out                 32880 non-null int32
venue.id                 32880 non-null int64
venue.popularity         32880 non-null float64
venue.zip                32880 non-null int64
ticket_allages           32880 non-null int32
ticket_price_low         32880 non-null float64
ticket_price_max         32880 non-null float64
min_age                  32880 non-null int32
artist.popularity.sum    32880 non-null float64
artist.popularity.avg    32880 non-null float64
artist.popularity.max    32880 non-null float64
dow                      32880 non-null int64
doy                      32880 non-null in

In [4]:
udf['multiday'] = udf['multiday'].astype(int)

In [5]:
udf.head()

Unnamed: 0_level_0,category,past,votes,is_eventbrite,is_free,doors,sold_out,venue.id,venue.popularity,venue.zip,...,venue.title,venue.address,venue.city,venue.state,venue.latitude,venue.longitude,duration,duration_day,multiday,avg_votes_pday
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11049447,0,0,784,0,0,1,0,15,15.0,0,...,Stubb's,801 Red River,Austin,TX,30.268458,-97.736175,7.0,1.0,0,784.0
11049456,0,0,223,0,0,1,0,70,15.0,8,...,Emo's,2015 E. Riverside Dr.,Austin,TX,30.240266,-97.728516,6.0,1.0,0,223.0
11049430,0,0,163,0,0,1,0,29,15.0,2,...,Historic Scoot Inn,1308 E. Fourth,Austin,TX,30.262141,-97.729385,7.5,1.0,0,163.0
11141496,1,0,126,1,0,1,0,28,1.0,0,...,Fallout Theater,616 Lavaca St,Austin,TX,30.269536,-97.74523,6.5,1.0,0,126.0
11109390,1,0,83,0,1,0,0,94,1.0,2,...,The Brixton,1412 E. 6th Street,Austin,TX,30.26359,-97.727604,5.0,1.0,0,83.0


In [6]:
usecolsX=['category', 'past', 'is_eventbrite', 'is_free', 'doors',
       'sold_out', 'venue.id', 'venue.popularity', 'venue.zip',
       'ticket_allages', 'ticket_price_low', 'ticket_price_max', 'min_age',
       'artist.popularity.sum', 'artist.popularity.avg',
       'artist.popularity.max', 'dow', 'doy', 'month', 'day', 'hour',
       'venue.tol_num_events', 'duration', 'duration_day', 'multiday']
usecoly1=['votes']
usecoly2=['avg_votes_pday']

In [7]:
# a function to create udf_y based on 3 classes: low, mid, high
def get_class(df,thre1,thre2):
    n=len(df)
    newdf=[]
    cut1=min([thre1,thre2])
    cut2=max([thre1,thre2])
    for item in df:
        if item<cut1: newdf+=[0]
        elif (item<cut2) and (item>=cut1):newdf+=[1]
        else: newdf+=[2]
    return newdf

In [8]:
# import algorithms for model comparisons
# tree models
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# scaling of the features
from sklearn.preprocessing import MinMaxScaler

## Best Model Estimator: Random Forest> + SMOTE Upsampling high votes to same as medium votes , best Hyperparameters

In [9]:
udf_X=udf[usecolsX+['votes']]
udf_y=udf[usecoly1]
y=get_class(udf_y.values,80,300)

In [12]:
from sklearn.model_selection import StratifiedKFold
# do upsample with replacement
from sklearn.utils import resample

# prepare the scaler beforehand
scalar = MinMaxScaler()
scalar.fit(udf_X.iloc[:,0:25])
X = scalar.transform(udf_X.iloc[:,0:25])

# setting up splits
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X, y)

5

In [15]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 

Using Theano backend.


In [106]:
cv_pre_rec=[]
cv_f1_rec=[]
cv_recal_rec=[]

In [124]:
for train_index, test_index in skf.split(X, y):
    # spliting the data
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train=[y[i] for i in train_index]
    y_test =[y[i] for i in test_index]
    print("in the testing data:")
    print("number of low votes:",list(y_test).count(0))
    print("number of medium votes:",list(y_test).count(1))
    print("number of high votes:",list(y_test).count(2))

    # upsample the proportion of high votes to be the same as medium
    y_train_12_mask=np.array([True if item in [1,2] else False for item in y_train], dtype=bool)
    X_train_12=X_train[y_train_12_mask]
    y_train_12=[y_train[x] for x in range(len(y_train)) if y_train_12_mask[x]]
    y_train_0_mask=np.array([True if item==0 else False for item in y_train], dtype=bool)
    X_train_0=X_train[y_train_0_mask]
    
    # balance and upsample the high votes:
    X_smote_up, y_smote_up=SMOTE().fit_resample(X_train_12, y_train_12)
    print("in the training data:")
    print("number of low votes:",list(y_train).count(0))
    print("number of medium votes:",list(y_smote_up).count(1))
    print("number of high votes:",list(y_smote_up).count(2))
    
    # concate the upsampled class 1,2 with 0, shuffle the order, and then do model training
    X_ups_all=np.concatenate((X_train_0,X_smote_up))
    y_ups_all=np.concatenate((np.array([0 for x in range(X_train_0.shape[0])]),y_smote_up))
    
    # assemble a shuffled training dataset
    perm=np.random.permutation(X_ups_all.shape[0])
    X_train_input=X_ups_all[perm,:]
    y_train_input=y_ups_all[perm]
    # start off Random Forest , use best parameters
    model=RandomForestClassifier(n_estimators=120,
     min_samples_split=5,
     min_samples_leaf=4,
     max_features=10,
     max_depth=22,
     criterion='gini',
     bootstrap=False,verbose=0, n_jobs=-1)
    model.fit(X_train_input,y_train_input)
    y_pred_test=model.predict(X_test)
    
    # print the result confusion matrix, save precision/recall/f1
    a=precision_recall_fscore_support(y_test, y_pred_test)
    print("confusion matrix:")
    print(confusion_matrix(y_test,y_pred_test))
    print(classification_report(y_test,y_pred_test))
    f1s=a[2]
    cv_pre_rec+=[a[0]]
    cv_recal_rec+=[a[1]]
    # create the weighted f1 score: larger emphasis on high-votes bin and medium
    # adjusted f1_forall= (f1_low+f1_mid*2+f1_high*3)/6
    adjf1=(f1s[0]+f1s[1]*2+f1s[2]*3)/6
    print("weighted f1 score:",adjf1)
    cv_f1_rec+=[adjf1]

in the testing data:
number of low votes: 6323
number of medium votes: 212
number of high votes: 42
in the training data:
number of low votes: 25288
number of medium votes: 847
number of high votes: 847
confusion matrix:
[[6285   31    7]
 [ 132   71    9]
 [  23   10    9]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      6323
           1       0.63      0.33      0.44       212
           2       0.36      0.21      0.27        42

    accuracy                           0.97      6577
   macro avg       0.66      0.51      0.56      6577
weighted avg       0.96      0.97      0.96      6577

weighted f1 score: 0.44456525376191197
in the testing data:
number of low votes: 6322
number of medium votes: 212
number of high votes: 42
in the training data:
number of low votes: 25289
number of medium votes: 847
number of high votes: 847
confusion matrix:
[[6295   23    4]
 [ 125   81    6]
 [  27    5   10]]
              precision    r

### Final training on a full model

In [16]:
FinModel=RandomForestClassifier(n_estimators=120,
     min_samples_split=5,
     min_samples_leaf=4,
     max_features=10,
     max_depth=22,
     criterion='gini',
     bootstrap=False,verbose=0)

In [17]:
# upsampling of the high-votes
y_12_mask=np.array([True if item in [1,2] else False for item in y], dtype=bool)
X_12=X[y_12_mask]
y_12=[y[x] for x in range(len(y)) if y_12_mask[x]]
y_0_mask=np.array([True if item==0 else False for item in y], dtype=bool)
X_0=X[y_0_mask]

# balance and upsample the high votes:
X_smote_up, y_smote_up=SMOTE().fit_resample(X_12, y_12)
print("in the original data:")
print("number of low votes:",list(y).count(0))
print("number of medium votes:",list(y).count(1))
print("number of high votes:",list(y).count(2))
print("in the upsampled data:")
print("number of low votes:",list(y).count(0))
print("number of medium votes:",list(y_smote_up).count(1))
print("number of high votes:",list(y_smote_up).count(2))

in the original data:
number of low votes: 31611
number of medium votes: 1059
number of high votes: 210
in the upsampled data:
number of low votes: 31611
number of medium votes: 1059
number of high votes: 1059


In [18]:
# reconcate the upsampled class 1,2 with 0, shuffle the order, and then do model training
X_ups_all=np.concatenate((X_0,X_smote_up))
y_ups_all=np.concatenate((np.array([0 for x in range(X_0.shape[0])]),y_smote_up))

# assemble a shuffled training dataset
perm=np.random.permutation(X_ups_all.shape[0])
X_train_input=X_ups_all[perm,:]
y_train_input=y_ups_all[perm]

In [19]:
FinModel.fit(X_train_input,y_train_input)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=22, max_features=10, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=120,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [131]:
y_pred=FinModel.predict(X)

In [132]:
y_prob=FinModel.predict_proba(X)

In [133]:
result=list(zip(y,y_pred,udf['votes'].values,y_prob[:,0],y_prob[:,1],y_prob[:,2]))

In [134]:
result_df=pd.DataFrame(result,columns=['class actual','class predicted','actual votes','probability in class0','probability in class1','probability in class2'])

In [135]:
result_df.tail()

Unnamed: 0,class actual,class predicted,actual votes,probability in class0,probability in class1,probability in class2
32875,0,0,0,0.999963,3.7e-05,0.0
32876,0,0,0,0.999667,0.000229,0.000104
32877,0,0,0,0.999995,5e-06,0.0
32878,0,0,0,0.99204,0.002121,0.005838
32879,0,0,0,0.999841,0.000102,5.7e-05


In [148]:
#show all misclassifications
sub0=result_df.loc[result_df['class actual'] == 0]
sub1=result_df.loc[result_df['class actual'] == 1]
sub2=result_df.loc[result_df['class actual'] == 2]
sub0.loc[sub0['class predicted']!=0].sort_values('actual votes',ascending=False) # missclassified low votes

Unnamed: 0,class actual,class predicted,actual votes,probability in class0,probability in class1,probability in class2
907,0,1,79,0.303135,0.452579,0.244286
6223,0,2,79,0.354881,0.150774,0.494345
8604,0,1,74,0.405956,0.569792,0.024253
24789,0,1,74,0.282427,0.717573,0.0
3219,0,1,71,0.440426,0.544454,0.01512
7218,0,1,63,0.471607,0.518909,0.009484
22673,0,2,54,0.306627,0.341825,0.351548
9935,0,2,33,0.459008,0.045099,0.495893
14367,0,1,30,0.310238,0.444901,0.244861
22802,0,1,29,0.341329,0.649365,0.009306


In [149]:
sub2.loc[sub2['class predicted']!=2].sort_values('actual votes',ascending=False) # missclassified high votes

Unnamed: 0,class actual,class predicted,actual votes,probability in class0,probability in class1,probability in class2
32526,2,0,9120,0.790625,0.071111,0.138264
32006,2,0,8350,0.794792,0.069028,0.136181
31386,2,0,5176,0.837718,0.002778,0.159504
30297,2,0,3959,0.886468,0.096587,0.016944
31387,2,0,2515,0.972897,0.027103,0.000000
26383,2,0,1468,0.772619,0.047401,0.179980
4758,2,0,1329,0.773926,0.007220,0.218853
32527,2,0,1130,0.935169,0.022560,0.042272
17743,2,0,1126,0.782782,0.012115,0.205103
24007,2,0,1046,0.831925,0.022876,0.145199


In [143]:
# save the prediction to dataframe
result_df.to_csv('EventfulFinalMod_result111819.csv')

In [20]:
# save the model
import dill
# save the model to disk
filename = 'finalized_model111819.sav'
#pickle.dump(FinModel, open(filename, 'wb'))
### suggestion for loading the model from disk
### loaded_model = pickle.load(open(filename, 'rb'))
with open(filename, "wb") as dill_file:
    dill.dump(FinModel, dill_file)

In [151]:
# save the scalar for X input, for later useage :
from sklearn.externals import joblib  

# 'dump' scalar file
joblib.dump(scalar, 'MinMaxScalar_111819.pkl') 

['MinMaxScalar_111819.pkl']