In [1]:
import numpy as np
import pandas as pd

from fastparquet import ParquetFile,write
import matplotlib.pyplot as plt
from matplotlib import cm
from tqdm import tqdm

from cm2df import cm2df,precision_recall_fscore_support_metrics2df
from sklearn.metrics import confusion_matrix, classification_report,precision_recall_fscore_support

In [2]:
pf=ParquetFile('subset_feature_4ML_110619.parq')
udf=pf.to_pandas()

In [3]:
udf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32880 entries, 11049447 to 9968570
Data columns (total 35 columns):
category                 32880 non-null int64
past                     32880 non-null int32
votes                    32880 non-null int64
is_eventbrite            32880 non-null int32
is_free                  32880 non-null int32
doors                    32880 non-null int32
sold_out                 32880 non-null int32
venue.id                 32880 non-null int64
venue.popularity         32880 non-null float64
venue.zip                32880 non-null int64
ticket_allages           32880 non-null int32
ticket_price_low         32880 non-null float64
ticket_price_max         32880 non-null float64
min_age                  32880 non-null int32
artist.popularity.sum    32880 non-null float64
artist.popularity.avg    32880 non-null float64
artist.popularity.max    32880 non-null float64
dow                      32880 non-null int64
doy                      32880 non-null in

In [4]:
udf['multiday'] = udf['multiday'].astype(int)

In [5]:
usecolsX=['category', 'past', 'is_eventbrite', 'is_free', 'doors',
       'sold_out', 'venue.id', 'venue.popularity', 'venue.zip',
       'ticket_allages', 'ticket_price_low', 'ticket_price_max', 'min_age',
       'artist.popularity.sum', 'artist.popularity.avg',
       'artist.popularity.max', 'dow', 'doy', 'month', 'day', 'hour',
       'venue.tol_num_events', 'duration', 'duration_day', 'multiday']
usecoly1=['votes']
usecoly2=['avg_votes_pday']

In [6]:
# a function to create udf_y based on 3 classes: low, mid, high
def get_class(df,thre1,thre2):
    n=len(df)
    newdf=[]
    cut1=min([thre1,thre2])
    cut2=max([thre1,thre2])
    for item in df:
        if item<cut1: newdf+=[0]
        elif (item<cut2) and (item>=cut1):newdf+=[1]
        else: newdf+=[2]
    return newdf

In [7]:
# import algorithms for model comparisons
# tree models
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# scaling of the features
from sklearn.preprocessing import MinMaxScaler

## Benchmark Model: Random Forest> + SMOTE Upsampling high votes to same as medium votes 

In [8]:
udf_X=udf[usecolsX+['votes']]
udf_y=udf[usecoly1]
y=get_class(udf_y.values,80,300)

In [93]:
# first spare the training data for upsampling
X_train, X_test, y_train, y_test = train_test_split(udf_X, y, test_size=0.3, stratify=y, random_state=12)
print("in the training data:")
print("number of low votes:",list(y_train).count(0))
print("number of medium votes:",list(y_train).count(1))
print("number of high votes:",list(y_train).count(2))
print("in the testing data:")
print("number of low votes:",list(y_test).count(0))
print("number of medium votes:",list(y_test).count(1))
print("number of high votes:",list(y_test).count(2))
# do upsample with replacement
from sklearn.utils import resample
# separate three classes
# add back the 'votes' column, and redo the sampling
mask0=[True if x==0 else False for x in y_train]
mask1=[True if x==1 else False for x in y_train]
mask2=[True if x==2 else False for x in y_train]
class0=X_train.loc[mask0]
class1=X_train.loc[mask1]
class2=X_train.loc[mask2]

in the training data:
number of low votes: 22128
number of medium votes: 741
number of high votes: 147
in the testing data:
number of low votes: 9483
number of medium votes: 318
number of high votes: 63


In [94]:
f1_rec=[]
prec_rec=[]
recal_rec=[]
max_f=len(udf_X.columns)

#### SMOTE upsample of the high votes: upsample high votes to be same as medium (3.5 times more than original)

In [96]:
from imblearn.over_sampling import SMOTE 
from collections import Counter

In [97]:
# combine the class1 and class2 to dataframe and then input to SMOTE:
tmpdf=pd.concat([class1,class2])
# re-shuffle the order
tmpdf=tmpdf.sample(frac=1)

y_new=get_class(tmpdf['votes'].values,80,300)

In [98]:
X_resampled, y_resampled = SMOTE().fit_resample(tmpdf, y_new)

In [99]:
print(sorted(Counter(y_resampled).items()))

[(1, 741), (2, 741)]


In [100]:
df_12_tol = pd.DataFrame(X_resampled, columns=usecolsX+['votes'])
# add back the dataframes and reshuffle
ndf=pd.concat([df_12_tol,class0])
ndf=ndf.sample(frac=1)

In [101]:
# new y-class
yy=get_class(ndf['votes'].values,80,300)
ndf=ndf[usecolsX]

In [102]:
print("increase sample size on the percentage:",(len(ndf)-len(X_train))/len(X_train)*100,"%")

increase sample size on the percentage: 2.5808133472367047 %


In [131]:
print("in the upsampled training data:")
print("number of low votes:",list(yy).count(0))
print("number of medium votes:",list(yy).count(1))
print("number of high votes:",list(yy).count(2),",compared with original:",list(y).count(2))
print("number of features in X:",len(ndf.columns))

in the upsampled training data:
number of low votes: 22128
number of medium votes: 741
number of high votes: 741 ,compared with original: 210
number of features in X: 25


In [104]:
# redo the benchmark model
scalar = MinMaxScaler()
scalar.fit(ndf)
X = scalar.transform(ndf)

In [105]:
# spare the X_test with proper versions
X_test=X_test[usecolsX]
X_test=scalar.transform(X_test)


In [106]:
prec_rec=[]
recal_rec=[]
wf1_rec=[]

In [38]:
classifier0=RandomForestClassifier(n_estimators=90,random_state=12,max_features=int(max_f*0.75),verbose=0)
classifier0.fit(X,yy)
pred_y_test=classifier0.predict(X_test)
# assess the model performance: obtain prescision/recall/f1 scores
a=precision_recall_fscore_support(y_test, pred_y_test)
f1s=a[2]
prec_rec+=[a[0]]
recal_rec+=[a[1]]
# create the weighted f1 score: larger emphasis on high-votes bin and medium
# adjusted f1_forall= (f1_low+f1_mid*2+f1_high*3)/6
adjf1=(f1s[0]+f1s[1]*2+f1s[2]*3)/6
print("weighted f1 score:",adjf1)
wf1_rec+=[adjf1]

weighted f1 score: 0.4916849365684099


In [39]:
# print confusion matrix-- testing
print("confusion matrix:",confusion_matrix(y_test, pred_y_test))
print("other metrics:",classification_report(y_test, pred_y_test))

confusion matrix: [[9438   32   13]
 [ 206  105    7]
 [  31   14   18]]
other metrics:               precision    recall  f1-score   support

           0       0.98      1.00      0.99      9483
           1       0.70      0.33      0.45       318
           2       0.47      0.29      0.36        63

    accuracy                           0.97      9864
   macro avg       0.71      0.54      0.60      9864
weighted avg       0.96      0.97      0.96      9864



In [43]:
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
print(classifier0.get_params())

Parameters currently in use:

{'bootstrap': True, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 19, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 90, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}


#### Start Hyper-parameter tunning of the RF model:
##### * target parameters: (1) n_estimator: number of trees
#####                      (2) criterion: gini or entropy
#####                      (3) max_depth: The maximum depth of the tree
#####                      (4) min_samples_split: 2 or above , min number to split
#####                      (5) min_samples_leaf : 1 or above, min number on per leaf
#####                      (6) max_features: maximum feature number to randomly use per split, 25 or below
#####                      (7) bootstrap : Whether bootstrap samples are used when building trees, True/False

In [44]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

In [121]:
# Create dictionary hyperparameter candidates
n_est=[90,120,160,200,300]
crit=['gini','entropy']
max_d=[5,15,25,40,100]
min_sl=[1,5,9,15,35]
min_ss=[2,8,15,35]
max_f=[5,10,15,20,25]
bots = [True, False]

# Create the random grid
hpparam = {'n_estimators': n_est,
           'criterion': crit,
           'max_features': max_f,
           'max_depth': max_d,
           'min_samples_leaf': min_sl,
           'min_samples_split': min_ss,
           'bootstrap': bots}
print(hpparam)

{'n_estimators': [90, 120, 160, 200, 300], 'criterion': ['gini', 'entropy'], 'max_features': [10, 15, 20, 25], 'max_depth': [12, 15, 18, 20, 22, 25], 'min_samples_leaf': [1, 3, 5, 7, 9], 'min_samples_split': [2, 5, 8, 10, 20], 'bootstrap': [True, False]}


In [113]:
# get valid scorers , check options:
from sklearn import metrics
print(sorted(metrics.SCORERS.keys()))

['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']


In [145]:
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = hpparam, scoring='f1_micro',n_iter = 120, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [146]:
rf_random.fit(X,yy)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   49.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed: 15.3min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [149]:
rf_random.best_params_

{'n_estimators': 120,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 10,
 'max_depth': 22,
 'criterion': 'gini',
 'bootstrap': False}

In [127]:
result=rf_random.cv_results_

In [142]:
rslt_df=pd.DataFrame(result)

In [143]:
rslt_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,25.409412,1.165219,0.200134,0.014575,90,20,1,20,18,gini,False,"{'n_estimators': 90, 'min_samples_split': 20, ...",0.960864,0.96582,0.960102,0.962262,0.002535,100
1,14.685415,0.345324,0.208441,0.026994,90,8,9,20,15,gini,True,"{'n_estimators': 90, 'min_samples_split': 8, '...",0.959848,0.963914,0.963278,0.962346,0.001786,97
2,15.693385,0.398346,0.208117,0.01502,90,10,1,25,22,entropy,True,"{'n_estimators': 90, 'min_samples_split': 10, ...",0.965184,0.966836,0.965184,0.965735,0.000779,21
3,12.243274,0.203947,0.288895,0.023209,120,10,5,15,22,entropy,True,"{'n_estimators': 120, 'min_samples_split': 10,...",0.963405,0.966201,0.965947,0.965184,0.001262,38
4,72.730021,5.825824,0.638295,0.090327,300,8,7,25,25,entropy,False,"{'n_estimators': 300, 'min_samples_split': 8, ...",0.95324,0.951715,0.952986,0.952647,0.000667,119


In [144]:
# save the randomize grid search cv result
rslt_df.to_csv('RF_randgridsearchcv_result.csv')

In [150]:
# use the best parameters , compare performance
#classifier1=RandomForestClassifier(n_estimators=120,random_state=12,max_features=10,max_depth=22,bootstrap=False,min_samples_split=5,verbose=0)
#classifier1.fit(X,yy)
pred_y_test=rf_random.best_estimator_.predict(X_test)
# assess the model performance: obtain prescision/recall/f1 scores
a=precision_recall_fscore_support(y_test, pred_y_test)
f1s=a[2]
#prec_rec+=[a[0]]
#recal_rec+=[a[1]]
# create the weighted f1 score: larger emphasis on high-votes bin and medium
# adjusted f1_forall= (f1_low+f1_mid*2+f1_high*3)/6
adjf1=(f1s[0]+f1s[1]*2+f1s[2]*3)/6
print("weighted f1 score:",adjf1)
#wf1_rec+=[adjf1]

weighted f1 score: 0.05314315749646595


In [132]:
print("in the testing data:")
print("number of low votes:",list(y_test).count(0))
print("number of medium votes:",list(y_test).count(1))
print("number of high votes:",list(y_test).count(2))
print("number of features in X:",len(ndf.columns))

in the testing data:
number of low votes: 9483
number of medium votes: 318
number of high votes: 63
number of features in X: 25


In [129]:
# print confusion matrix-- testing
print("confusion matrix:",confusion_matrix(y_test, pred_y_test))
print("other metrics:",classification_report(y_test, pred_y_test))

confusion matrix: [[9434   38   11]
 [ 207  104    7]
 [  32   13   18]]
other metrics:               precision    recall  f1-score   support

           0       0.98      0.99      0.98      9483
           1       0.67      0.33      0.44       318
           2       0.50      0.29      0.36        63

    accuracy                           0.97      9864
   macro avg       0.72      0.54      0.60      9864
weighted avg       0.96      0.97      0.96      9864



In [133]:
# Test the different scalars on the features
# (1) MinMaxScalar
# (2) StandardScalar
# (3) RobustScalar
from sklearn.preprocessing import StandardScaler,RobustScaler


In [134]:
X_train, X_test, y_train, y_test = train_test_split(udf_X, y, test_size=0.3, stratify=y, random_state=12)
# redo the benchmark model
scalar1 = StandardScaler()
scalar1.fit(ndf)
X1 = scalar1.transform(ndf)

In [135]:
# spare the X_test with proper versions
X_test=X_test[usecolsX]
X_test1=scalar1.transform(X_test)

In [136]:
# use the best parameters , compare performance
classifier2=RandomForestClassifier(n_estimators=120,random_state=12,max_features=10, max_depth=22,min_samples_split=5,bootstrap=False,verbose=0)
classifier2.fit(X1,yy)
pred_y_test=classifier2.predict(X_test1)
# assess the model performance: obtain prescision/recall/f1 scores
a=precision_recall_fscore_support(y_test, pred_y_test)
f1s=a[2]
prec_rec+=[a[0]]
recal_rec+=[a[1]]
# create the weighted f1 score: larger emphasis on high-votes bin and medium
# adjusted f1_forall= (f1_low+f1_mid*2+f1_high*3)/6
adjf1=(f1s[0]+f1s[1]*2+f1s[2]*3)/6
print("weighted f1 score:",adjf1)
wf1_rec+=[adjf1]

weighted f1 score: 0.4945016196986005


In [137]:
# print confusion matrix-- testing
print("confusion matrix:",confusion_matrix(y_test, pred_y_test))
print("other metrics:",classification_report(y_test, pred_y_test))

confusion matrix: [[9440   36    7]
 [ 208  104    6]
 [  34   12   17]]
other metrics:               precision    recall  f1-score   support

           0       0.98      1.00      0.99      9483
           1       0.68      0.33      0.44       318
           2       0.57      0.27      0.37        63

    accuracy                           0.97      9864
   macro avg       0.74      0.53      0.60      9864
weighted avg       0.96      0.97      0.96      9864



In [138]:
### Test the Robust Scalar
scalar2 = RobustScaler()
scalar2.fit(ndf)
X2 = scalar2.transform(ndf)
X_test2=scalar2.transform(X_test)

In [139]:
# use the bechmark parameters , compare performance
classifier3=RandomForestClassifier(n_estimators=120,random_state=12,max_features=10, max_depth=22,min_samples_split=5,bootstrap=False,verbose=0)
classifier3.fit(X2,yy)
pred_y_test=classifier3.predict(X_test2)
# assess the model performance: obtain prescision/recall/f1 scores
a=precision_recall_fscore_support(y_test, pred_y_test)
f1s=a[2]
prec_rec+=[a[0]]
recal_rec+=[a[1]]
# create the weighted f1 score: larger emphasis on high-votes bin and medium
# adjusted f1_forall= (f1_low+f1_mid*2+f1_high*3)/6
adjf1=(f1s[0]+f1s[1]*2+f1s[2]*3)/6
print("weighted f1 score:",adjf1)
wf1_rec+=[adjf1]

weighted f1 score: 0.4927467704889388


In [140]:
# print confusion matrix-- testing
print("confusion matrix:",confusion_matrix(y_test, pred_y_test))
print("other metrics:",classification_report(y_test, pred_y_test))

confusion matrix: [[9438   38    7]
 [ 209  103    6]
 [  34   12   17]]
other metrics:               precision    recall  f1-score   support

           0       0.97      1.00      0.98      9483
           1       0.67      0.32      0.44       318
           2       0.57      0.27      0.37        63

    accuracy                           0.97      9864
   macro avg       0.74      0.53      0.60      9864
weighted avg       0.96      0.97      0.96      9864

