In [1]:
# Import library

import pandas as pd 
import numpy as np
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('./data/processed_data(by_quarter).csv')
df.drop(['Unnamed: 0', "usd_pledged"], axis = 1, inplace = True)
df.head()

Unnamed: 0,duration,goal_usd,blurb_length,name_length,success,US based,main_category_comics,main_category_crafts,main_category_dance,main_category_design,...,main_category_games,main_category_journalism,main_category_music,main_category_photography,main_category_publishing,main_category_technology,main_category_theater,start_Q_Q2,start_Q_Q3,start_Q_Q4
0,0.163043,1.5e-05,0.382353,0.230769,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.315217,3e-05,0.676471,0.269231,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.315217,9e-06,0.588235,0.230769,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.478261,2.7e-05,0.411765,0.192308,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.641304,0.000232,0.411765,0.115385,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [3]:
## Define function for performance result

# Function to print KFold Cross validation performance on train set 
def KFoldresult_5fold(model, x_train, y_train, is_logreg):
    accuracy = cross_val_score (model, x_train,y_train, cv=5)
    print (model)
    print (f'KFolds cross validation: \n {accuracy} \n')
    print (f'Mean accuracy: \n {accuracy.mean()}\n')
    print ('Coefficient of feature: \n' )
    if is_logreg:
        for index, co in enumerate(model.coef_[0]):
            print (f'Feature {index}: {co:.5f}')
    else:
        for index, co in enumerate(model.feature_importances_):
            print (f'Feature {index}: {co:.5f}')
    return accuracy

# Function to return prediction and print prediction result on test set 
def predictionresult(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print (f'Confusion_matrix: \n {confusion_matrix(y_test, y_pred)} \n')
    print (f'Classification report: \n {classification_report(y_test,y_pred)} \n')
    return y_pred

# Function to print out Grid Search parameters: 
def gridsearch(model, parameters, X_train, y_train):
    search = GridSearchCV(model, parameters, n_jobs=-1)
    search.fit(x_train,y_train)
    print(f'Parameter tested: {parameters}')
    print(f'Best Score : {search.best_score_}')
    print(f'Best parameters: {search.best_params_}')
    return search

def performace(y_ture, y_pred):
    return [accuracy_score(y_ture, y_pred), recall_score(y_ture, y_pred), precision_score(y_ture, y_pred), f1_score(y_ture, y_pred)]

In [4]:
# Split train set and test set 
y = df['success']
x = df.drop('success',axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.8, random_state = 42) #shuffle = False that means no random 

## Logistic Regression (log)
#### -Develop model

In [5]:
# Create model -log
log = LogisticRegression()
log.fit (x_train,y_train)

KFoldresult_5fold(log, x_train, y_train, True)

LogisticRegression()
KFolds cross validation: 
 [0.6541714  0.6577223  0.65995534 0.66555625 0.65708742] 

Mean accuracy: 
 0.658898540011138

Coefficient of feature: 

Feature 0: -2.33870
Feature 1: -7.30428
Feature 2: -0.69410
Feature 3: 3.07711
Feature 4: 0.12142
Feature 5: 1.15119
Feature 6: -0.56764
Feature 7: 1.15936
Feature 8: 0.20551
Feature 9: -0.11058
Feature 10: 0.20994
Feature 11: -1.09012
Feature 12: 0.16395
Feature 13: -1.04273
Feature 14: 0.45077
Feature 15: -0.52420
Feature 16: 0.55926
Feature 17: -0.85593
Feature 18: 0.31217
Feature 19: -0.02090
Feature 20: -0.12587
Feature 21: -0.00681


array([0.6541714 , 0.6577223 , 0.65995534, 0.66555625, 0.65708742])

In [6]:
log_y_predict = predictionresult(log, x_test, y_test)

Confusion_matrix: 
 [[ 7384  7518]
 [ 4093 15151]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.64      0.50      0.56     14902
         1.0       0.67      0.79      0.72     19244

    accuracy                           0.66     34146
   macro avg       0.66      0.64      0.64     34146
weighted avg       0.66      0.66      0.65     34146
 



#### -Optimizing hyperparameters

In [7]:
param = {'C':np.linspace(0.1,1,10), 'penalty': ['l1', 'l2']} 
gridsearch (log, param, x_train, y_train)

Parameter tested: {'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]), 'penalty': ['l1', 'l2']}
Best Score : 0.658898540011138
Best parameters: {'C': 1.0, 'penalty': 'l2'}


GridSearchCV(estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ]),
                         'penalty': ['l1', 'l2']})

#### Apply the best parameters {'C': 1.0, 'penalty': 'l2'} 
 - best parameters is same as default parameters for model 'log'

## Random Forest Classifier (ranforest)
#### -Develop model 

In [8]:
ranforest = RandomForestClassifier(random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)

RandomForestClassifier(n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.66749643 0.66819197 0.67229198 0.67104733 0.66916825] 

Mean accuracy: 
 0.6696391929439767

Coefficient of feature: 

Feature 0: 0.15751
Feature 1: 0.33374
Feature 2: 0.21646
Feature 3: 0.12619
Feature 4: 0.01699
Feature 5: 0.00754
Feature 6: 0.00523
Feature 7: 0.00407
Feature 8: 0.00489
Feature 9: 0.00527
Feature 10: 0.00791
Feature 11: 0.01849
Feature 12: 0.00635
Feature 13: 0.00690
Feature 14: 0.00812
Feature 15: 0.00569
Feature 16: 0.00724
Feature 17: 0.01485
Feature 18: 0.00424
Feature 19: 0.01429
Feature 20: 0.01383
Feature 21: 0.01420


array([0.66749643, 0.66819197, 0.67229198, 0.67104733, 0.66916825])

In [9]:
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

Confusion_matrix: 
 [[ 8611  6291]
 [ 4956 14288]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.63      0.58      0.60     14902
         1.0       0.69      0.74      0.72     19244

    accuracy                           0.67     34146
   macro avg       0.66      0.66      0.66     34146
weighted avg       0.67      0.67      0.67     34146
 



#### -Optimizing Hyperparameters

In [10]:
param = {'max_depth': list(range(1,5))}
gridsearch (ranforest, param, x_train, y_train)

Parameter tested: {'max_depth': [1, 2, 3, 4]}
Best Score : 0.6631816046196805
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=-1, param_grid={'max_depth': [1, 2, 3, 4]})

#### Apply the best parameters {'max_depth': 4}

In [11]:
ranforest = RandomForestClassifier(max_depth=4, random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

RandomForestClassifier(max_depth=4, n_jobs=-1, random_state=42)
KFolds cross validation: 
 [0.66284731 0.66672768 0.66401874 0.66284731 0.65946698] 

Mean accuracy: 
 0.6631816046196805

Coefficient of feature: 

Feature 0: 0.12609
Feature 1: 0.33466
Feature 2: 0.00536
Feature 3: 0.11160
Feature 4: 0.00449
Feature 5: 0.03607
Feature 6: 0.00517
Feature 7: 0.01122
Feature 8: 0.00122
Feature 9: 0.00004
Feature 10: 0.00165
Feature 11: 0.15445
Feature 12: 0.00207
Feature 13: 0.02478
Feature 14: 0.02850
Feature 15: 0.00669
Feature 16: 0.03339
Feature 17: 0.11135
Feature 18: 0.00030
Feature 19: 0.00017
Feature 20: 0.00054
Feature 21: 0.00018
Confusion_matrix: 
 [[ 5897  9005]
 [ 2432 16812]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.71      0.40      0.51     14902
         1.0       0.65      0.87      0.75     19244

    accuracy                           0.67     34146
   macro avg       0.68      0.63      0.63     34146
weighte

## XG Boost
#### -Develop model

In [12]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1 )
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)

Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in 

array([0.69553758, 0.70176081, 0.70282242, 0.69993045, 0.69980964])

In [13]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[ 8385  6517]
 [ 3721 15523]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.69      0.56      0.62     14902
         1.0       0.70      0.81      0.75     19244

    accuracy                           0.70     34146
   macro avg       0.70      0.68      0.69     34146
weighted avg       0.70      0.70      0.69     34146
 



#### -Optimizing Hyperparameters

In [14]:
param = {'max_depth': list(range(1,5))}
gridsearch (xgmodel, param, x_train, y_train)

Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameter tested: {'max_depth': [1, 2, 3, 4]}
Best Score : 0.7001112816703223
Best parameters: {'max_depth': 4}


GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, eval_metric='mlogloss',
                                     gamma=0, gpu_id=-1, importance_type='gain',
                                     interaction_constraints='',
                                     learning_rate=0.300000012,
                                     max_delta_step=0, max_depth=6,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                                     n_estimators=100, n_jobs=-1,
                                     num_parallel_tree=1, random_state=0,
                                     reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, subsample=1,
                                     tree_method='exact',
                          

#### Apply the best parameters {'max_depth': 4}

In [15]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1, max_depth = 4)
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)

Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { use_label_encoder } might not be used.

  This may not be accurate due to some parameters are only used in 

array([0.6971483 , 0.70110188, 0.70036973, 0.70252956, 0.69940694])

In [16]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

Confusion_matrix: 
 [[ 8244  6658]
 [ 3599 15645]] 

Classification report: 
               precision    recall  f1-score   support

         0.0       0.70      0.55      0.62     14902
         1.0       0.70      0.81      0.75     19244

    accuracy                           0.70     34146
   macro avg       0.70      0.68      0.68     34146
weighted avg       0.70      0.70      0.69     34146
 



In [17]:
log_score = performace(y_test, log_y_predict)
rf_score = performace(y_test, ranforest_y_predict)
xg_score = performace(y_test, xgmodel_y_predict)

models_scores_table = pd.DataFrame({'Logistic Regression': log_score, 'Random Forest Classifier': rf_score, 'XGBoost':xg_score},
                                    index=['Accuracy', 'Recall', 'Precision', 'F1 Score'])

models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

models_scores_table

Unnamed: 0,Logistic Regression,Random Forest Classifier,XGBoost,Best Score
Accuracy,0.65996,0.665056,0.699613,XGBoost
Recall,0.78731,0.873623,0.812981,Random Forest Classifier
Precision,0.668358,0.651199,0.701475,XGBoost
F1 Score,0.722974,0.746189,0.753123,XGBoost
