In [1]:
# Import library

import pandas as pd 
import numpy as np
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

## Below cell: read the cleaned up dataframe

In [2]:
df = pd.read_csv('./data/processed_data(by_quarter).csv')
df.drop(['Unnamed: 0', 'usd_pledged'], axis = 1, inplace = True)
df.head()

Unnamed: 0,duration,goal_usd,blurb_length,name_length,success,US based,main_category_comics,main_category_crafts,main_category_dance,main_category_design,...,main_category_games,main_category_journalism,main_category_music,main_category_photography,main_category_publishing,main_category_technology,main_category_theater,start_Q_Q2,start_Q_Q3,start_Q_Q4
0,0.163043,1.5e-05,0.382353,0.230769,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.315217,3e-05,0.676471,0.269231,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.315217,9e-06,0.588235,0.230769,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.478261,2.7e-05,0.411765,0.192308,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.641304,0.000232,0.411765,0.115385,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [3]:
df.shape

(170730, 23)

## Below cell: read the raw dataframe so that there's columns for Main and Sub category

In [4]:
df_raw = pd.read_csv('./data/Kickstarter_projects_Feb19.csv')
df_raw.head()

Unnamed: 0,id,name,currency,main_category,sub_category,launched_at,deadline,duration,goal_usd,city,state,country,blurb_length,name_length,status,start_month,end_month,start_Q,end_Q,usd_pledged
0,1687733153,Socks of Speed and Socks of Elvenkind,USD,games,Tabletop Games,2018-10-30 20:00:02,2018-11-15 17:59:00,16.0,2000.0,Menasha,WI,US,14,7,successful,10,11,Q4,Q4,6061.0
1,227936657,Power Punch Boot Camp: An All-Ages Graphic Novel,GBP,comics,Comic Books,2018-08-06 10:00:43,2018-09-05 10:00:43,30.0,3870.99771,Shepperton,England,GB,24,8,successful,8,9,Q3,Q3,3914.50512
2,454186436,"Live Printing with SX8: ""Squeegee Pulp Up""",USD,fashion,Apparel,2017-06-09 15:41:03,2017-07-09 15:41:03,30.0,1100.0,Manhattan,NY,US,21,7,successful,6,7,Q2,Q3,1110.0
3,629469071,Lost Dog Street Band's Next Album,USD,music,Country & Folk,2014-09-25 18:46:01,2014-11-10 06:00:00,45.0,3500.0,Nashville,TN,US,15,6,successful,9,11,Q3,Q4,4807.0
4,183973060,"Qto-X, a Tiny Lantern",USD,technology,Gadgets,2016-11-28 16:35:11,2017-01-27 16:35:11,60.0,30000.0,Troy,MI,US,15,4,successful,11,1,Q4,Q1,40368.0


### Below cell: Dropping duplicates from raw dataframe so the rows from cleanup dataframe and raw dataframe match up

In [5]:
df_raw.drop_duplicates(keep='first', inplace = True)

## Below cell:
### 1. create a new column called main_sub_category that is a merge of main_category and sub_category
### 2. OneHotEncode the "main_sub_category" column
### 3. Name the newly encoded columns (169 of them) by OHE.get_feature_names()
### 4. Merge the encoded columns with the cleanup dateframe from Stephy and Sanjay

In [6]:
# Making a new column from main_category and sub_category,  this way, we preserve the uniqueness of each sub_category, since 'Comedy' sub_category appears in 2 main_category(s)
df_raw['main_sub_category'] = df_raw['main_category'] + ' ' + df_raw['sub_category']

from sklearn.preprocessing import OneHotEncoder
OHE = OneHotEncoder(sparse = False)   # add drop = 'first'  if you wanna drop first encoded column
x = OHE.fit_transform(df_raw[['main_sub_category']])

# Below, I am naming the new encoded columns using the OHE.get_feature_names() method since in the documentation example, the appearance of the encoded columns are sorted, and get_feature_names is also sorted, and the dimension matches up.

df_one_hot_cat = pd.DataFrame(x, columns = OHE.get_feature_names())
df_final = pd.concat([df,df_one_hot_cat], axis = 1)

In [7]:
df_final.head()

Unnamed: 0,duration,goal_usd,blurb_length,name_length,success,US based,main_category_comics,main_category_crafts,main_category_dance,main_category_design,...,x0_technology Wearables,x0_technology Web,x0_theater Comedy,x0_theater Experimental,x0_theater Festivals,x0_theater Immersive,x0_theater Musical,x0_theater Plays,x0_theater Spaces,x0_theater Theater
0,0.163043,1.5e-05,0.382353,0.230769,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.315217,3e-05,0.676471,0.269231,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.315217,9e-06,0.588235,0.230769,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.478261,2.7e-05,0.411765,0.192308,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.641304,0.000232,0.411765,0.115385,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
df.columns

Index(['duration', 'goal_usd', 'blurb_length', 'name_length', 'success',
       'US based', 'main_category_comics', 'main_category_crafts',
       'main_category_dance', 'main_category_design', 'main_category_fashion',
       'main_category_film & video', 'main_category_food',
       'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater', 'start_Q_Q2', 'start_Q_Q3', 'start_Q_Q4'],
      dtype='object')

## Below cell: Dropping the columns prefixed by "main_category_" so that there's only main_sub_category columns left for analysis

In [9]:
df_final = df_final.drop(['main_category_comics', 'main_category_crafts',
       'main_category_dance', 'main_category_design', 'main_category_fashion',
       'main_category_film & video', 'main_category_food',
       'main_category_games', 'main_category_journalism',
       'main_category_music', 'main_category_photography',
       'main_category_publishing', 'main_category_technology',
       'main_category_theater'], axis = 1)

## Below cell: No changes

In [10]:
## Define function for performance result

# Function to print KFold Cross validation performance on train set 
def KFoldresult_5fold(model, x_train, y_train, is_logreg):
    accuracy = cross_val_score (model, x_train,y_train, cv=5)
    print (model)
    print (f'KFolds cross validation: \n {accuracy} \n')
    print (f'Mean accuracy: \n {accuracy.mean()}\n')
    print ('Coefficient of feature: \n' )
    if is_logreg:
        for index, co in enumerate(model.coef_[0]):
            print (f'Feature {index}: {co:.5f}')
    else:
        for index, co in enumerate(model.feature_importances_):
            print (f'Feature {index}: {co:.5f}')
    return accuracy

# Function to return prediction and print prediction result on test set 
def predictionresult(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print (f'Confusion_matrix: \n {confusion_matrix(y_test, y_pred)} \n')
    print (f'Classification report: \n {classification_report(y_test,y_pred)} \n')
    return y_pred

# Function to print out Grid Search parameters: 
def gridsearch(model, parameters, X_train, y_train):
    search = GridSearchCV(model, parameters, n_jobs=-1)
    search.fit(x_train,y_train)
    print(f'Parameter tested: {parameters}')
    print(f'Best Score : {search.best_score_}')
    print(f'Best parameters: {search.best_params_}')
    return search

def performace(y_ture, y_pred):
    return [accuracy_score(y_ture, y_pred), recall_score(y_ture, y_pred), precision_score(y_ture, y_pred), f1_score(y_ture, y_pred)]

## Below cell: splitting the "new" dataframe into train and test sets

In [11]:
# Split train set and test set 
y_sub = df_final['success']
x_sub = df_final.drop('success',axis=1)
x_train_sub, x_test_sub, y_train_sub, y_test_sub = train_test_split(x_sub,y_sub,train_size = 0.8, random_state = 42) #shuffle = False that means no random 

In [12]:
# Split train set and test set 
y = df['success']
x = df.drop('success',axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.8, random_state = 42) #shuffle = False that means no random 

## Logistic Regression (log)
#### -Develop model

## Below cell: Log Regression with main_sub_category

In [13]:
# Create model -log
log = LogisticRegression()
log.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(log, x_train_sub, y_train_sub, True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression()
KFolds cross validation: 
 [0.74894754 0.75454845 0.74803236 0.75180291 0.75281886] 

Mean accuracy: 
 0.7512300239327945

Coefficient of feature: 

Feature 0: -2.10884
Feature 1: -3.14026
Feature 2: -0.43843
Feature 3: 2.57165
Feature 4: 0.18005
Feature 5: -0.01548
Feature 6: -0.13093
Feature 7: -0.05568
Feature 8: 4.42095
Feature 9: -0.53131
Feature 10: -0.90006
Feature 11: -1.02677
Feature 12: 2.73056
Feature 13: -0.40346
Feature 14: -0.67757
Feature 15: -0.46153
Feature 16: -0.57652
Feature 17: 0.26309
Feature 18: -0.78786
Feature 19: -1.01899
Feature 20: -1.25054
Feature 21: 0.80822
Feature 22: 5.38521
Feature 23: 3.88129
Feature 24: -0.87948
Feature 25: 0.07405
Feature 26: -0.00248
Feature 27: -2.15667
Feature 28: 4.75094
Feature 29: -1.48738
Feature 30: -1.72681
Feature 31: -1.58544
Feature 32: -1.16926
Feature 33: -0.41672
Feature 34: -0.24667
Feature 35: -1.58418
Feature 36: -1.78386
Feature 37: -0.91299
Feature 38: -0.35409
Feature 39: -0.97545
Feature 40

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0.74894754, 0.75454845, 0.74803236, 0.75180291, 0.75281886])

## Below cell: Log Regression with main_categories only

In [14]:
# Create model -log
log = LogisticRegression()
log.fit (x_train,y_train)

KFoldresult_5fold(log, x_train, y_train, True)

LogisticRegression()
KFolds cross validation: 
 [0.6541714  0.6577223  0.65995534 0.66555625 0.65708742] 

Mean accuracy: 
 0.658898540011138

Coefficient of feature: 

Feature 0: -2.33870
Feature 1: -7.30428
Feature 2: -0.69410
Feature 3: 3.07711
Feature 4: 0.12142
Feature 5: 1.15119
Feature 6: -0.56764
Feature 7: 1.15936
Feature 8: 0.20551
Feature 9: -0.11058
Feature 10: 0.20994
Feature 11: -1.09012
Feature 12: 0.16395
Feature 13: -1.04273
Feature 14: 0.45077
Feature 15: -0.52420
Feature 16: 0.55926
Feature 17: -0.85593
Feature 18: 0.31217
Feature 19: -0.02090
Feature 20: -0.12587
Feature 21: -0.00681


array([0.6541714 , 0.6577223 , 0.65995534, 0.66555625, 0.65708742])

## Below cell: Results with sub_category

In [15]:
log_y_predict_sub = predictionresult(log, x_test_sub, y_test_sub)

ValueError: X has 177 features per sample; expecting 22

## Below cell: Results with main_category only

In [None]:
log_y_predict = predictionresult(log, x_test, y_test)

#### -Optimizing hyperparameters

## Below cell: optimizing the regressor trained by the sub_category dataframe

In [None]:
param = {'C':np.linspace(0.1,1,10), 'penalty': ['l1', 'l2']} 
gridsearch (log, param, x_train_sub, y_train_sub)

## Below cell: no changes

In [None]:
param = {'C':np.linspace(0.1,1,10), 'penalty': ['l1', 'l2']} 
gridsearch (log, param, x_train, y_train)

#### Apply the best parameters {'C': 1.0, 'penalty': 'l2'} 
 - best parameters is same as default parameters for model 'log'

## Random Forest Classifier (ranforest)
#### -Develop model 

## Below cell: Random Forest with the sub_categories

In [None]:
ranforest = RandomForestClassifier(random_state = 42, n_jobs=-1)
ranforest.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(ranforest, x_train_sub, y_train_sub, False)

## Below cell: Random Forest with the main_categories

In [None]:
ranforest = RandomForestClassifier(random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)

## Below cell: Random Forest result with sub_categories

In [None]:
ranforest_y_predict = predictionresult(ranforest, x_test_sub, y_test_sub)

## Below cell: Random Forest result with main_categories

In [None]:
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

#### -Optimizing Hyperparameters

## Below cell: Random Forest optimizing with sub_categories

In [None]:
param = {'max_depth': list(range(1,5))}
gridsearch (ranforest, param, x_train_sub, y_train_sub)

## Below cell: Random Forest optimizing with main_categories

In [None]:
param = {'max_depth': list(range(1,5))}
gridsearch (ranforest, param, x_train, y_train)

#### Apply the best parameters {'max_depth': 4}

## Below cell: Applying to sub_categories

In [None]:
ranforest = RandomForestClassifier(max_depth=4, random_state = 42, n_jobs=-1)
ranforest.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(ranforest, x_train_sub, y_train_sub, False)
ranforest_y_predict = predictionresult(ranforest, x_test_sub, y_test_sub)

## Below cell: Applying to main_categories

In [None]:
ranforest = RandomForestClassifier(max_depth=4, random_state = 42, n_jobs=-1)
ranforest.fit (x_train,y_train)

KFoldresult_5fold(ranforest, x_train, y_train, False)
ranforest_y_predict = predictionresult(ranforest, x_test, y_test)

## XG Boost
#### -Develop model

## XG Boost with sub_categories

In [None]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1 )
xgmodel.fit (x_train_sub,y_train_sub)

KFoldresult_5fold(xgmodel, x_train_sub, y_train_sub, False)

## XG Boost with main_categories

In [None]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1 )
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)

## Below cell: XG Boost results with sub_categories

In [None]:
xgmodel_y_predict = predictionresult(xgmodel, x_test_sub, y_test_sub)

## Below cell: XG Boost results with main_categories

In [None]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

#### -Optimizing Hyperparameters

## Optimizing with sub_categories (did not run, took took long)

In [None]:
param = {'max_depth': list(range(1,5))}
gridsearch (xgmodel, param, x_train_sub, y_train_sub)

## Optimizaing with main_categories

In [None]:
param = {'max_depth': list(range(1,5))}
gridsearch (xgmodel, param, x_train, y_train)

#### Apply the best parameters {'max_depth': 4}

In [None]:
xgmodel = XGBClassifier(use_label_encoder = False, eval_metric='mlogloss',n_jobs = -1, max_depth = 4)
xgmodel.fit (x_train,y_train)

KFoldresult_5fold(xgmodel, x_train, y_train, False)

## Below cell: Prediction results with sub_categories

In [None]:
xgmodel_y_predict = predictionresult(xgmodel, x_test_sub, y_test_sub)

## Below cell: Prediction results with main_categories

In [None]:
xgmodel_y_predict = predictionresult(xgmodel, x_test, y_test)

## did not touch the following codes

In [None]:
log_score = performace(y_test, log_y_predict)
rf_score = performace(y_test, ranforest_y_predict)
xg_score = performace(y_test, xgmodel_y_predict)

models_scores_table = pd.DataFrame({'Logistic Regression': log_score, 'Random Forest Classifier': rf_score, 'XGBoost':xg_score},
                                    index=['Accuracy', 'Recall', 'Precision', 'F1 Score'])

models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)

models_scores_table