In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

from sklearn.model_selection import RandomizedSearchCV as RSCV

import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
ksDf = pd.read_csv('cleaned_kickstarter_dataset.csv')
ksDf.head()

Unnamed: 0.1,Unnamed: 0,backers,usd_pledged_real,usd_goal_real,year,duration,main_category_Art,main_category_Comics,main_category_Crafts,main_category_Dance,...,main_category_Food,main_category_Games,main_category_Journalism,main_category_Music,main_category_Photography,main_category_Publishing,main_category_Technology,main_category_Theater,encoded_state,encoded_months
0,0,0.0,0.0,1533.95,2015.0,59.0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,1,15.0,2421.0,30000.0,2017.0,60.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
2,2,3.0,220.0,45000.0,2013.0,45.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
3,3,1.0,1.0,5000.0,2012.0,30.0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,7
4,5,224.0,52375.0,50000.0,2016.0,35.0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,3


### Dropping the unwanted columns from the Data, because it could affect the outcome

In [3]:
ksDf.drop('Unnamed: 0', axis = 1, inplace = True)

### Made a reusable function to find the score of the model

In [12]:
def findClassificationScore(model, paramXTest, paramYTest):
    yPred = model.predict(paramXTest)
    predDict = dict()
    
    print("Model: ", model)
    predDict['Model'] = model
    print("Balanced Accuracy Score: ", balanced_accuracy_score(paramYTest, yPred))
    predDict['Accuracy'] = balanced_accuracy_score(paramYTest, yPred)
    print("Precision Score: ", precision_score(paramYTest, yPred, average='macro'))
    predDict['Precision'] = precision_score(paramYTest, yPred, average='macro')
    print("Recall: ", recall_score(paramYTest, yPred, average = 'macro'))
    predDict['Recall'] = recall_score(paramYTest, yPred, average = 'macro')
    print("F1 Score: ", f1_score(paramYTest, yPred, average = 'macro'))
    predDict['F1'] = f1_score(paramYTest, yPred, average = 'macro')
    print("ROC AUC Score: ", roc_auc_score(paramYTest, yPred, multi_class = 'ovo'))
    predDict['ROCAUC'] = roc_auc_score(paramYTest, yPred, multi_class = 'ovo')
    print("\n")
    
    tn, fn, fp, tp = confusion_matrix(paramYTest, yPred).ravel()
    print("True Positive: ", tp)
    print("True Negative: ", tn)
    print("False Positive: ", fp)
    print("False Negative: ", fn)
    
    print("\n")
    return predDict

### Splitting Data for both training and testing, with the ratio of 80% training and 20% testing

In [5]:
cpDf = ksDf.copy()
features = cpDf.drop(['encoded_state'], axis = 1)
target = ksDf['encoded_state']

In [6]:
xtr, xts, ytr, yts = train_test_split(features, target, test_size = .2)

## Logistic Regression

### Parameter for Logistic Regression

In [7]:
penalty = ["l1", "l2", "elasticnet", "none"]
solver = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
max_iter = [1, 10, 100, 1000, 10000]

paramLore = {
    "penalty": penalty,
    "solver": solver,
    "max_iter" : max_iter
}

### Hyperparameter tuning for Logistic Regression

In [8]:
lore = LogisticRegression(multi_class = 'ovr')
modelLore = RSCV(estimator= lore, 
                             param_distributions= paramLore,
                             cv= 5)
modelLore.fit(xtr, ytr)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='ovr', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'max_iter': [1, 10, 100, 1000, 10000],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    'none'],
                                        'solver': ['newton-cg', 

### Using the best parameter for the model

In [13]:
best_lore = modelLore.best_estimator_
best_lore.fit(xtr, ytr)
best_lore_accuracy = findClassificationScore(best_lore, xts, yts)

Model:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)
Balanced Accuracy Score:  0.9993204640978532
Precision Score:  0.9989854963553018
Recall:  0.9993204640978532
F1 Score:  0.9991522340536301
ROC AUC Score:  0.9993204640978532


True Positive:  26560
True Negative:  39679
False Positive:  0
False Negative:  54




In [14]:
best_lore_accuracy

{'Model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='ovr', n_jobs=None, penalty='l2',
                    random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                    warm_start=False),
 'Accuracy': 0.9993204640978532,
 'Precision': 0.9989854963553018,
 'Recall': 0.9993204640978532,
 'F1': 0.9991522340536301,
 'ROCAUC': 0.9993204640978532}

## Decision Tree Classifier

### Parameter for Decision Tree Classifier

In [15]:
max_depths = np.linspace(1, 32, 32, endpoint=True)
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
max_features = list(range(1,xtr.shape[1]))
criterion = ["gini", "entropy"]

paramDTC = {
    'criterion' : criterion,
    'max_depth' : max_depths,
    'min_samples_split' : min_samples_splits,
    'min_samples_leaf' : min_samples_leafs,
    'max_features' : max_features
}

### Hyperparameter tuning for Decision Tree Classifier

In [16]:
dtc = DecisionTreeClassifier()

modelDTC = RSCV(estimator= dtc, 
                             param_distributions= paramDTC, 
                             cv= 5)
modelDTC.fit(xtr, ytr)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=None,
          

In [17]:
best_DTC = modelDTC.best_estimator_
best_DTC.fit(xtr, ytr)
best_DTC_accuracy = findClassificationScore(best_DTC, xts, yts)

Model:  DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=12.0, max_features=12, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.1, min_samples_split=0.1,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
Balanced Accuracy Score:  0.8507933348550369
Precision Score:  0.8377625026808309
Recall:  0.8507933348550369
F1 Score:  0.839368839597261
ROC AUC Score:  0.8507933348550369


True Positive:  23794
True Negative:  32014
False Positive:  2766
False Negative:  7719




## Random Forest Classifier

### Parameter for Random Forest Classifier

In [18]:
n_estimators = list(range(10, 100, 5))
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

paramRFC = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [19]:
RFC = RandomForestClassifier()
modelRFC = RSCV(estimator= RFC, 
                             param_distributions= paramRFC, 
                             cv= 5)
modelRFC.fit(xtr, ytr)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [20]:
best_RFC = modelRFC.best_estimator_
best_RFC.fit(xtr, ytr)
best_RFC_accuracy = findClassificationScore(best_RFC, xts, yts)

Model:  RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Balanced Accuracy Score:  0.9966598103144211
Precision Score:  0.9953044105719021
Recall:  0.9966598103144211
F1 Score:  0.9959694672836731
ROC AUC Score:  0.9966598103144209


True Positive:  26543
True Negative:  39493
False Positive:  17
False Negative:  240




### Dump the best model, using joblib

In [21]:
joblib.dump(best_lore, 'best_lore')
joblib.dump(best_DTC, 'best_DTC')
joblib.dump(best_RFC, 'best_RFC')

['best_RFC']

In [22]:
feature_imp = pd.DataFrame(best_RFC.feature_importances_, columns = ['Percentage'], index = features.columns)
feature_imp.sort_values(ascending=False, by='Percentage')

Unnamed: 0,Percentage
usd_pledged_real,0.386889
backers,0.33917
usd_goal_real,0.230637
duration,0.014368
year,0.00621
main_category_Technology,0.003864
main_category_Music,0.003432
encoded_months,0.003265
main_category_Design,0.002045
main_category_Games,0.001961
