In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report

from sklearn.model_selection import RandomizedSearchCV as RSCV

import joblib
import warnings
warnings.filterwarnings('ignore')

In [19]:
ksDf = pd.read_csv('cleaned_kickstarter_dataset.csv')
ksDf.head()

Unnamed: 0,duration,goal_usd,name_length,status,main_category_art,main_category_comics,main_category_crafts,main_category_dance,main_category_design,main_category_fashion,...,country_IT,country_JP,country_LU,country_MX,country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,16.0,2000.0,7,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,30.0,3870.99771,8,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30.0,1100.0,7,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,45.0,3500.0,6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,60.0,30000.0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Dropping the unwanted columns from the Data, because it could affect the outcome

In [3]:
ksDf.drop('Unnamed: 0', axis = 1, inplace = True)

### Made a reusable function to find the score of the model

In [4]:
def findClassificationScore(model, paramXTest, paramYTest):
    yPred = model.predict(paramXTest)
    predDict = dict()
    
    print("Model: ", model)
    predDict['Model'] = model
    
    tn, fn, fp, tp = confusion_matrix(paramYTest, yPred).ravel()
    print("\nTrue Positive: ", tp)
    print("True Negative: ", tn)
    print("False Positive: ", fp)
    print("False Negative: ", fn)
    print("\nBalanced Accuracy Score: ", balanced_accuracy_score(paramYTest, yPred))
    predDict['bACC'] = balanced_accuracy_score(paramYTest, yPred)
    print("ROC AUC Score: ", roc_auc_score(paramYTest, yPred, multi_class = 'ovo'))
    predDict['ROCAUC'] = roc_auc_score(paramYTest, yPred, multi_class = 'ovo')
    print("Classification Report: \n", classification_report(paramYTest, yPred)) 
    print("\n")
    return predDict

### Splitting Data for both training and testing, with the ratio of 80% training and 20% testing

In [5]:
cpDf = ksDf.copy()
features = cpDf.drop(['status'], axis = 1)
target = ksDf['status']

In [6]:
xtr, xts, ytr, yts = train_test_split(features, target, test_size = .2)

## Logistic Regression

### Parameter for Logistic Regression

In [7]:
penalty = ["l1", "l2", "elasticnet", "none"]
solver = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
max_iter = [1, 10, 100, 1000, 10000]

paramLore = {
    "penalty": penalty,
    "solver": solver,
    "max_iter" : max_iter
}

### Hyperparameter tuning for Logistic Regression

In [8]:
lore = LogisticRegression(multi_class = 'ovr')
modelLore = RSCV(estimator= lore, 
                             param_distributions= paramLore,
                             cv= 5)
modelLore.fit(xtr, ytr)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=LogisticRegression(C=1.0, class_weight=None,
                                                dual=False, fit_intercept=True,
                                                intercept_scaling=1,
                                                l1_ratio=None, max_iter=100,
                                                multi_class='ovr', n_jobs=None,
                                                penalty='l2', random_state=None,
                                                solver='lbfgs', tol=0.0001,
                                                verbose=0, warm_start=False),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'max_iter': [1, 10, 100, 1000, 10000],
                                        'penalty': ['l1', 'l2', 'elasticnet',
                                                    'none'],
                                        'solver': ['newton-cg', 

### Using the best parameter for the model

In [9]:
best_lore = modelLore.best_estimator_
best_lore.fit(xtr, ytr)
best_lore_accuracy = findClassificationScore(best_lore, xts, yts)

Model:  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='ovr', n_jobs=None, penalty='none',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

True Positive:  20233
True Negative:  6326
False Positive:  3265
False Negative:  8686

Balanced Accuracy Score:  0.6412241103930769
ROC AUC Score:  0.6412241103930768
Classification Report: 
               precision    recall  f1-score   support

           0       0.66      0.42      0.51     15012
           1       0.70      0.86      0.77     23498

    accuracy                           0.69     38510
   macro avg       0.68      0.64      0.64     38510
weighted avg       0.68      0.69      0.67     38510





In [10]:
best_lore_accuracy

{'Model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=1000,
                    multi_class='ovr', n_jobs=None, penalty='none',
                    random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                    warm_start=False),
 'bACC': 0.6412241103930769,
 'ROCAUC': 0.6412241103930768,
 'ClassificationRep': '              precision    recall  f1-score   support\n\n           0       0.66      0.42      0.51     15012\n           1       0.70      0.86      0.77     23498\n\n    accuracy                           0.69     38510\n   macro avg       0.68      0.64      0.64     38510\nweighted avg       0.68      0.69      0.67     38510\n'}

## Decision Tree Classifier

### Parameter for Decision Tree Classifier

In [11]:
max_depths = np.linspace(1, 32, 32, endpoint=True)
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)
max_features = list(range(1,xtr.shape[1]))
criterion = ["gini", "entropy"]

paramDTC = {
    'criterion' : criterion,
    'max_depth' : max_depths,
    'min_samples_split' : min_samples_splits,
    'min_samples_leaf' : min_samples_leafs,
    'max_features' : max_features
}

### Hyperparameter tuning for Decision Tree Classifier

In [12]:
dtc = DecisionTreeClassifier()

modelDTC = RSCV(estimator= dtc, 
                             param_distributions= paramDTC, 
                             cv= 5)
modelDTC.fit(xtr, ytr)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort='deprecated',
                                                    random_state=None,
          

In [13]:
best_DTC = modelDTC.best_estimator_
best_DTC.fit(xtr, ytr)
best_DTC_accuracy = findClassificationScore(best_DTC, xts, yts)

Model:  DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=15.0, max_features=51, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.1,
                       min_samples_split=0.30000000000000004,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

True Positive:  19040
True Negative:  6315
False Positive:  4458
False Negative:  8697

Balanced Accuracy Score:  0.6154725976644848
ROC AUC Score:  0.6154725976644848
Classification Report: 
               precision    recall  f1-score   support

           0       0.59      0.42      0.49     15012
           1       0.69      0.81      0.74     23498

    accuracy                           0.66     38510
   macro avg       0.64      0.62      0.62     38510
weighted avg       0.65      0.66      0.64     38510





## Random Forest Classifier

### Parameter for Random Forest Classifier

In [14]:
n_estimators = list(range(10, 100, 5))
max_features = ['auto', 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

paramRFC = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [15]:
RFC = RandomForestClassifier()
modelRFC = RSCV(estimator= RFC, 
                             param_distributions= paramRFC, 
                             cv= 5)
modelRFC.fit(xtr, ytr)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [16]:
best_RFC = modelRFC.best_estimator_
best_RFC.fit(xtr, ytr)
best_RFC_accuracy = findClassificationScore(best_RFC, xts, yts)

Model:  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=65,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

True Positive:  19755
True Negative:  7874
False Positive:  3743
False Negative:  7138

Balanced Accuracy Score:  0.6826117850010287
ROC AUC Score:  0.6826117850010287
Classification Report: 
               precision    recall  f1-score   support

           0       0.68      0.52      0.59     15012
           1       0.73      0.84      0.78     23498

    accuracy                           0.72     38510
   macro avg       0.71      0.68

## Conclusion 

### Based on the overall score of all models, we could conclude that Random Forest Classifier is the best model. Conclusion were made based on the f1-score of both positive and negative value which Random Forest Classifier's value is higher.

### Dump the best model, using joblib

In [17]:
joblib.dump(best_RFC, 'best_RFC')

['best_RFC']

In [18]:
feature_imp = pd.DataFrame(best_RFC.feature_importances_, columns = ['Percentage'], index = features.columns)
feature_imp.sort_values(ascending=False, by='Percentage')[:5]

Unnamed: 0,Percentage
goal_usd,0.332745
duration,0.172791
name_length,0.126937
main_category_food,0.028289
main_category_technology,0.023946
