In [19]:
import pandas as pd
pd.set_option('display.max_columns', 500)
from datetime import datetime
import numpy as np
import timeit
import json
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression




In [2]:
# Load the project-level table; the path is relative to the notebook's working directory.
main_df = pd.read_csv('main_df.csv')

In [3]:
# Show a single row to inspect the available columns.
main_df.head(1)

Unnamed: 0,project_id,outcome_state,final_usd_pledged,final_backers_count,project_name,project_description,live_state,in_mid_duration_range,usd_goal,usd_pledged,currency,launched_at,deadline,staff_pick,country,main_category,duration,days_from_launch,backers_count,project_link,creator_link,pledged_to_goal_ratio,description_len
0,464921389,successful,5660.0,40,Good Fishermen Know A Lot About Sex,A musical dramedy about family and dealing wit...,live,True,5000.0,1575.0,USD,08-21-2019,10-11-2019,False,US,Theater,50,21,24,https://www.kickstarter.com/projects/213094288...,https://www.kickstarter.com/profile/2130942887,0.315,58


In [4]:
# Columns kept for modelling: the predictors plus the 'outcome_state' target.
model_cols = ['usd_goal', 'pledged_to_goal_ratio', 'duration', 'staff_pick',
              'country', 'main_category', 'backers_count', 'description_len', 'outcome_state']
# Groupings of the predictors by type (not referenced in the cells visible here).
continuous_cols = ['usd_goal', 'duration', 'backers_count', 'pledged_to_goal_ratio', 'description_len']
categorical_cols = ['staff_pick', 'country', 'main_category']

# Every main_df column that is not a modelling column. Built with a
# comprehension instead of a set difference so the list keeps main_df's column
# order and is identical from one kernel session to the next.
drop_cols = [col for col in main_df.columns if col not in model_cols]

In [5]:
# create model_df: main_df without the columns listed in drop_cols
model_df = main_df.drop(columns=drop_cols)
model_df.head(1)

Unnamed: 0,outcome_state,usd_goal,staff_pick,country,main_category,duration,backers_count,pledged_to_goal_ratio,description_len
0,successful,5000.0,False,US,Theater,50,24,0.315,58


In [6]:
# encoding binary variables
# The target is mapped from the untouched main_df rather than from model_df so
# this cell can be re-run safely: mapping model_df's already-encoded 0/1 column
# a second time would turn every value into NaN.
outcome_codes = {'successful': 1, 'failed': 0}
encoded_outcome = main_df['outcome_state'].map(outcome_codes)

# Fail loudly on labels outside the mapping instead of passing NaN targets on.
unmapped_states = main_df.loc[encoded_outcome.isna(), 'outcome_state'].unique()
if len(unmapped_states) > 0:
    raise ValueError(f"Unexpected outcome_state values: {list(unmapped_states)}")

model_df['outcome_state'] = encoded_outcome
model_df['staff_pick'] = model_df['staff_pick'].astype(int)


In [7]:
# Check the first rows after encoding 'outcome_state' and 'staff_pick'.
model_df.head(3)

Unnamed: 0,outcome_state,usd_goal,staff_pick,country,main_category,duration,backers_count,pledged_to_goal_ratio,description_len
0,1,5000.0,0,US,Theater,50,24,0.315,58
1,0,1000.0,0,US,Crafts,45,2,0.006,117
2,0,25000.0,0,US,Film & Video,45,2,8e-05,78


In [8]:
# Function to dummy categorical variables
def dummy_df(df, todummy_list):
    """Return a new frame with the listed columns one-hot encoded.

    Each column named in ``todummy_list`` is removed and its indicator columns
    (from ``pd.get_dummies``, named ``<column>_<value>``) are appended at the
    end of the frame. Missing values get no indicator column
    (``dummy_na=False``). The frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    todummy_list : iterable of str
        Names of the columns to replace with indicator columns.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by the indicator columns, in
        the order the names appear in ``todummy_list``.
    """
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        # Use the ``columns=`` keyword: passing ``axis`` positionally
        # (``df.drop(x, 1)``) was deprecated in pandas 1.3 and raises a
        # TypeError from pandas 2.0 onwards.
        df = pd.concat([df.drop(columns=x), dummies], axis=1)
    return df

In [9]:
# One-hot encode the two multi-level categoricals.
dummy_model_df = dummy_df(model_df, ['country', 'main_category'])

# Separate the predictors from the 'outcome_state' target.
X = dummy_model_df.drop(columns='outcome_state')
y = dummy_model_df['outcome_state']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=60
)

In [12]:
def plot_feature_importances(model):
    """Draw ``model.feature_importances_`` as a horizontal bar chart.

    Bars are labelled with the column names of the notebook-level ``X_train``,
    so the model is assumed to have been fitted on exactly those columns
    (not checked here).
    """
    feature_names = X_train.columns.values
    bar_positions = np.arange(len(feature_names))

    plt.figure(figsize=(8, 8))
    plt.barh(bar_positions, model.feature_importances_, align='center')
    plt.yticks(bar_positions, feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')


In [13]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix; rows are labelled 'True label' and columns
        'Predicted label' on the plot.
    classes : sequence
        Tick labels for both axes.
    normalize : bool, default False
        If True, divide each row by its sum before printing and plotting, and
        format cell values with two decimals instead of as integers.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Notes
    -----
    With ``normalize=True`` a row that sums to zero yields NaN cells, since
    the division is not guarded.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion Matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    # np.ndindex visits every (row, col) index pair of the matrix. It replaces
    # itertools.product, which raised NameError because the notebook never
    # imports itertools.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

## Logistic Regression

In [10]:
# The 'saga' solver shuffles the data while fitting, so a fixed random_state is
# needed for the fit (and every metric below) to be reproducible on re-run.
# NOTE(review): the features are passed in unscaled; consider standardising
# them, since saga may not converge within the default max_iter otherwise.
logreg = LogisticRegression(solver='saga', random_state=60)
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
# Class-label predictions for the held-out rows, using the model's own predict().
logreg_test_preds = logreg.predict(X_test)

In [24]:
# Limit the text display width to 100 characters.
pd.options.display.width = 100


In [28]:
# Use the full option name: the abbreviated 'max_colwidth' is resolved by
# pattern matching and would fail if it ever matched more than one option.
pd.set_option('display.max_colwidth', 100)

In [22]:
# Label a row positive only when its predicted probability for class 1
# exceeds the cut-off.
THRESHOLD = 0.5
positive_proba = logreg.predict_proba(X_test)[:, 1]
preds = (positive_proba > THRESHOLD).astype(int)

# Metric name -> scoring function, in the order the rows are displayed.
metric_fns = {
    "accuracy": accuracy_score,
    "recall": recall_score,
    "precision": precision_score,
    "F1": f1_score,
}
pd.DataFrame(data=[score(y_test, preds) for score in metric_fns.values()],
             index=list(metric_fns))

Unnamed: 0,0
accuracy,0.930372
recall,0.950178
precision,0.946809
F1,0.94849


In [23]:
# Same evaluation with a stricter cut-off on the class-1 probability.
THRESHOLD = 0.7
positive_proba = logreg.predict_proba(X_test)[:, 1]
preds = (positive_proba > THRESHOLD).astype(int)

# Metric name -> scoring function, in the order the rows are displayed.
metric_fns = {
    "accuracy": accuracy_score,
    "recall": recall_score,
    "precision": precision_score,
    "F1": f1_score,
}
pd.DataFrame(data=[score(y_test, preds) for score in metric_fns.values()],
             index=list(metric_fns))

Unnamed: 0,0
accuracy,0.876351
recall,0.845196
precision,0.967413
F1,0.902184


In [30]:
# Confusion matrix metrics for the model's default predictions, one per line.
for label, scorer in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("accuracy", accuracy_score),
    ("F1", f1_score),
):
    print(f"{label} Score (Test): {scorer(y_test, logreg_test_preds)}")


Precision Score (Test): 0.9468085106382979
Recall Score (Test): 0.9501779359430605
accuracy Score (Test): 0.9303721488595438
F1 Score (Test): 0.9484902309058614
