In [None]:
import pandas as pd
pd.set_option('display.max_columns', 500)
from datetime import datetime
import numpy as np
import timeit
import json
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier




In [None]:
# Load the source CSV into main_df; the path is relative to the current working directory.
main_df = pd.read_csv('main_df_new.csv')

In [None]:
# Display only the first row of main_df.
main_df.head(1)

In [None]:
# Summary statistics of the pledged_to_goal_ratio column.
main_df['pledged_to_goal_ratio'].describe()

In [None]:
# Share of rows whose pledged_to_goal_ratio exceeds 0.5.  Taking the mean of
# the boolean mask avoids building a filtered copy of the frame and yields NaN
# (rather than a ZeroDivisionError) if main_df happens to be empty.
(main_df['pledged_to_goal_ratio'] > 0.5).mean()

In [None]:
# Rows with pledged_to_goal_ratio above 0.5 whose outcome_state is 'failed'.
main_df.loc[
    lambda frame: (frame['pledged_to_goal_ratio'] > 0.5)
    & (frame['outcome_state'] == 'failed')
]

In [None]:
# All rows with pledged_to_goal_ratio above 0.5, regardless of outcome.
main_df.loc[lambda frame: frame['pledged_to_goal_ratio'] > 0.5]

In [None]:
# Columns kept for modelling: the feature columns plus outcome_state, which is
# used as the target further down.
model_cols = ['usd_goal', 'pledged_to_goal_ratio', 'duration', 'staff_pick',
              'country', 'main_category', 'backers_count', 'description_len', 'outcome_state']
continuous_cols = ['usd_goal', 'duration', 'backers_count', 'pledged_to_goal_ratio', 'description_len']
categorical_cols = ['staff_pick', 'country', 'main_category']

# Every other column of main_df is dropped.  A comprehension (instead of a set
# difference) keeps drop_cols in main_df's column order, so the list is
# identical on every run instead of depending on set iteration order.
drop_cols = [col for col in main_df.columns if col not in model_cols]

In [None]:
# Build model_df from main_df by removing every column named in drop_cols,
# then show its first row.
model_df = main_df.drop(columns=drop_cols)
model_df.head(n=1)

In [None]:
# Encode the binary columns as numbers: outcome_state becomes 1 ('successful')
# or 0 ('failed'), staff_pick is cast to int.
# Both encodings are computed from main_df instead of from model_df itself so
# that re-running this cell is idempotent: mapping an already-encoded
# outcome_state a second time would turn every label into NaN.
# NOTE(review): any outcome_state other than 'successful'/'failed' maps to NaN
# -- confirm main_df only contains those two values.
model_df = model_df.assign(
    outcome_state=main_df['outcome_state'].map({'successful': 1, 'failed': 0}),
    staff_pick=main_df['staff_pick'].astype(int),
)


In [None]:
# Display the first three rows of model_df.
model_df.head(3)

In [None]:
# Function to dummy categorical variables
def dummy_df(df, todummy_list):
    """One-hot encode the given columns of a DataFrame.

    Each column named in ``todummy_list`` is removed from the frame and its
    indicator columns (named ``<column>_<value>``) are appended on the right,
    in the order the columns are listed.  Missing values do not get an
    indicator column of their own (``dummy_na=False``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``todummy_list``.
    todummy_list : iterable of str
        Names of the columns to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
        The encoded frame.  The caller's frame is not modified; with an empty
        ``todummy_list`` the input object itself is returned.
    """
    for x in todummy_list:
        dummies = pd.get_dummies(df[x], prefix=x, dummy_na=False)
        # Use the `columns=` keyword: passing the axis positionally
        # (`df.drop(x, 1)`) is rejected by pandas >= 2.0.
        df = df.drop(columns=x)
        df = pd.concat([df, dummies], axis=1)
    return df

In [None]:
# One-hot encode `country` plus any other feature column that is still
# non-numeric.  scikit-learn tree models need numeric input, and
# `main_category` is kept in model_cols without being encoded anywhere else;
# when every remaining feature is already numeric this encodes `country` only.
feature_cols = model_df.columns.drop('outcome_state')
non_numeric_cols = model_df[feature_cols].select_dtypes(exclude=['number', 'bool']).columns
todummy_cols = ['country'] + [col for col in non_numeric_cols if col != 'country']
dummy_model_df = dummy_df(model_df, todummy_cols)

# NOTE(review): pledged_to_goal_ratio and backers_count look like values that
# are only known once a campaign has ended and may leak outcome_state --
# confirm they are meant to be used as predictors.
X = dummy_model_df.drop(columns='outcome_state')
y = dummy_model_df['outcome_state']

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)

### Decision tree 

In [None]:
# Shallow decision tree (depth 2).  random_state is fixed (same seed as the
# train/test split) because the tree shuffles features at every split, so ties
# between equally good splits could otherwise be resolved differently per run.
tree_classifier = DecisionTreeClassifier(max_depth=2, random_state=60)
tree_classifier.fit(X_train, y_train)

In [None]:
# Predict the test-set labels with the fitted decision tree.
tree_classifier_pred = tree_classifier.predict(X_test)

# Test-set classification metrics, one labelled line per score.
print(f"Precision Score (Test): {precision_score(y_test, tree_classifier_pred)}")
print(f"Recall Score (Test): {recall_score(y_test, tree_classifier_pred)}")
print(f"accuracy Score (Test): {accuracy_score(y_test, tree_classifier_pred)}")
print(f"F1 Score (Test): {f1_score(y_test, tree_classifier_pred)}")


In [None]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence of str, optional
        Bar labels, in the same order as ``model.feature_importances_``.
        When omitted, the column names of the notebook-level ``X_train`` are
        used, which is what this function has always plotted.

    Returns
    -------
    None
        The chart is drawn on a new 8x8 matplotlib figure.
    """
    importances = model.feature_importances_
    if feature_names is None:
        # Backward-compatible default: label bars with the training columns.
        feature_names = X_train.columns.values
    n_features = len(importances)
    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')


In [None]:
# Bar chart of the decision tree's feature importances.
plot_feature_importances(tree_classifier)

### Bagged trees

In [None]:
# Bagging ensemble of 20 depth-2 gini trees.  random_state is fixed (same seed
# as the train/test split) so the bootstrap samples, and therefore the scores
# reported below, are reproducible across kernel restarts.
bagged_tree = BaggingClassifier(
    DecisionTreeClassifier(criterion='gini', max_depth=2),
    n_estimators=20,
    random_state=60,
)
bagged_tree.fit(X_train, y_train)


In [None]:
# Predict the test-set labels with the fitted bagging ensemble.
bagged_tree_pred = bagged_tree.predict(X_test)

# Test-set classification metrics, one labelled line per score.
print(f"Precision Score (Test): {precision_score(y_test, bagged_tree_pred)}")
print(f"Recall Score (Test): {recall_score(y_test, bagged_tree_pred)}")
print(f"accuracy Score (Test): {accuracy_score(y_test, bagged_tree_pred)}")
print(f"F1 Score (Test): {f1_score(y_test, bagged_tree_pred)}")


### Random Forest

In [None]:
# Random forest of 100 depth-2 trees.  random_state is fixed (same seed as the
# train/test split) so bootstrap sampling and feature sub-sampling, and hence
# the reported scores, are reproducible.
forest = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=60)
forest.fit(X_train, y_train)

In [None]:
# Predict the test-set labels with the fitted random forest.
forest_pred = forest.predict(X_test)


In [None]:
# Test-set classification metrics for the random forest, one labelled line
# per score.
print(f"Precision Score (Test): {precision_score(y_test, forest_pred)}")
print(f"Recall Score (Test): {recall_score(y_test, forest_pred)}")
print(f"accuracy Score (Test): {accuracy_score(y_test, forest_pred)}")
print(f"F1 Score (Test): {f1_score(y_test, forest_pred)}")

In [None]:
# Bar chart of the random forest's feature importances.
plot_feature_importances(forest)

In [None]:
# Side-by-side test accuracy of the single decision tree and the random
# forest, using the predictions computed in the cells above.
print(f'Tree accuracy_score: {accuracy_score(y_test, tree_classifier_pred)}')
print(f'Forest accuracy_score: {accuracy_score(y_test, forest_pred)}')

In [None]:
# dt_param_grid = {
#     'n_estimators':[10,100,500], 
#     'criterion': ['gini', 'entropy'],
#     'max_depth':[2,5,7],
#     'min_samples_split':[2, 5, 10],
#     'max_features':[3,7,10,20] }

In [None]:
# # Instantiate GridSearchCV
# forest_grid_search = GridSearchCV(forest, dt_param_grid, cv=3, return_train_score=True)

# # Fit to the data
# forest_grid_search.fit(X_train, y_train)

In [None]:
# dt_cv_score = cross_val_score(dt_clf, X_train, y_train, cv=3)
# mean_dt_cv_score = np.mean(dt_cv_score)

# print(f"Mean Cross Validation Score: {mean_dt_cv_score :.2%}")

In [None]:
# # Mean training score
# # grid_training_score = np.mean(dt_grid_search.cv_results_['mean_train_score'])

# # Mean test score
# grid_testing_score = forest_grid_search.score(X_test, y_test)

# # print(f"Mean Training Score: {dt_gs_training_score :.2%}")
# print(f"Mean Test Score: {grid_testing_score }")
# print("Best Parameter Combination Found During Grid Search:")
# forest_grid_search.best_params_

In [None]:
# Random forest with hand-set hyper-parameters.
# NOTE(review): every value below appears in the commented-out grid above, so
# this is presumably the best combination found by that search -- confirm.
# random_state is fixed (same seed as the train/test split) so the fit and the
# scores reported for it are reproducible.
forest_best = RandomForestClassifier(
    n_estimators=10,
    max_depth=5,
    max_features=20,
    min_samples_split=10,
    criterion='entropy',
    random_state=60,
)


In [None]:
# Train the tuned forest on the training split and, since fit() hands back the
# estimator itself, predict the test-set labels in the same expression.
forest_best_preds = forest_best.fit(X_train, y_train).predict(X_test)

In [None]:
# Test-set classification metrics for the tuned forest, one labelled line per
# score.  The predictions live in `forest_best_preds` (plural), the name
# assigned when the tuned forest was fitted.
print(f"Precision Score (Test): {precision_score(y_test, forest_best_preds)}")
print(f"Recall Score (Test): {recall_score(y_test, forest_best_preds)}")
print(f"accuracy Score (Test): {accuracy_score(y_test, forest_best_preds)}")
print(f"F1 Score (Test): {f1_score(y_test, forest_best_preds)}")