# Overfitting

In [17]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns 
sns.set(style="whitegrid")

from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer, accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import learning_curve

%matplotlib inline

In [2]:
data = pd.read_csv("ks-projects-201801.csv")
data.columns = data.columns.str.replace(' ', '_')
data.columns

Index(['ID', 'name', 'category', 'main_category', 'currency', 'deadline',
       'goal', 'launched', 'pledged', 'state', 'backers', 'country',
       'usd_pledged', 'usd_pledged_real', 'usd_goal_real'],
      dtype='object')

In [3]:
# Fixing a small entry error
data.country = data.country.replace(to_replace='N,0"', value='NO')

successdata = data[data.state == 'successful']
faildata = data[data.state == 'failed']
alldata = pd.concat([successdata, faildata])
print(alldata.head())

            ID                                               name  \
5   1000014025                               Monarch Espresso Bar   
6   1000023410  Support Solar Roasted Coffee & Green Energy!  ...   
11   100005484                                   Lisa Lim New CD!   
14  1000057089  Tombstone: Old West tabletop game and miniatur...   
18  1000070642                Mike Corey's Darkness & Light Album   

          category main_category currency    deadline     goal  \
5      Restaurants          Food      USD  2016-04-01  50000.0   
6             Food          Food      USD  2014-12-21   1000.0   
11      Indie Rock         Music      USD  2013-04-08  12500.0   
14  Tabletop Games         Games      GBP  2017-05-03   5000.0   
18           Music         Music      USD  2012-08-17    250.0   

               launched  pledged       state  backers country  usd_pledged  \
5   2016-02-26 13:38:27  52375.0  successful      224      US     52375.00   
6   2014-12-01 18:30:44   1205.0

In [4]:
# Change launched and deadline to datetime format and create a new column called duration

alldata.launched = pd.to_datetime(alldata.launched, format='%Y-%m-%d %H:%M:%S')
alldata.deadline = pd.to_datetime(alldata.deadline, format='%Y-%m-%d %H:%M:%S')

alldata['duration']=alldata['deadline']-alldata['launched']
alldata.duration = alldata.duration.dt.days

alldata.drop_duplicates()
alldata.isnull().sum()
    
alldata.drop(columns=['ID', 'category', 'currency', 'deadline', 'goal', 'launched', 'name', 'pledged', 'usd_pledged'], 
              inplace=True)

alldata.isnull().sum()

main_category       0
state               0
backers             0
country             0
usd_pledged_real    0
usd_goal_real       0
duration            0
dtype: int64

In [9]:
# Create a function to make and fit a classifier, predict y_values, and build some accuracy metrics 

def RandomForest(randomState, X_train, X_test, y_train, y_test):
    # Creation and fit
    global classifier
    #global cm
    #global y_pred
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)
    
    # Prediction
    #y_pred = classifier.predict(X_test)
    
#     # Important metrics
#     cm = confusion_matrix(y_test, y_pred)
#     accuracy = accuracy_score(y_test, y_pred)
#     print(accuracy)

In [6]:
# Creating dummy variables for categories - one-hot encoding
alldata_enc = pd.get_dummies(alldata, columns=['state', 'main_category', 'country'])
alldata_enc.head()

# Extracting column names for tree visualization later
cols = list(alldata_enc.columns)
cols.remove('state_failed')
cols.remove('state_successful')

#Create a training set and a test set 
X = alldata_enc.drop(['state_successful', 'state_failed'], axis='columns').values
y = alldata_enc.state_successful.values
X_t, X_test, y_t, y_test = train_test_split(X, y, test_size = 0.18)

# Adding a validation set 
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size = 0.22)

# Initialize sklearn's built-in scorer
scorer = make_scorer(accuracy_score)

What does our new train/val/test split yield?

In [14]:
RandomForest(0, X_train, X_val, y_train, y_val)
val_pred = classifier.predict(X_val)
val_cm = confusion_matrix(y_val, val_pred)
print(val_cm)

test_pred = classifier.predict(X_test)
test_cm = confusion_matrix(y_test, test_pred)
print(test_cm)

[[35434   232]
 [  110 24059]]
[[35553   218]
 [  117 23814]]


In [15]:
#true positives/total predicted positives

val_precision = (35434/(35343+110)*100)
test_precision = (35553/(35553+117)*100)
print('Validation set precision: ', val_precision,'%')
print('Test set precision: ', test_precision,'%')

Validation set precision:  99.94640792034525 %
Test set precision:  99.67199327165686 %


Learning curve

In [None]:
train_sizes = np.arange(5,len(X_train),10000)

train_sizes, train_scores, validation_scores = learning_curve(
                                                   classifier, X, y, train_sizes = train_sizes, cv = 3,
                                                   scoring = 'accuracy')

train_scores_mean = train_scores.mean(axis = 1)
validation_scores_mean = validation_scores.mean(axis = 1)

plt.style.use('seaborn')

plt.plot(train_sizes, train_scores_mean, label = 'Train set')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation set')

plt.ylabel('Accuracy', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curve for random forest classifier', fontsize = 18, y = 1.03)
plt.legend()
plt.ylim(0.4,1)

ROC curve

In [None]:
from sklearn.metrics import roc_curve, auc

# Calculate roc auc
rf_probs = classifier.predict_proba(X_test)[:, 1]

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(2):
    fpr[i], tpr[i], _ = roc_curve(y_test, rf_probs)
    roc_auc[i] = auc(fpr[i], tpr[i])

print(roc_auc_score(y_test, rf_probs))
plt.figure()
plt.plot(fpr[1], tpr[1])
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()

print(auc(fpr[1], tpr[1]))

# measure of separability, close to 1 = good at predicting correctly, x is false positive and y is true positive 
# https://www.kaggle.com/learn-forum/53782