# IMPORTS

In [1]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
# from roc import plot_roc
from sklearn.metrics import roc_curve

# READ IN DATA

In [4]:
# Load the pickled user table; the cell's displayed value is its row count.
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
path_users = "/Users/gandalf/Documents/data/data_users.pkl"
user_df = pd.read_pickle(path_users)
user_df.shape[0]

45010

In [5]:
# Load the pickled message/conversation table; the cell's displayed value is
# its row count.
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
path_response = "/Users/gandalf/Documents/data/data_response.pkl"
response_df = pd.read_pickle(path_response)
response_df.shape[0]

53405

In [None]:
# Preview the first rows of the message/conversation table.
# response_df is already loaded from the same pickle by the preceding cell, so
# the file is not read a second time here.
response_df.head(5)

In [None]:
# Cull to just messages from users outside NA: keep the rows of response_df
# whose first_uid appears in outside_NA_uid.
# NOTE(review): outside_NA_uid is not defined in any visible cell; it has to be
# created before this cell can run on a fresh kernel -- confirm where it comes from.
print("All conversations: {}".format(len(response_df)))
# .copy() yields an independent frame, so columns added to outside_NA_conv in
# later cells are not written to a slice of response_df (SettingWithCopyWarning).
outside_NA_conv = response_df[response_df.first_uid.isin(outside_NA_uid)].copy()
print("Outside NA conversations: {}".format(len(outside_NA_conv)))
outside_NA_conv.head()

In [None]:
# Response rate: sum of the `response` column divided by the number of
# outside-NA conversations.
outside_NA_conv['response'].sum() / outside_NA_conv.shape[0]

In [None]:
# Get recipients: fill in second_uid where it is missing.
# conv_id is split into its first ten characters and the remainder; whichever
# half is NOT the sender (first_uid) is taken as the recipient.
outside_NA_conv = outside_NA_conv.copy()  # independent frame, safe to add columns to
outside_NA_conv['first_ten'] = outside_NA_conv.conv_id.str[:10]
outside_NA_conv['last_ten'] = outside_NA_conv.conv_id.str[10:]

# The earlier row-by-row version assigned to the row objects yielded by
# iterrows(); those are copies, so the DataFrame was never updated. The masks
# below reproduce the same if / elif / else decision for every row at once.
missing_recipient = outside_NA_conv.second_uid.isnull()
sender_is_first = outside_NA_conv.first_ten == outside_NA_conv.first_uid
sender_is_last = outside_NA_conv.last_ten == outside_NA_conv.first_uid

take_last = missing_recipient & sender_is_first
take_first = missing_recipient & ~sender_is_first & sender_is_last
unresolved = missing_recipient & ~sender_is_first & ~sender_is_last

# Series.where keeps the value where the condition is True and substitutes the
# second argument elsewhere, hence the negated masks.
recipient = outside_NA_conv.second_uid.where(~take_last, outside_NA_conv.last_ten)
recipient = recipient.where(~take_first, outside_NA_conv.first_ten)
outside_NA_conv['second_uid'] = recipient

# Same diagnostic as before: one 'uh' per row whose conv_id halves match
# neither side of first_uid.
for _ in range(int(unresolved.sum())):
    print('uh')

In [None]:
# Preview the rows whose `response` value is True.
outside_NA_conv.loc[outside_NA_conv['response']].head()

In [None]:
# Make it simple: keep only the two user ids and the response flag.
simple_cols = ['first_uid', 'second_uid', 'response']
# .copy() so that feature columns added to simple_df later are written to an
# independent frame rather than to a slice of outside_NA_conv.
simple_df = outside_NA_conv[simple_cols].copy()
simple_df.head()

In [None]:
# Build up with user info: compare the two users of every conversation.
# user_df is looked up by the string form of each uid. Both sides are fetched
# in one batched .loc call (rows come back in conversation order) instead of
# one lookup per row inside iterrows().
first_users = user_df.loc[simple_df.first_uid.map(str).values]
second_users = user_df.loc[simple_df.second_uid.map(str).values]

# Start from a fresh frame so re-running the cell does not mutate a frame that
# earlier cells displayed, and so no write lands on a slice.
simple_df = simple_df.copy()

# Absolute age gap between the two users.
simple_df['age_dif'] = np.abs(first_users.age.values - second_users.age.values)

# Element-wise "do both users share this attribute?" flags. Positional .values
# are compared because the two lookups carry different index labels.
simple_df['same_gender'] = first_users.gender.values == second_users.gender.values
simple_df['same_relate'] = first_users.inRelationship.values == second_users.inRelationship.values
simple_df['same_clean'] = first_users.isClean.values == second_users.isClean.values
simple_df['same_night'] = first_users.isNight.values == second_users.isNight.values
simple_df['same_student'] = first_users.isStudent.values == second_users.isStudent.values

# TODO (from the original notes): filled na at some point, should undo

simple_df.head(2)

In [None]:
# Split simple_df into the target y (response mapped to 1/0) and the feature
# matrix X (everything except the two ids and the response itself).
# The cell's displayed value is the number of positive targets.
y = simple_df.response.map({True: 1, False: 0})
X = simple_df.drop(['first_uid', 'second_uid', 'response'], axis=1)
y.sum()

In [None]:
# Train / test split; random_state is fixed so the partition is repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=17)

In [None]:
# Use sklearn's RandomForestClassifier to build a model of the data and report
# its test-set metrics.
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Reuse y_pred rather than predicting a second time.
print("Prediction:   {}".format(y_pred))
print("Actual:       {}".format(y_test.values))

# Recall is measured over the actual positives and precision over the predicted
# positives; the two explanations were previously attached to the wrong metric.
print("\nModel recall: {} (Of things that are 1's, how many does the model say are 1's?)".format(recall_score(y_test, y_pred)))
print("Model precision: {} (Of things the model says are 1's, how many are 1's?)".format(precision_score(y_test, y_pred)))
print("Score of model: {}".format(model.score(X_test, y_test)))
# Baseline: accuracy obtained by always predicting 0.
print("Score to beat: {}".format(1-y_test.sum()/len(y_test)))

print("\nconfusion matrix:")
print("   N  P")
# y_pred is the name defined above (the former y_predict did not exist here).
print(confusion_matrix(y_test, y_pred))

In [None]:
# Refit the forest with out-of-bag scoring enabled, then print the test-set
# score next to the out-of-bag estimate.
model = RandomForestClassifier(oob_score=True, n_estimators=30)
model.fit(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print("accuracy score:", test_accuracy)
print("out of bag score:", model.oob_score_)

In [None]:
# Use sklearn's model to get the feature importances.
# argsort is ascending, so the most important feature is at the end.
feature_importances = np.argsort(model.feature_importances_)
# The model was fit on (a split of) X, so these positions index X.columns.
# simple_df still has first_uid / second_uid / response in front, which would
# shift every reported name by three columns.
top_positions = feature_importances[::-1][:5]
print("top five:", list(X.columns[top_positions]))

In [None]:
# Calculate the standard deviation for feature importances across all trees

n = 10  # cap: only the first n features (in column order) are considered

importances = model.feature_importances_[:n]
# Per-feature spread of the importances over the individual trees of the forest.
std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
# Positions sorted from most to least important, and the matching column names.
indices = np.argsort(importances)[::-1]
features = list(X.columns[indices])

print("Feature ranking:")

# Print at most five entries; slicing (instead of a fixed range(5)) avoids an
# IndexError when fewer than five features are available.
for rank, position in enumerate(indices[:5], start=1):
    print("%d. %s (%f)" % (rank, features[rank - 1], importances[position]))


In [None]:
# Display the column names ordered from most to least important
# (built from the descending argsort of `importances`).
features

In [None]:
# Plot the feature importances of the forest, with the across-tree standard
# deviation as error bars.
n_bars = len(indices)  # one bar per ranked feature rather than a hard-coded 6
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_bars), importances[indices], yerr=std[indices], color="r", align="center")
plt.xticks(range(n_bars), features, rotation=45)
# Upper limit is n_bars so the last bar (centred on n_bars - 1) is fully
# visible; the former limit of 5 cut the sixth bar in half.
plt.xlim([-1, n_bars])
plt.show()

In [None]:
# Try modifying the number of trees: for each forest size, average the test
# score of five independently fitted forests.

num_trees = range(5, 50, 5)
accuracies = []
for n in num_trees:
    tot = 0
    for i in range(5):
        rf = RandomForestClassifier(n_estimators=n)
        rf.fit(X_train, y_train)
        tot += rf.score(X_test, y_test)
    accuracies.append(tot / 5)

fig, ax = plt.subplots()
ax.plot(num_trees, accuracies)
# Labels are set through the Axes object. Assigning a string to plt.xlabel /
# plt.ylabel does not label the plot; it rebinds the pyplot functions for the
# rest of the session.
ax.set_xlabel("num_trees")
ax.set_ylabel("accuracy")
plt.show()

In [None]:
# Try modifying the number of trees: repeat the whole sweep ten times and draw
# every repetition as its own line on a single figure.
num_trees = range(5, 50, 5)
fig, ax = plt.subplots()
for nn in range(10):
    accuracies = []
    for n in num_trees:
        tot = 0
        for i in range(5):
            rf = RandomForestClassifier(n_estimators=n)
            rf.fit(X_train, y_train)
            tot += rf.score(X_test, y_test)
        accuracies.append(tot / 5)
    ax.plot(num_trees, accuracies)
# Labels are set through the Axes object. Assigning a string to plt.xlabel /
# plt.ylabel does not label the plot; it rebinds the pyplot functions for the
# rest of the session.
ax.set_xlabel("num_trees")
ax.set_ylabel("accuracy")
plt.show()

In [None]:
# Modifying the max features parameter: ten repeated sweeps, one line each.
# NOTE(review): the range stops at len(X.columns) - 1, so using every feature
# at each split is never tried -- confirm whether that is intended.
num_features = range(2, len(X.columns))
fig, ax = plt.subplots()
for nn in range(10):
    accuracies = []
    for n in num_features:
        tot = 0
        for i in range(5):
            rf = RandomForestClassifier(max_features=n)
            rf.fit(X_train, y_train)
            tot += rf.score(X_test, y_test)
        accuracies.append(tot / 5)
    ax.plot(num_features, accuracies)
# Labels are set once, through the Axes object. Assigning a string to
# plt.xlabel / plt.ylabel does not label the plot; it rebinds the pyplot
# functions for the rest of the session.
ax.set_xlabel("num_features")
ax.set_ylabel("accuracy")
plt.show()

In [None]:
# Run all the other classifiers that we have learned so far in class
def get_scores(classifier, X_train, X_test, y_train, y_test, **kwargs):
    """Fit ``classifier(**kwargs)`` on the training data and score it on the test data.

    Parameters
    ----------
    classifier : callable
        Called with ``**kwargs`` to build the estimator; the result must offer
        ``fit``, ``predict`` and ``score``.
    X_train, X_test, y_train, y_test
        Training and held-out features / targets, passed straight through.
    **kwargs
        Constructor arguments for ``classifier``.

    Returns
    -------
    tuple
        ``(model.score(X_test, y_test), precision, recall)``, the last two
        computed from the test-set predictions.
    """
    fitted = classifier(**kwargs)
    fitted.fit(X_train, y_train)
    predictions = fitted.predict(X_test)

    accuracy = fitted.score(X_test, y_test)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    return accuracy, precision, recall

# Score table: one line per classifier, each showing the tuple returned by
# get_scores on the shared train/test split.
print("    Model,                Accuracy, Precision, Recall")
candidates = [
    ("    Random Forest:       ", RandomForestClassifier, {"n_estimators": 25, "max_features": 5}),
    ("    Logistic Regression: ", LogisticRegression, {}),
    ("    Decision Tree:       ", DecisionTreeClassifier, {}),
    ("    Naive Bayes:         ", MultinomialNB, {}),
]
for label, classifier, params in candidates:
    print(label, get_scores(classifier, X_train, X_test, y_train, y_test, **params))

In [None]:
# Extra imports for the cross-validated ROC helper.
# numpy, matplotlib.pyplot and roc_curve are already imported by the first
# import cell; they are repeated here.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# NOTE(review): scipy.interp is a deprecated alias of numpy.interp and is not
# available in recent SciPy releases -- confirm against the installed version.
from scipy import interp
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20. Its
# replacement, sklearn.model_selection.KFold, takes n_splits and yields the
# fold indices from .split(), so it is not a drop-in substitute.
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler

In [None]:
def plot_roc(X, y, clf_class, title, **kwargs):
    """Plot 5-fold cross-validated ROC curves for one classifier.

    X is standardised and split into five shuffled folds. For every fold a
    fresh ``clf_class(**kwargs)`` is fit on the training part, and its
    ``predict_proba`` output for the second class is used to draw that fold's
    ROC curve. The mean curve (interpolated onto 100 evenly spaced false
    positive rates) and the chance diagonal are added, then the figure is shown.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix; a DataFrame is fine, it becomes an ndarray when scaled.
    y : array-like, shape (n_samples,)
        Binary labels. Converted to an ndarray so the positional fold indices
        select the intended rows even when a pandas Series with a
        non-positional index is passed.
    clf_class : callable
        Called with ``**kwargs`` to build a classifier offering ``fit`` and
        ``predict_proba``.
    title : str
        Prefix of the plot title (``title + 'ROC'``).
    **kwargs
        Constructor arguments for ``clf_class``.

    Returns
    -------
    None
        The curves are drawn on the current matplotlib figure and displayed.
    """
    # Imported locally so the helper works with current scikit-learn without
    # relying on the legacy sklearn.cross_validation / scipy.interp imports.
    from sklearn.metrics import auc, roc_curve
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import StandardScaler

    # NOTE(review): the scaler is fit on all rows before splitting, so the test
    # folds influence the scaling; kept as in the original implementation.
    X = StandardScaler().fit_transform(X)
    y = np.asarray(y)

    n_folds = 5
    kf = KFold(n_splits=n_folds, shuffle=True)
    y_prob = np.zeros((len(y), 2))
    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros_like(mean_fpr)

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        clf = clf_class(**kwargs)
        clf.fit(X[train_index], y[train_index])
        # Predict probabilities, not classes
        y_prob[test_index] = clf.predict_proba(X[test_index])
        fpr, tpr, _thresholds = roc_curve(y[test_index], y_prob[test_index, 1])
        # Resample this fold's curve onto the shared FPR grid for averaging.
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    mean_tpr /= n_folds
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Random')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    # Label through the Axes object so this works even if plt.xlabel /
    # plt.ylabel have been rebound to something else in the session.
    ax = plt.gca()
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    plt.title(title + 'ROC')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Synthetic data for exercising plot_roc: a 100 x 6 matrix of uniform random
# features and 100 random 0/1 labels.
xfake = np.random.rand(100, 6)
yfake = np.random.randint(0, 2, size=(100,))

In [None]:
# Smoke-test plot_roc on the synthetic data with a random forest.
plt.figure(figsize=(20, 16))
print("Visualize the roc curve of each model")
forest_params = dict(n_estimators=25, max_features=5)
plot_roc(xfake, yfake, RandomForestClassifier, 'Random_Forest', **forest_params)
print('\nPlotting completed.')


In [None]:
# ROC curves for each model on the real features.
print("Visualize the roc curve of each model")
# Pass the labels as a plain ndarray: plot_roc selects rows with positional
# fold indices, and y's pandas index (inherited from a filtered frame) is not
# guaranteed to be 0..n-1.
y_values = y.values
plot_roc(X, y_values, RandomForestClassifier, 'Random_Forest', n_estimators=25, max_features=5)
plot_roc(X, y_values, LogisticRegression, 'Logistic_Regrssion')
plot_roc(X, y_values, DecisionTreeClassifier, 'Decision_Tree')
# MultinomialNB is left out: the original note records that it errors here.
print('\nPlotting completed.')

In [None]:
# TODO:
#   - ROC curve
#   - split data
#   - how to t/t  (presumably train/test -- confirm)
    