## INCLUDES:

- LINEAR REGRESSION

- RANDOM FOREST REGRESSOR

- GRADIENT BOOSTING REGRESSOR

In [1]:
import pandas as pd
import numpy as np
import my_pickle as mp
import my_functions as mf
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestRegressor

import my_resample as ms

import warnings
warnings.filterwarnings('ignore')


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence


from importlib import reload

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the feature frame and the target column used by every model below
X = mp.unjson_it('data_X')
y = mp.unjson_it('data_y')['convo_length']

# attach both text-similarity scores to the feature frame as extra columns
text_similarity_df = mp.unjson_it('data_text_similarity')
for similarity_col in ('count_similarity', 'tfidf_similarity'):
    X[similarity_col] = text_similarity_df[similarity_col]

# PREPARE DATA

The same train/test split and resampling are used for every model below (the scaling step is currently commented out).

In [7]:
# train test split
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; both return the underlying numpy array.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=17)

# resample
# only the training split is passed to the oversampler; the test split is left as drawn
X_train, y_train = ms.oversample(X_train, y_train, .5)

# scale data
# NOTE: scaling is currently disabled, so the models below are fit on unscaled features
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

# RELOAD FN

In [11]:
# re-import my_functions so that edits to the module take effect without restarting the kernel
reload(mf)

<module 'my_functions' from '/Users/gandalf/Documents/Galvanize/MatchingService/my_functions.py'>

# LINEAR REGRESSION

In [12]:
# train a linear regression and report its R^2 on the training split
model = LinearRegression().fit(X_train, y_train)
train_r2 = model.score(X_train, y_train)
print(train_r2)

# predictions on the held-out split (later cells read `y_pred` as a global)
y_pred = model.predict(X_test)

# metrics report produced by the project helper
mf.display_metrics(model, X_test, y_test)

# first five rows of the helper's coefficient table
mf.display_importances_linear(model, X).head(5)

0.0624753394745

METRICS
Model recall: 0.18235294117647058
Model precision: 0.1497584541062802
Model accuracy: -1.105840856027342

CONFUSION MATRIX
[[2436  176]
 [ 139   31]]

key:
 TN   FP 
 FN   TP 

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.182352941176 	 0.149758454106 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.0611071171819 predict all one
0.494117647059 	 0.0587412587413 predict 50-50
0.1 0.0553745928339 predict 90-10


Unnamed: 0,feature,coefficient,abs_value,sign
72,tfidf_similarity,5.78,5.78,1.0
56,same_state,-0.79,0.79,-1.0
68,type_sender,-0.55,0.55,-1.0
47,same_city,-0.54,0.54,-1.0
43,question_count_receiver,0.52,0.52,1.0


# RANDOM FOREST REGRESSOR

In [15]:
# fit model
# random_state pins the forest's bootstrap/feature sampling so a Restart & Run All
# reproduces the same metrics; 17 matches the seed used for the train/test split.
# (sample_weight=None is the default for fit, so it is no longer passed explicitly.)
model = RandomForestRegressor(n_estimators=100, random_state=17)
model.fit(X_train, y_train)

# make predictions (later cells read `y_pred` as a global)
y_pred = model.predict(X_test)

# show metrics
mf.display_metrics(model, X_test, y_test)

# show importances
mf.display_importances_trees(model, X).head(5)


METRICS
Model recall: 0.31176470588235294
Model precision: 0.14363143631436315
Model accuracy: -0.6514063153500771

CONFUSION MATRIX
[[2296  316]
 [ 117   53]]

key:
 TN   FP 
 FN   TP 

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.311764705882 	 0.143631436314 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.0611071171819 predict all one
0.458823529412 	 0.0552016985138 predict 50-50
0.111764705882 0.0763052208835 predict 90-10


Unnamed: 0,feature,coefficient
6,distance,0.17
69,urgency_receiver,0.16
72,tfidf_similarity,0.04
21,hobbies_overlap,0.03
2,age_dif,0.03


# GRADIENT BOOSTING REGRESSOR

In [14]:
# fit model
# random_state fixes the per-tree seed and feature permutation so reruns give the
# same model; 17 matches the seed used for the train/test split.
# (sample_weight=None is the default for fit, so it is no longer passed explicitly.)
model = GradientBoostingRegressor(random_state=17)
model.fit(X_train, y_train)

# make predictions (later cells read `y_pred` as a global)
y_pred = model.predict(X_test)

# show metrics
mf.display_metrics(model, X_test, y_test)

# show importances
mf.display_importances_trees(model, X).head(5)


METRICS
Model recall: 0.12352941176470589
Model precision: 0.1721311475409836
Model accuracy: -1.0434314166709875

CONFUSION MATRIX
[[2511  101]
 [ 149   21]]

key:
 TN   FP 
 FN   TP 

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.123529411765 	 0.172131147541 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.0611071171819 predict all one
0.511764705882 	 0.0631349782293 predict 50-50
0.0705882352941 0.0495867768595 predict 90-10


Unnamed: 0,feature,coefficient
6,distance,0.15
69,urgency_receiver,0.13
72,tfidf_similarity,0.09
70,urgency_sender,0.08
45,rent_overlap,0.07


### RESAMPLE AS A HYPERPARAMETER

In [None]:
def all_together(X_df,resamp,cutoff):
    """Run split -> oversample -> scale -> linear regression and score it as a classifier.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature columns plus a 'convo_length' column that is used as the target.
    resamp : number
        Forwarded as the third argument to ``ms.oversample`` for the training split.
        (Presumably the oversampling ratio -- confirm against my_resample.)
    cutoff : number
        Threshold applied to both the predictions and the true targets; values
        strictly greater than ``cutoff`` are labelled 1, everything else 0.

    Returns
    -------
    tuple
        ``(recall, precision)`` of the thresholded predictions on the test split.
    """
    # GET DATA
    y = X_df.convo_length
    X = X_df.drop(['convo_length'], axis=1)

    # TRAIN TEST SPLIT
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0
    X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=17)

    # RESAMPLE (training split only, so the test split keeps its drawn class balance)
    X_train, y_train = ms.oversample(X_train, y_train, resamp)

    # SCALE DATA (fit on the training split, then apply the same transform to both)
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # fit model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # make predictions
    y_pred = model.predict(X_test)

    # turn the regression problem into a binary one at the given cutoff
    y_pred_binary = (np.asarray(y_pred) > cutoff).astype(int)
    y_test_binary = (np.asarray(y_test) > cutoff).astype(int)

    return recall_score(y_test_binary, y_pred_binary), precision_score(y_test_binary, y_pred_binary)

In [None]:
# all_together() expects a single frame holding the features plus a 'convo_length'
# target column. `X_df` is not defined anywhere earlier in this notebook, so on a
# fresh kernel this cell raised NameError. Build it from X and y, pairing rows by
# position, as the train/test split above does.
# NOTE(review): presumably this matches the frame the sweep was originally run on -- confirm.
X_df = X.assign(convo_length=y.values)

# sweep the resample value passed to ms.oversample, scoring each fit at a cutoff of 1
recall = []
precision = []
x_values = np.arange(.01,10,.01)
for x in x_values:
    r,p = all_together(X_df,x, 1)
    recall.append(r)
    precision.append(p)

In [None]:
# plot the sweep results; the legend and axis labels let the two curves be told apart
fig, ax = plt.subplots()
ax.plot(x_values, recall, label='recall')
ax.plot(x_values, precision, label='precision')
ax.set_title('RECALL AND PRECISION')
ax.set_xlabel('resample value')
ax.set_ylabel('score')
ax.legend()
plt.show()

In [None]:
def change_cutoff(cutoff, lotsa = True):
    """Threshold the current predictions and targets at ``cutoff`` and score them.

    Reads the notebook globals ``y_pred`` and ``y_test``; values strictly greater
    than ``cutoff`` become 1, everything else 0.

    Parameters
    ----------
    cutoff : number
        Threshold applied to both predictions and true targets.
    lotsa : bool, default True
        When True, also print recall, precision, accuracy and the confusion matrix.

    Returns
    -------
    tuple
        ``(recall, precision)`` of the thresholded predictions.
    """
    y_pred_binary = [1 if x > cutoff else 0 for x in y_pred]
    y_test_binary = [1 if x > cutoff else 0 for x in y_test]

    # score once and reuse the values for both the printout and the return
    recall = recall_score(y_test_binary, y_pred_binary)
    precision = precision_score(y_test_binary, y_pred_binary)

    if lotsa:
        # The models in this notebook are regressors, so model.score() returns R^2
        # rather than classification accuracy. Report the fraction of binary labels
        # that actually match instead.
        accuracy = np.mean(np.array(y_test_binary) == np.array(y_pred_binary))

        print("\nMETRICS")
        print("Model recall: {}".format(recall))
        print("Model precision: {}".format(precision))
        print("Model accuracy: {}".format(accuracy))

        print ("\nCONFUSION MATRIX")
        print (confusion_matrix(y_test_binary, y_pred_binary))
        print ("\nkey:")
        print (" TN   FP ")
        print (" FN   TP ")

    return recall, precision

change_cutoff(1.5)

In [None]:
def plot_recall_precision():
    """Plot recall and precision of the current predictions for cutoffs 0.0, 0.1, ... up to 10.

    Each cutoff is scored with ``change_cutoff(cutoff, False)``, so this uses
    whatever that function reads from the notebook's globals.
    """
    x_values = np.arange(0,10,.1)
    scores = [change_cutoff(cutoff, False) for cutoff in x_values]
    recall = [r for r, _ in scores]
    precision = [p for _, p in scores]

    fig, ax = plt.subplots()
    # label both curves so the plot can be read on its own
    ax.plot(x_values, recall, label='recall')
    ax.plot(x_values, precision, label='precision')
    ax.set_title('RECALL AND PRECISION')
    ax.set_xlabel('cutoff')
    ax.set_ylabel('score')
    ax.legend()
    plt.show()
plot_recall_precision()

In [None]:
# `feature_importances` is not defined anywhere earlier in this notebook, so this
# cell raised NameError on a fresh kernel. Rank the column indices by the fitted
# model's importances: argsort is ascending, so the last six are the six most important.
# NOTE(review): assumes `model` is the tree ensemble fit above (it must expose
# `feature_importances_`) and that this ranking was the original intent -- confirm.
feature_importances = np.argsort(model.feature_importances_)
plot_partial_dependence(model, X_train, feature_importances[-6:].tolist(), X.columns, n_jobs=-1,figsize = (16,8))
plt.show()

In [None]:
# partial dependence for one feature, selected by its positional column index in X_train
single_feature = [71]
fig, axs = plot_partial_dependence(model, X_train, single_feature)
plt.show()

In [None]:
# change_cutoff() is already defined in an earlier cell; this cell used to redefine
# an identical copy, which silently shadows the first. Re-use the existing
# function and just re-score at a cutoff of 1.
change_cutoff(1)

In [None]:
# plot_recall_precision() is already defined in an earlier cell; this cell used to
# redefine an identical copy. Re-use the existing function instead.
plot_recall_precision()