## INCLUDES:

# LINEAR REGRESSION

# RANDOM FOREST REGRESSOR

# GRADIENT BOOSTING REGRESSOR

In [None]:
import pandas as pd
import numpy as np
import my_pickle as mp
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestRegressor

import my_resample as ms

import warnings
warnings.filterwarnings('ignore')


from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence

In [None]:
def convert_to_binary(lst, cutoff=1):
    """Threshold a sequence of numbers into 0/1 labels.

    Parameters
    ----------
    lst : iterable of numbers
        Values to threshold.
    cutoff : number, default 1
        A value maps to 1 only when it is strictly greater than this.

    Returns
    -------
    list of int
        One 0/1 label per input value, in the original order.
    """
    labels = []
    for value in lst:
        if value > cutoff:
            labels.append(1)
        else:
            labels.append(0)
    return labels

In [None]:
# Load the feature frame and the target column `convo_length`.
X = mp.unjson_it('data_X')
y = mp.unjson_it('data_y')['convo_length']

# Attach the two text-similarity columns to the feature frame.
text_similarity_df = mp.unjson_it('data_text_similarity')
X['count_similarity'] = text_similarity_df['count_similarity']
X['tfidf_similarity'] = text_similarity_df['tfidf_similarity']

# Later cells read `X_df` (features plus a `convo_length` column), but nothing in
# this notebook defines it, so a fresh-kernel run stops with a NameError.
# NOTE(review): it is rebuilt here from X and y on the assumption that they share
# an index and that this matches the frame used originally - confirm. The random
# forest section also drops a `response` column from `X_df`; presumably `data_X`
# provides it - verify.
X_df = X.assign(convo_length=y)

# PREPARE DATA

Train/test split, oversampling of the training set, and feature scaling. Each model section below repeats these steps with its own settings.

In [None]:
# train test split
# `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=17)

# resample: only the training split is oversampled, the test split is left untouched
X_train, y_train = ms.oversample(X_train, y_train, .9)

# scale data: fit the scaler on the training split, then reuse it for the test split
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# LINEAR REGRESSION

In [None]:
# Fit a linear regression on the prepared training split and print its training score.
model = LinearRegression().fit(X_train, y_train)
print(model.score(X_train, y_train))

# Predictions for the test split; the metric cells below read `y_pred`.
y_pred = model.predict(X_test)

### CONVERT TO BINARY AND GET ACCURACY ETC

In [None]:
# Threshold predictions and true targets at the same cutoff to obtain class labels.
y_pred_binary = convert_to_binary(y_pred, 3)
y_test_binary = convert_to_binary(y_test, 3)

print("\nMETRICS")
print("Model recall: {}".format(recall_score(y_test_binary, y_pred_binary)))
print("Model precision: {}".format(precision_score(y_test_binary, y_pred_binary)))
# Accuracy is the share of matching binary labels. The earlier `model.score(...)`
# call returned the regressor's R^2 against the 0/1 targets, which is not an accuracy.
print("Model accuracy: {}".format(np.mean(np.array(y_test_binary) == np.array(y_pred_binary))))

print("\nCONFUSION MATRIX")
print(confusion_matrix(y_test_binary, y_pred_binary))
print("\nkey:")
print(" TN   FP ")
print(" FN   TP ")

In [None]:
def display_feature_importances(fitted_model=None, feature_names=None):
    """Rank the coefficients of a linear model by absolute size.

    Parameters
    ----------
    fitted_model : object with a 1-D ``coef_`` attribute, optional
        Defaults to the notebook-level ``model``.
    feature_names : sequence of str, optional
        One name per coefficient. Defaults to ``X.columns``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``coefficient``, ``abs_value`` and ``sign``,
        sorted by ``abs_value`` in descending order.

    Notes
    -----
    Sets ``pd.options.display.float_format`` for the rest of the session.
    """
    if fitted_model is None:
        fitted_model = model
    if feature_names is None:
        feature_names = X.columns

    # show feature importances
    pd.options.display.float_format = '{:,.2f}'.format

    # Build the frame column-wise so the numeric columns keep a float dtype
    # (a list-of-rows frame followed by a transpose ends up as object dtype).
    coefficients = np.asarray(fitted_model.coef_, dtype=float)
    feature_df = pd.DataFrame(
        {'feature': list(feature_names), 'coefficient': coefficients},
        columns=['feature', 'coefficient'],
    )
    feature_df['abs_value'] = feature_df['coefficient'].abs()
    # np.sign is defined for a zero coefficient; coefficient / abs_value is not.
    feature_df['sign'] = np.sign(feature_df['coefficient'])
    return feature_df.sort_values('abs_value', ascending=False)
# Show the 20 coefficients with the largest absolute value.
display_feature_importances().head(20)

### RESAMPLE AS A HYPERPARAMETER

In [None]:
def all_together(X_df,resamp,cutoff):
    """Run the linear-regression pipeline end to end for one oversampling setting.

    The frame is split into train/test (fixed ``random_state=17``), the training
    split is oversampled, features are standardised with statistics from the
    training split, a ``LinearRegression`` is fitted, and the test predictions and
    test targets are thresholded at ``cutoff`` so they can be scored as binary labels.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Feature columns plus the target column ``convo_length``.
    resamp : float
        Forwarded unchanged as the third argument of ``ms.oversample``.
    cutoff : float
        Values strictly greater than this are labelled 1, all others 0.

    Returns
    -------
    tuple of (float, float)
        ``(recall, precision)`` on the test split.
    """
    # GET DATA
    target = X_df['convo_length']
    features = X_df.drop(['convo_length'], axis=1)

    # TRAIN TEST SPLIT
    # `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
    X_train, X_test, y_train, y_test = train_test_split(
        features.values, target.values, random_state=17)

    # RESAMPLE (training split only)
    X_train, y_train = ms.oversample(X_train, y_train, resamp)

    # SCALE DATA
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # fit model and make predictions
    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Same thresholding helper the rest of the notebook uses.
    y_pred_binary = convert_to_binary(y_pred, cutoff)
    y_test_binary = convert_to_binary(y_test, cutoff)

    return recall_score(y_test_binary, y_pred_binary), precision_score(y_test_binary, y_pred_binary)

In [None]:
# Sweep the oversampling setting with the cutoff fixed at 1 and collect the
# recall / precision pair returned for every value.
x_values = np.arange(.01, 10, .01)
recall, precision = [], []
for resamp in x_values:
    rec, prec = all_together(X_df, resamp, 1)
    recall.append(rec)
    precision.append(prec)

In [None]:
# Recall and precision as a function of the oversampling setting swept above.
fig, ax = plt.subplots()
ax.plot(x_values, recall, label='recall')
ax.plot(x_values, precision, label='precision')
ax.set_title('RECALL AND PRECISION')
# Label the axes and the curves so the two lines can be told apart.
ax.set_xlabel('oversample setting (resamp)')
ax.set_ylabel('score')
ax.legend()
plt.show()

# RANDOM FOREST REGRESSOR

In [None]:
# GET DATA
y = X_df.convo_length
X = X_df.drop(['convo_length','response'], axis=1)

# TRAIN TEST SPLIT
# `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=17)

# RESAMPLE (currently disabled for this model)
# X_train, y_train = ms.oversample(X_train, y_train, .5)

# SCALE DATA
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# fit model
# A fixed random_state makes the forest, and every metric derived from it,
# reproducible from one run to the next.
model = RandomForestRegressor(random_state=17)
model.fit(X_train, y_train)

In [None]:
# get feature importances
feature_importances = np.argsort(model.feature_importances_)
top_n = 10 #len(X.columns)
print("\nFEATURE RANKINGS")
for n in range(top_n):
    print(n+1, '\t',X.columns[feature_importances[-n-1]], '\t',sorted(model.feature_importances_)[-n-1])

In [None]:
# Predictions of the current `model` for the test split; `change_cutoff` below reads this global.
y_pred = model.predict(X_test)

In [None]:
# Display the regressor's score on the test split (for scikit-learn regressors `score` is R^2).
model.score(X_test, y_test)

In [None]:
def change_cutoff(cutoff, lotsa = True):
    """Score the current predictions as a binary classifier at a given cutoff.

    Reads the notebook-level ``y_pred`` and ``y_test`` and labels every value
    strictly greater than ``cutoff`` as 1 and everything else as 0.

    Parameters
    ----------
    cutoff : float
        Threshold applied to both the predictions and the true targets.
    lotsa : bool, default True
        When true, also print recall, precision, accuracy and the confusion matrix.

    Returns
    -------
    tuple of (float, float)
        ``(recall, precision)`` of the thresholded predictions.
    """
    y_pred_binary = convert_to_binary(y_pred, cutoff)
    y_test_binary = convert_to_binary(y_test, cutoff)

    # Compute each metric once and reuse it for printing and for the return value.
    recall = recall_score(y_test_binary, y_pred_binary)
    precision = precision_score(y_test_binary, y_pred_binary)

    if lotsa:
        # Accuracy is the share of matching binary labels. The earlier
        # `model.score(X_test, y_test_binary)` returned the regressor's R^2
        # against the 0/1 targets, which is not an accuracy.
        accuracy = np.mean(np.array(y_test_binary) == np.array(y_pred_binary))

        print("\nMETRICS")
        print("Model recall: {}".format(recall))
        print("Model precision: {}".format(precision))
        print("Model accuracy: {}".format(accuracy))

        print("\nCONFUSION MATRIX")
        print(confusion_matrix(y_test_binary, y_pred_binary))
        print("\nkey:")
        print(" TN   FP ")
        print(" FN   TP ")

    return recall, precision

change_cutoff(1)

In [None]:
def plot_recall_precision():
    """Plot recall and precision of the current predictions against the cutoff.

    Calls ``change_cutoff`` silently for every cutoff from 0 up to (not
    including) 10 in steps of 0.1 and draws both curves on one axis.
    """
    x_values = np.arange(0, 10, .1)
    scores = [change_cutoff(cutoff, False) for cutoff in x_values]
    recall = [rec for rec, _ in scores]
    precision = [prec for _, prec in scores]

    fig, ax = plt.subplots()
    ax.plot(x_values, recall, label='recall')
    ax.plot(x_values, precision, label='precision')
    ax.set_title('RECALL AND PRECISION')
    # Label the axes and the curves so the two lines can be told apart.
    ax.set_xlabel('cutoff')
    ax.set_ylabel('score')
    ax.legend()
    plt.show()

plot_recall_precision()

# GRADIENT BOOSTING REGRESSOR

In [None]:
# TRAIN TEST SPLIT
# Uses the notebook-level X and y as they stand when this cell runs.
# `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, random_state=17)

# RESAMPLE (training split only)
X_train, y_train = ms.oversample(X_train, y_train, .5)

# SCALE DATA
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# fit model
# A fixed random_state keeps the fitted ensemble reproducible between runs.
model = GradientBoostingRegressor(random_state=17)
model.fit(X_train, y_train)

In [None]:
# get feature importances
feature_importances = np.argsort(model.feature_importances_)
top_n = 10 #len(X.columns)
print("\nFEATURE RANKINGS")
for n in range(top_n):
    print(n+1, '\t',X.columns[feature_importances[-n-1]], '\t',sorted(model.feature_importances_)[-n-1],'\t',feature_importances[-n-1])

In [None]:
# Positional indices of the four highest-ranked features (ascending order, so the top one is last).
feature_importances[-4:]
# X.columns[feature_importances[-4:]]

In [None]:
# Predictions of the current `model` for the test split; `change_cutoff` below reads this global.
y_pred = model.predict(X_test)
# Display the regressor's score on the test split (for scikit-learn regressors `score` is R^2).
model.score(X_test, y_test)

In [None]:
def change_cutoff(cutoff, lotsa = True):
    """Score the current predictions as a binary classifier at a given cutoff.

    Reads the notebook-level ``y_pred`` and ``y_test`` and labels every value
    strictly greater than ``cutoff`` as 1 and everything else as 0.

    Parameters
    ----------
    cutoff : float
        Threshold applied to both the predictions and the true targets.
    lotsa : bool, default True
        When true, also print recall, precision, accuracy and the confusion matrix.

    Returns
    -------
    tuple of (float, float)
        ``(recall, precision)`` of the thresholded predictions.
    """
    y_pred_binary = convert_to_binary(y_pred, cutoff)
    y_test_binary = convert_to_binary(y_test, cutoff)

    # Compute each metric once and reuse it for printing and for the return value.
    recall = recall_score(y_test_binary, y_pred_binary)
    precision = precision_score(y_test_binary, y_pred_binary)

    if lotsa:
        # Accuracy is the share of matching binary labels. The earlier
        # `model.score(X_test, y_test_binary)` returned the regressor's R^2
        # against the 0/1 targets, which is not an accuracy.
        accuracy = np.mean(np.array(y_test_binary) == np.array(y_pred_binary))

        print("\nMETRICS")
        print("Model recall: {}".format(recall))
        print("Model precision: {}".format(precision))
        print("Model accuracy: {}".format(accuracy))

        print("\nCONFUSION MATRIX")
        print(confusion_matrix(y_test_binary, y_pred_binary))
        print("\nkey:")
        print(" TN   FP ")
        print(" FN   TP ")

    return recall, precision

change_cutoff(1.5)

In [None]:
def plot_recall_precision():
    """Plot recall and precision of the current predictions against the cutoff.

    Calls ``change_cutoff`` silently for every cutoff from 0 up to (not
    including) 10 in steps of 0.1 and draws both curves on one axis.
    """
    x_values = np.arange(0, 10, .1)
    scores = [change_cutoff(cutoff, False) for cutoff in x_values]
    recall = [rec for rec, _ in scores]
    precision = [prec for _, prec in scores]

    fig, ax = plt.subplots()
    ax.plot(x_values, recall, label='recall')
    ax.plot(x_values, precision, label='precision')
    ax.set_title('RECALL AND PRECISION')
    # Label the axes and the curves so the two lines can be told apart.
    ax.set_xlabel('cutoff')
    ax.set_ylabel('score')
    ax.legend()
    plt.show()

plot_recall_precision()

In [None]:
# Partial-dependence plots for the six highest-ranked features of the current model.
plot_partial_dependence(
    model,
    X_train,
    feature_importances[-6:],
    feature_names=X.columns,
    n_jobs=-1,
    figsize=(16, 8),
)
plt.show()

In [None]:
# Partial dependence for a single feature, selected by position.
# NOTE(review): 71 is a hard-coded positional index into X_train's columns -
# confirm which feature it refers to; it changes meaning if columns are added or removed.
fig, axs = plot_partial_dependence(model, X_train, features=[71])
plt.show()