In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt


import my_resample as ms

import seaborn as sns
sns.set()
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import scale

In [None]:
# Directories used by this notebook.
# NOTE(review): both are absolute paths on one machine; the notebook cannot run
# elsewhere until they are changed.
# data_file_path is prefixed to the pickle file names when the data is read in.
data_file_path = "/Users/gandalf/Documents/coding/do_not_commit/capstone/"
# website_file_path is not referenced anywhere in the visible part of this notebook.
website_file_path = '/Users/gandalf/Documents/coding/rczyrnik.github.io/capstone/'

# READ IN DATA

In [None]:
# Read the feature matrix (X) and the two targets (y1, y2) from their pickles,
# in that order.
X, y1, y2 = [
    pd.read_pickle(data_file_path + pickle_name)
    for pickle_name in ('X.pkl', 'y1.pkl', 'y2.pkl')
]

In [None]:
# Keep the feature names: X is overwritten with the output of scale() in a
# later cell, and display_importances_linear reads this global.
columns = X.columns

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Number of feature columns.
X.shape[1]

In [None]:
# Preview y1, the target used by the logistic-regression cells.
y1.head()

In [None]:
# Preview y2, the target used by the linear-regression cell.
y2.head()

# SCALE DATA

In [None]:
# Standardise every feature with sklearn.preprocessing.scale, overwriting X.
# NOTE(review): scale() is documented to return a NumPy array, so from here on
# X presumably has no .columns/.apply/.drop/.as_matrix; later cells that treat X
# as a DataFrame must re-wrap it with the saved `columns` - confirm.
# NOTE(review): the modelling cells fit a StandardScaler on the training split
# again, so this whole-dataset scaling looks redundant.
X = scale(X)

# SOME HELPFUL FUNCTIONS

In [None]:
def display_metrics(model, X_test, y_test):
    """Print recall, precision, accuracy and a normalised confusion matrix.

    model  : fitted estimator providing ``predict`` and ``score``
    X_test : held-out features
    y_test : held-out labels

    The confusion matrix is divided by ``len(y_test)``, so its cells are
    fractions of the test set rather than counts. Nothing is returned.
    """
    predicted = model.predict(X_test)

    # Each metric is evaluated only when its line is printed, in this order.
    metric_lines = (
        ("Model recall: {}", lambda: recall_score(y_test, predicted)),
        ("Model precision: {}", lambda: precision_score(y_test, predicted)),
        ("Model accuracy: {}", lambda: model.score(X_test, y_test)),
    )

    print("\nMETRICS")
    for template, compute in metric_lines:
        print(template.format(compute()))

    print ("\nCONFUSION MATRIX")
    n_samples = len(y_test)
    print (confusion_matrix(y_test, predicted)/n_samples)
    for key_line in ("\nkey:", " TN   FP ", " FN   TP "):
        print (key_line)
    
def display_importances_linear(model, X):
    """Rank a linear model's features by the absolute size of their coefficients.

    Parameters
    ----------
    model : fitted estimator exposing ``coef_``
        ``coef_`` may be 1-D, or 2-D in which case its first row is used.
    X : DataFrame or array-like
        The data the model was fitted on. If it has a ``columns`` attribute
        those names label the coefficients; otherwise the notebook-level
        ``columns`` global is used.

    Returns
    -------
    DataFrame with columns ``feature``, ``coefficient``, ``abs_value`` and
    ``sign`` (-1, 0 or 1), sorted by ``abs_value`` descending.

    Raises
    ------
    ValueError
        If the number of feature names and coefficients differ.

    Side effect: sets pandas' global float display format to two decimals.
    """
    pd.options.display.float_format = '{:,.2f}'.format

    # Previously the global `columns` was always used, which mislabelled the
    # coefficients of models fitted on a frame with different columns.
    feature_names = getattr(X, 'columns', None)
    if feature_names is None:
        feature_names = columns

    # atleast_2d lets a 1-D coef_ through unchanged; 2-D still takes row 0.
    coefficients = np.asarray(np.atleast_2d(model.coef_)[0], dtype=float)
    if len(feature_names) != len(coefficients):
        raise ValueError(
            "got {} feature names for {} coefficients".format(
                len(feature_names), len(coefficients)))

    feature_df = pd.DataFrame(
        {'feature': list(feature_names), 'coefficient': coefficients},
        columns=['feature', 'coefficient'])
    feature_df['abs_value'] = feature_df['coefficient'].abs()
    # np.sign avoids the 0/0 division the old coefficient/abs_value ratio hit.
    feature_df['sign'] = np.sign(feature_df['coefficient'])
    return feature_df.sort_values('abs_value', ascending=False)

def convert_to_binary(lst, cutoff=1):
    """Return a list with 1 for every value strictly above ``cutoff``, else 0."""
    flags = []
    for value in lst:
        if value > cutoff:
            flags.append(1)
        else:
            flags.append(0)
    return flags

def display_metrics_continuous(model, X_test, y_test, cutoff=2):
    """Print classification-style metrics for a model with continuous output.

    Predictions and true targets are both thresholded with
    ``convert_to_binary`` (1 when the value is > ``cutoff``) before recall,
    precision, accuracy and the confusion matrix are computed.

    model  : fitted estimator providing ``predict`` and ``score``
    X_test : held-out features
    y_test : held-out continuous targets
    cutoff : threshold separating the two classes (default 2, the value that
             was previously hard-coded)

    Nothing is returned. The confusion matrix is printed as raw counts.
    """
    y_pred = model.predict(X_test)
    y_pred_binary = convert_to_binary(y_pred, cutoff)
    y_test_binary = convert_to_binary(y_test, cutoff)

    print("\nMETRICS")
    print("Model recall: {}".format(recall_score(y_test_binary, y_pred_binary)))
    print("Model precision: {}".format(precision_score(y_test_binary, y_pred_binary)))
    # Accuracy is measured on the thresholded labels, like the two metrics
    # above. The old line printed model.score(), which for scikit-learn
    # regressors is R^2, so that value now gets its own correctly named line.
    accuracy = np.mean(np.asarray(y_pred_binary) == np.asarray(y_test_binary))
    print("Model accuracy: {}".format(accuracy))
    print("Model R^2: {}".format(model.score(X_test, y_test)))

    print ("\nCONFUSION MATRIX")
    print (confusion_matrix(y_test_binary, y_pred_binary))
    print ("\nkey:")
    print (" TN   FP ")
    print (" FN   TP ")

# LOGISTIC REGRESSION

In [None]:
%%time

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y1.as_matrix(), random_state=17)

# resample
X_train, y_train = ms.oversample(X_train, y_train, .5)

# scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# fit model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display_metrics(model, X_test, y_test)
print()
print(display_importances_linear(model, X).head(20))
print()

In [None]:
# Print the 20 largest-coefficient features as one run of HTML <div>s:
# red when the coefficient sign is negative, green otherwise.
temp = display_importances_linear(model, X).head(20)
for rank, (feature, _coefficient, _abs_value, sign) in enumerate(temp.values, start=1):
    color = 'red' if sign < 0 else 'green'
    print("<div style='color: {}'> {}. {} </div>".format(color, rank, feature), end='')

# Guessing Randomly

In [None]:
# Baseline: shuffle the true test labels, giving "predictions" with the same
# class balance as y_test but no relation to the features.
# A seeded generator keeps the baseline reproducible between runs.
baseline_rng = np.random.RandomState(17)
y_pred = y_test.copy()
baseline_rng.shuffle(y_pred)

print("\nMETRICS")
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
# Score the shuffled labels themselves; the old line called model.score() and
# so repeated the fitted model's accuracy instead of the baseline's.
print("Model accuracy: {}".format(np.mean(y_pred == y_test)))

print ("\nCONFUSION MATRIX")
print (confusion_matrix(y_test, y_pred)/len(y_pred))
print ("\nkey:")
print (" TN   FP ")
print (" FN   TP ")

print(np.array(y_pred).sum())
print(y_test.sum())
print(len(y_test))
# Positive rate of the test set. This was the literal 304/2689, presumably the
# two numbers printed just above copied from an earlier run - confirm.
print(y_test.sum()/len(y_test))

# Get rid of most of the `has_*` columns

In [None]:
# Independent, labelled copy of the features for the reduced-column experiment.
# X was overwritten by scale() earlier, so it may be a bare array here; wrapping
# it with the saved `columns` restores the DataFrame API the next cells rely on
# (column assignment, .apply, .drop, .columns). Also works if X is a DataFrame.
X_new = pd.DataFrame(X, columns=columns, copy=True)

In [None]:
# `has_*` columns describing the sender (presumably profile-field presence
# flags - confirm against the feature-building code).
has_sender_cols = [
    'has_facebook_sender', 'has_password_sender', 'has_about_sender',
    'has_amenities_sender', 'has_birthdate_sender', 'has_available_sender',
    'has_college_sender', 'has_email_sender', 'has_hobbies_sender',
    'has_hometown_sender', 'has_linkedin_sender', 'has_location_sender',
    'has_neighborhoods_sender', 'has_numRoommates_sender', 'has_picture_sender',
    'has_term_sender', 'has_work_sender',
]

# The same set of `has_*` columns for the receiver.
has_receiver_cols = [
    'has_facebook_receiver', 'has_password_receiver', 'has_about_receiver',
    'has_amenities_receiver', 'has_available_receiver', 'has_birthdate_receiver',
    'has_college_receiver', 'has_email_receiver', 'has_hobbies_receiver',
    'has_hometown_receiver', 'has_linkedin_receiver', 'has_location_receiver',
    'has_neighborhoods_receiver', 'has_numRoommates_receiver', 'has_picture_receiver',
    'has_term_receiver', 'has_work_receiver',
]

# `same_*` columns; these are summed into `user_similarity` further down.
similarity_cols = [
    'same_gender', 'same_relate', 'same_clean', 'same_night', 'same_student',
    'same_smoking', 'same_type', 'same_term', 'same_work', 'same_city',
    'same_state', 'same_country', 'same_college', 'same_metro',
]

In [None]:
def sum_columns(row, lst):
    """Return the total of ``row[key]`` over every key in ``lst`` (0 if empty)."""
    return sum(row[key] for key in lst)

# Collapse the individual same_* columns into a single per-row total.
# A vectorised column sum replaces the row-by-row apply(sum_columns, axis=1);
# skipna=False keeps that version's behaviour of yielding NaN when any input
# is NaN.
# The analogous sender/receiver totals stay disabled, as they were before:
# X_new['sender_activity'] = X_new[has_sender_cols].sum(axis=1, skipna=False)
# X_new['receiver_activity'] = X_new[has_receiver_cols].sum(axis=1, skipna=False)
X_new['user_similarity'] = X_new[similarity_cols].sum(axis=1, skipna=False)

In [None]:
# Remove every has_* and same_* column in one drop.
# Note: re-running this cell raises KeyError because the columns are already gone.
redundant_cols = has_sender_cols + has_receiver_cols + similarity_cols
X_new = X_new.drop(redundant_cols, axis=1)

In [None]:
# Number of feature columns left after the drop.
X_new.shape[1]

In [None]:
%%time

# train test split
X_train, X_test, y_train, y_test = train_test_split(X_new.as_matrix(), y1.as_matrix(), random_state=17)

# resample
X_train, y_train = ms.oversample(X_train, y_train, .5)

# scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# fit model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

display_metrics(model, X_test, y_test)
print()
print(display_importances_linear(model, X_new).head(20))
print()

In [None]:
# Print the 20 largest-coefficient features of the reduced model as one run of
# HTML <div>s: red when the coefficient sign is negative, green otherwise.
temp = display_importances_linear(model, X_new).head(20)
for rank, (feature, _coefficient, _abs_value, sign) in enumerate(temp.values, start=1):
    color = 'red' if sign < 0 else 'green'
    print("<div style='color: {}'> {}. {} </div>".format(color, rank, feature), end='')

In [None]:
# Print the same 20 feature names as a quoted, comma-terminated run
# ('name1','name2',...) with no trailing newline.
temp = display_importances_linear(model, X_new).head(20)
quoted_names = ["'{}',".format(feature) for feature in temp['feature']]
print(''.join(quoted_names), end='')

# LINEAR REGRESSION

In [None]:
%%time

# train test split
X_train, X_test, y_train, y_test = train_test_split(X.as_matrix(), y2.as_matrix(), random_state=17)

# resample
X_train, y_train = ms.oversample(X_train, y_train, .5)

# scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# fit model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# `model` is the LinearRegression fitted above, so its predictions are
# continuous; display_metrics would hand them straight to recall/precision.
# display_metrics_continuous thresholds predictions and targets first.
display_metrics_continuous(model, X_test, y_test)

In [None]:
# Predicted vs. actual target for the test split; low alpha shows point density.
fig, ax = plt.subplots()
ax.scatter(y_pred, y_test, alpha=.1)
# Label the axes so the figure can be read on its own (predictions are on x).
ax.set_xlabel('predicted (y_pred)')
ax.set_ylabel('actual (y_test)')
plt.show()

# INVESTIGATE RESULTS

### first_message_day_of_year

    plot response rate over time
        x: date
        y: percent responses for that day
    Probably need to bin by week because the daily volume is low.

In [None]:
# Response rate per first-message day: sum `response` and `const2` per day,
# skip the first 50 grouped rows, then plot response / const2.
# NOTE(review): master_df is not defined in the visible part of this notebook;
# const2 is presumably a per-row count of 1 - confirm.
temp = (
    master_df[['first_message_day_of_year', 'response', 'const2']]
    .groupby(['first_message_day_of_year'])
    .sum()
    .reset_index()
    .iloc[50:]
)
temp['percent_response'] = temp['response'] / temp['const2']
temp['percent_response'].plot()
plt.show()

In [None]:
# Re-bin the daily table from the previous cell into 7-day buckets and plot the
# response rate per bucket. This overwrites `temp`, so the previous cell must be
# re-run before running this one again.
bins = 7
temp['_bin'] = [int(day / bins) for day in temp['first_message_day_of_year']]
temp = temp.groupby('_bin').sum()
# Recompute the rate from the summed counts; the summed daily rates are discarded.
temp['percent_response'] = temp['response'] / temp['const2']
temp['percent_response'].plot()
plt.show()

### created_day_of_year_receiver

In [None]:
# Response rate per receiver account-creation day: sum `response` and `const2`
# per day, skip the first 50 grouped rows, then plot response / const2.
# NOTE(review): master_df is not defined in the visible part of this notebook.
temp = (
    master_df[['created_day_of_year_receiver', 'response', 'const2']]
    .groupby(['created_day_of_year_receiver'])
    .sum()
    .reset_index()
    .iloc[50:]
)
temp['percent_response'] = temp['response'] / temp['const2']
temp['percent_response'].plot()
plt.show()

### urgency_receiver

In [None]:
# Re-bin the created_day_of_year_receiver table from the previous cell into
# 7-day buckets and plot the response rate per bucket. This overwrites `temp`,
# so the previous cell must be re-run before running this one again.
bins = 7
temp['_bin'] = [int(day / bins) for day in temp['created_day_of_year_receiver']]
temp = temp.groupby('_bin').sum()
# Recompute the rate from the summed counts; the summed daily rates are discarded.
temp['percent_response'] = temp['response'] / temp['const2']
temp['percent_response'].plot()
plt.show()

### td_creat_avail_receiver

In [None]:
# Response rate grouped by the column named in `col`.
# NOTE(review): this cell used to start with an unfinished `col = ` (a
# SyntaxError) followed by a copy of the created_day_of_year_receiver analysis.
# The column name below is taken from this section's heading - confirm that
# master_df has it, and that skipping the first 50 grouped rows still makes
# sense for this column.
# NOTE(review): master_df is not defined in the visible part of this notebook.
col = 'td_creat_avail_receiver'
temp = master_df[[col, 'response', 'const2']]
temp = temp.groupby([col]).sum().reset_index()[50:]
temp['percent_response'] = temp.response/temp.const2
temp.percent_response.plot()
plt.show()