### Imports

In [104]:
import my_pickle as mp
import my_resample as ms
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score


from sklearn.model_selection import GridSearchCV

### Read in data

In [43]:
# read in
convo_df = mp.unjson_it('data_master')
user_df = mp.unjson_it('data_user')

# get uid as a column
user_df = user_df.reset_index().rename(index=str, columns={"index": "uid"})

In [119]:
user_df.columns

Index(['uid', 'I_count', 'I_ratio', 'about', 'activeAt', 'age', 'amenities',
       'available', 'birthday', 'college', 'created', 'exclaim_count',
       'exclaim_ratio', 'facebookId', 'gender', 'has_about', 'has_facebookId',
       'has_linkedinId', 'has_picture', 'has_room', 'hobbies', 'hometownCity',
       'hometownCountry', 'hometownState', 'inRelationship', 'isClean',
       'isNight', 'isStudent', 'len_about', 'linkedinId', 'location',
       'maxCost', 'metro', 'minCost', 'neighborhoods', 'numRoommates',
       'onboarded', 'period_count', 'period_ratio', 'petsOk', 'picture',
       'question_count', 'question_ratio', 'sentence_count', 'sentence_ratio',
       'smokingOk', 'term', 'timeframe', 'type', 'updated', 'work', 'interact',
       'sender', 'responder', 'messager'],
      dtype='object')

# The Problem

In [44]:
print("Only {:.3}% of messages get a response.".format(100*convo_df.response.sum()/len(convo_df)))

Only 10.7% of messages get a response.


# Goal 1: filter out users who probably aren't active

## Step 1: identify active users

In [114]:
# create an array with anyone who has sent a message
# assume conversations between just two users
senders = set(convo_df.uid_sender.unique())
responders = set(convo_df[convo_df.response == True].uid_receiver.unique())
messagers = senders.union(responders)

# create column on user_df for active/not active
user_df['sender'] = user_df.uid.apply(lambda x: x in senders)
user_df['responder'] = user_df.uid.apply(lambda x: x in responders)
user_df['messager'] = user_df.uid.apply(lambda x: x in messagers)

print("{:.3}% of users start a conversation".format(100*len(senders)/len(user_df)))
print("{:.3}% of users respond to a message".format(100*len(responders)/len(user_df)))
print("        (but ~10% of messages get a response.)")
print("{:.3}% of users send a message".format(100*len(messagers)/len(user_df)))

9.55% of users start a conversation
2.76% of users respond to a message
        (but ~10% of messages get a response.)
10.9% of users send a message


## Step 2: clean up user data to feed into model

In [168]:
user_df.amenities = user_df.amenities.apply(lambda x: len(x) if isinstance(x,list) else 0)
user_df.hobbies = user_df.hobbies.apply(lambda x: len(x) if isinstance(x,list) else 0)
user_df.neighborhoods = user_df.neighborhoods.apply(lambda x: len(x) if isinstance(x,list) else 0)
user_df.location = user_df.location.apply(lambda x: 1 if isinstance(x,list) else 0)

user_df.college = user_df.college.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.hometownCity = user_df.hometownCity.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.hometownCountry = user_df.hometownCountry.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.hometownState = user_df.hometownState.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.work = user_df.work.apply(lambda x: 1 if isinstance(x,str) else 0)

col_to_keep = ['I_count','age', 'amenities','college','exclaim_count','gender','has_about', 'has_facebookId',
       'has_linkedinId', 'has_picture', 'has_room','hobbies','hometownCity','hometownState','hometownCountry',
               'inRelationship', 'isClean',
       'isNight', 'isStudent', 'len_about','location','maxCost','minCost','neighborhoods','numRoommates',
       'period_count','petsOk','question_count','smokingOk', 'term','work'] 
user_df['const']=1
user_df['interact'] = user_df['interact'].apply(lambda x: 1 if x else 0)
X = user_df[col_to_keep].fillna(X.mean(axis=0))
y_wconst = user_df[['interact','const']]
y = user_df['interact']

In [169]:
# save X and y for gridsearch
mp.json_it(X, 'data_X_013018')
mp.json_it(y_wconst, 'data_y_013018')

## Step 3: Run models

Goal is to maximize negative predictive value (NPV)

${\displaystyle \mathrm {NPV} ={\frac {\mathrm {TN} }{\mathrm {TN} +\mathrm {FN} }}}$

remove users unlikely to be active while keeping users who may be active

In [132]:
def get_NPV(confusion_matrix):
    TN = confusion_matrix[0][0]
    FN = confusion_matrix[1][0]
    return TN/(TN+FN)

get_NPV(confusion_matrix(y_test, y_pred))

0.90468986384266259

In [96]:
def display_importances_trees():
    # show feature importances
    feature_df = pd.DataFrame([X.columns, model.feature_importances_]).T
    feature_df.columns = ['feature','value']
    return feature_df.sort_values('value', ascending=False)
             
def display_metrics():
    print("\nMETRICS")
    print("Model recall: {}".format(recall_score(y_test, y_pred)))
    print("Model precision: {}".format(precision_score(y_test, y_pred)))
    print("Model accuracy: {}".format(model.score(X_test, y_test)))

    print ("\nCONFUSION MATRIX")
    print (confusion_matrix(y_test, y_pred))
    print ("\nkey:")
    print (" TN   FP ")
    print (" FN   TP ")

    print("\nRECALL AND ACCURACY FOR DIFFERNET MODELS")
    print("recall     \t precision   \tmodel")
    print(recall_score(y_test, y_pred), '\t',precision_score(y_test, y_pred), "my model")
    print(recall_score(y_test, pred_all_0),'\t','\t', precision_score(y_test, pred_all_0), "\t\tpredict all zero")
    print(recall_score(y_test, pred_all_1),'\t','\t', precision_score(y_test, pred_all_1), "predict all one")
    print(recall_score(y_test, pred_50_50),'\t', precision_score(y_test, pred_50_50), "predict 50-50")
    print(recall_score(y_test, pred_90_10), precision_score(y_test, pred_90_10), "predict 90-10")

In [222]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X.as_matrix(), y.as_matrix(), random_state=17)

# resample
X_train, y_train = ms.oversample(X_train, y_train, .25)

# scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# make fake data
pred_all_0 = [0]*len(y_test)
pred_all_1 = [1]*len(y_test)
pred_50_50 = np.random.choice([0,1], size=len(y_test))
pred_90_10 = np.random.choice([0,1], size=len(y_test), p=[.9,.1])

model = GradientBoostingClassifier(learning_rate= 0.4, 
                                   max_depth= 10, 
                                   min_samples_leaf= 2, 
                                   min_samples_split= 3, 
                                   n_estimators= 100, 
                                   subsample= 1)
# model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(get_NPV(confusion_matrix(y_test, y_pred)))
# show metrics
display_metrics()

# # show importances
display_importances_trees().head(5)



0.910308894678

METRICS
Model recall: 0.1392857142857143
Model precision: 0.582089552238806
Model accuracy: 0.9023238925199709

CONFUSION MATRIX
[[2446   28]
 [ 241   39]]

key:
 TN   FP 
 FN   TP 

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.139285714286 	 0.582089552239 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.101670297749 predict all one
0.471428571429 	 0.0927617709065 predict 50-50
0.132142857143 0.138059701493 predict 90-10


  'precision', 'predicted', average, warn_for)


Unnamed: 0,feature,value
5,distance,0.0909967
28,len_about_receiver,0.0438527
3,age_receiver,0.0382462
2,age_dif,0.0297432
30,maxCost_receiver,0.0288005


# Filter Users

In [174]:
user_df['prediction'] = model.predict(X)

In [192]:
X = mp.unjson_it('data_X')
y = mp.unjson_it('data_y')['response']
text_similarity_df = mp.unjson_it('data_text_similarity')
X['count_similarity'] = text_similarity_df['count_similarity']
X['tfidf_similarity'] = text_similarity_df['tfidf_similarity']

In [202]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X.as_matrix(), y.as_matrix(), random_state=17)

# resample
X_train, y_train = ms.oversample(X_train, y_train, .5)

# scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# make fake data
pred_all_0 = [0]*len(y_test)
pred_all_1 = [1]*len(y_test)
pred_50_50 = np.random.choice([0,1], size=len(y_test))
pred_90_10 = np.random.choice([0,1], size=len(y_test), p=[.9,.1])

model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# show metrics
display_metrics()

# show importances
display_importances_trees().head(5)


METRICS
Model recall: 0.5857142857142857
Model precision: 0.20838627700127066
Model accuracy: 0.7316630355846042

CONFUSION MATRIX
[[1851  623]
 [ 116  164]]

key:
 TN   FP 
 FN   TP 

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.585714285714 	 0.208386277001 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.101670297749 predict all one
0.521428571429 	 0.102600140548 predict 50-50
0.0964285714286 0.0992647058824 predict 90-10


  'precision', 'predicted', average, warn_for)


Unnamed: 0,feature,value
28,len_about_receiver,0.118654
5,distance,0.0913954
69,urgency_receiver,0.0911112
3,age_receiver,0.071858
70,urgency_sender,0.0712608


In [200]:
predicted_active_users = set(user_df[user_df.prediction == 1].uid)

In [210]:
X['predicted_active_receiver'] = convo_df.uid_receiver.apply(lambda x: x in predicted_active_users)

In [214]:
y_filtered = y[X.predicted_active_receiver == 1]
X_filtered = X[X.predicted_active_receiver == 1]

In [221]:
print("Messages sent to pre-screened users receive responses {}% of the time".format(100*y_filtered.sum()/len(y_filtered)))

Messages sent to pre-screened users receive responses 15.294117647058824% of the time


In [216]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X_filtered.as_matrix(), y_filtered.as_matrix(), random_state=17)

# resample
X_train, y_train = ms.oversample(X_train, y_train, .5)

# scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# make fake data
pred_all_0 = [0]*len(y_test)
pred_all_1 = [1]*len(y_test)
pred_50_50 = np.random.choice([0,1], size=len(y_test))
pred_90_10 = np.random.choice([0,1], size=len(y_test), p=[.9,.1])

model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# show metrics
display_metrics()

# show importances
display_importances_trees().head(5)




METRICS
Model recall: 0.5068493150684932
Model precision: 0.22023809523809523
Model accuracy: 0.6918819188191881

CONFUSION MATRIX
[[676 262]
 [ 72  74]]

key:
 TN   FP 
 FN   TP 

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.506849315068 	 0.220238095238 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.134686346863 predict all one
0.5 	 0.133211678832 predict 50-50
0.0479452054795 0.0729166666667 predict 90-10


  'precision', 'predicted', average, warn_for)


Unnamed: 0,feature,value
3,age_receiver,0.112021
28,len_about_receiver,0.0892394
69,urgency_receiver,0.0677629
5,distance,0.0505383
33,minCost_sender,0.0434035


In [220]:
display_importances_trees().reset_index(drop=True)

Unnamed: 0,feature,value
0,age_receiver,0.112021
1,len_about_receiver,0.0892394
2,urgency_receiver,0.0677629
3,distance,0.0505383
4,minCost_sender,0.0434035
5,maxCost_sender,0.0417146
6,urgency_sender,0.0390941
7,numRoommates_sender,0.0311809
8,minCost_receiver,0.0295081
9,age_dif,0.0271157


### Q1: Are users influenced by this low rate?
- compare users who get a response quickly to those who don't?