### Imports

In [104]:
import my_pickle as mp
import my_resample as ms
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score


from sklearn.model_selection import GridSearchCV

### Read in data

In [43]:
# read in
convo_df = mp.unjson_it('data_master')
user_df = mp.unjson_it('data_user')

# get uid as a column
user_df = user_df.reset_index().rename(index=str, columns={"index": "uid"})

In [119]:
user_df.columns

Index(['uid', 'I_count', 'I_ratio', 'about', 'activeAt', 'age', 'amenities',
       'available', 'birthday', 'college', 'created', 'exclaim_count',
       'exclaim_ratio', 'facebookId', 'gender', 'has_about', 'has_facebookId',
       'has_linkedinId', 'has_picture', 'has_room', 'hobbies', 'hometownCity',
       'hometownCountry', 'hometownState', 'inRelationship', 'isClean',
       'isNight', 'isStudent', 'len_about', 'linkedinId', 'location',
       'maxCost', 'metro', 'minCost', 'neighborhoods', 'numRoommates',
       'onboarded', 'period_count', 'period_ratio', 'petsOk', 'picture',
       'question_count', 'question_ratio', 'sentence_count', 'sentence_ratio',
       'smokingOk', 'term', 'timeframe', 'type', 'updated', 'work', 'interact',
       'sender', 'responder', 'messager'],
      dtype='object')

# The Problem

In [44]:
print("Only {:.3}% of messages get a response.".format(100*convo_df.response.sum()/len(convo_df)))

Only 10.7% of messages get a response.


# Goal 1: filter out users who probably aren't active

## Step 1: identify active users

In [114]:
# create an array with anyone who has sent a message
# assume conversations between just two users
senders = set(convo_df.uid_sender.unique())
responders = set(convo_df[convo_df.response == True].uid_receiver.unique())
messagers = senders.union(responders)

# create column on user_df for active/not active
user_df['sender'] = user_df.uid.apply(lambda x: x in senders)
user_df['responder'] = user_df.uid.apply(lambda x: x in responders)
user_df['messager'] = user_df.uid.apply(lambda x: x in messagers)

print("{:.3}% of users start a conversation".format(100*len(senders)/len(user_df)))
print("{:.3}% of users respond to a message".format(100*len(responders)/len(user_df)))
print("        (but ~10% of messages get a response.)")
print("{:.3}% of users send a message".format(100*len(messagers)/len(user_df)))

9.55% of users start a conversation
2.76% of users respond to a message
        (but ~10% of messages get a response.)
10.9% of users send a message


## Step 2: clean up user data to feed into model

In [138]:
user_df.amenities = user_df.amenities.apply(lambda x: len(x) if isinstance(x,list) else 0)
user_df.hobbies = user_df.hobbies.apply(lambda x: len(x) if isinstance(x,list) else 0)
user_df.neighborhoods = user_df.neighborhoods.apply(lambda x: len(x) if isinstance(x,list) else 0)
user_df.location = user_df.location.apply(lambda x: 1 if isinstance(x,list) else 0)

user_df.college = user_df.college.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.hometownCity = user_df.hometownCity.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.hometownCountry = user_df.hometownCountry.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.hometownState = user_df.hometownState.apply(lambda x: 1 if isinstance(x,str) else 0)
user_df.work = user_df.work.apply(lambda x: 1 if isinstance(x,str) else 0)

# TF = {True:1, False:0}
# TF_col = ['facebookId','has_about','has_room','linkedinId','picture','has_about']
# for col in TF_col:
#     X[col]= X[col].map(TF)
# to_drop = ['type','uid','about','metro','picture','facebookId','linkedinId']
# X = X.drop(to_drop, axis=1)

In [139]:
col_to_keep = ['I_count','age', 'amenities','college','exclaim_count','gender','has_about', 'has_facebookId',
       'has_linkedinId', 'has_picture', 'has_room','hobbies','hometownCity','hometownState','hometownCountry',
               'inRelationship', 'isClean',
       'isNight', 'isStudent', 'len_about','location','maxCost','minCost','neighborhoods','numRoommates',
       'period_count','petsOk','question_count','smokingOk', 'term','work'] 

X = user_df[col_to_keep].fillna(X.mean(axis=0))
y = user_df['interact']

# still missing features! (ammenities, etc)

In [140]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X.as_matrix(), y.as_matrix(), random_state=17)

# resample
X_train, y_train = ms.oversample(X_train, y_train, .5)

# scale data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# make fake data
pred_all_0 = [0]*len(y_test)
pred_all_1 = [1]*len(y_test)
pred_50_50 = np.random.choice([0,1], size=len(y_test))
pred_90_10 = np.random.choice([0,1], size=len(y_test), p=[.9,.1])

## Step 3: Run models

Goal is to maximize negative predictive value (NPV)

${\displaystyle \mathrm {NPV} ={\frac {\mathrm {TN} }{\mathrm {TN} +\mathrm {FN} }}}$

remove users unlikely to be active while keeping users who may be active

In [132]:
def get_NPV(confusion_matrix):
    TN = confusion_matrix[0][0]
    FN = confusion_matrix[1][0]
    return TN/(TN+FN)

get_NPV(confusion_matrix(y_test, y_pred))

0.90468986384266259

In [96]:
def display_importances_trees():
    # show feature importances
    feature_df = pd.DataFrame([X.columns, model.feature_importances_]).T
    feature_df.columns = ['feature','value']
    return feature_df.sort_values('value', ascending=False)
             
def display_metrics():
    print("\nMETRICS")
    print("Model recall: {}".format(recall_score(y_test, y_pred)))
    print("Model precision: {}".format(precision_score(y_test, y_pred)))
    print("Model accuracy: {}".format(model.score(X_test, y_test)))

    print ("\nCONFUSION MATRIX")
    print (confusion_matrix(y_test, y_pred))
    print ("\nkey:")
    print (" TN   FP ")
    print (" FN   TP ")

    print("\nRECALL AND ACCURACY FOR DIFFERNET MODELS")
    print("recall     \t precision   \tmodel")
    print(recall_score(y_test, y_pred), '\t',precision_score(y_test, y_pred), "my model")
    print(recall_score(y_test, pred_all_0),'\t','\t', precision_score(y_test, pred_all_0), "\t\tpredict all zero")
    print(recall_score(y_test, pred_all_1),'\t','\t', precision_score(y_test, pred_all_1), "predict all one")
    print(recall_score(y_test, pred_50_50),'\t', precision_score(y_test, pred_50_50), "predict 50-50")
    print(recall_score(y_test, pred_90_10), precision_score(y_test, pred_90_10), "predict 90-10")

In [143]:
model = GradientBoostingClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(get_NPV(confusion_matrix(y_test, y_pred)))
# show metrics
display_metrics()

# # show importances
# display_importances_trees().head(5)

0.954610951009

METRICS
Model recall: 0.7208271787296898
Model precision: 0.21488331131660063
Model accuracy: 0.6935508935508935

CONFUSION MATRIX
[[3975 1783]
 [ 189  488]]

key:
 TN   FP 
 FN   TP 

RECALL AND ACCURACY FOR DIFFERNET MODELS
recall     	 precision   	model
0.72082717873 	 0.214883311317 my model
0.0 	 	 0.0 		predict all zero
1.0 	 	 0.105205905206 predict all one
0.490398818316 	 0.104010025063 predict 50-50
0.0871491875923 0.0902140672783 predict 90-10


  'precision', 'predicted', average, warn_for)


### Q1: Are users influenced by this low rate?
- compare users who get a response quickly to those who don't?