# IMPORTS

In [22]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# %matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import precision_score, recall_score
# from sklearn.metrics import confusion_matrix
from my_pickle import unpickle_it
from my_resample import div_count_pos_neg, undersample, oversample
from my_features import feature_time

# UNPICKLE

In [2]:
user_df = unpickle_it('user_df')
convo_df = unpickle_it('convo_df')

print("Make sure we're not useing ECT data")
print(pd.to_datetime(convo_df.timestamp.max()*1000000))

Make sure we're not useing ECT data
2017-10-02 01:35:58.644000


# GET X AND y

In [3]:
X = feature_time(convo_df, user_df)
y = X.response.map({True:1,False:0})
X = X.drop(['conv_id', 'response', 'first_uid', 'second_uid', 'first_mid',
       'second_mid', 'timestamp'], axis=1)
print(len(X))
print(len(y))#.sum()

Num bad rows: 8
11396
11396


In [4]:
X.head()

Unnamed: 0,age_dif,same_gender,same_relate,same_clean,same_night,same_student
16296,11,True,False,False,True,False
9241,11,False,True,False,True,False
1069,2,True,True,True,True,False
9414,1,True,True,True,False,False
4142,1,False,True,False,False,False


# BASELINE RANDOM FOREST

In [5]:
print("Messages with responses:    {}".format((convo_df.response == True).sum()))
print("Messages without responses: {}".format((convo_df.response == False).sum()))
print("Total messages:             {}".format(len(convo_df)))

Messages with responses:    1234
Messages without responses: 10170
Total messages:             11404


In [15]:
X_unsamp, y_unsamp = X.as_matrix(), y.as_matrix()

X_train, X_test, y_train, y_test = train_test_split(X_unsamp, y_unsamp, random_state=17)

print("messages sent: {}".format(len(y_train)))
print("responses:     {}\n".format(y_train.sum()))

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Prediction:   {}...".format(model.predict(X_test)[:20]))
print("Actual:       {}...".format(y_test[:20]))
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Score of model: {}".format(model.score(X_test, y_test)))
print("Score to beat: {}".format(1-y_test.sum()/len(y_test)))

messages sent: 8547
responses:     902

Prediction:   [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]...
Actual:       [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]...
Model recall: 0.006042296072507553
Model precision: 0.15384615384615385
Score of model: 0.8806598806598807
Score to beat: 0.8838188838188838


# UNDERSAMPLE

In [12]:
X_und, y_und = undersample(X, y, .5)

X_train, X_test, y_train, y_test = train_test_split(X_und, y_und, random_state=17)

print("messages sent: {}".format(len(y_train)))
print("responses:     {}\n".format(y_train.sum()))

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Prediction:   {}...".format(model.predict(X_test)[:20]))
print("Actual:       {}...".format(y_test[:20]))
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Score of model: {}".format(model.score(X_test, y_test)))
print("Score to beat: {}".format(1-y_test.sum()/len(y_test)))

messages sent: 1850
responses:     937

Prediction:   [1 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0]...
Actual:       [0 0 1 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 1 1]...
Model recall: 0.5
Model precision: 0.5
Score of model: 0.520259319286872
Score to beat: 0.520259319286872


# OVERSAMPLE

In [13]:
X_ove, y_ove = oversample(X.as_matrix(), y.as_matrix(), .5)

X_train, X_test, y_train, y_test = train_test_split(X_ove, y_ove, random_state=17)

print("messages sent: {}".format(len(y_train)))
print("responses:     {}\n".format(y_train.sum()))

model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Prediction:   {}...".format(model.predict(X_test)[:20]))
print("Actual:       {}...".format(y_test[:20]))
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Score of model: {}".format(model.score(X_test, y_test)))
print("Score to beat: {}".format(1-y_test.sum()/len(y_test)))

messages sent: 15244
responses:     7645

Prediction:   [1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1]...
Actual:       [0 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1]...
Model recall: 0.7283558379666402
Model precision: 0.6080901856763926
Score of model: 0.6328217237308147
Score to beat: 0.50452577725305


# OTHER MODELS KEPT HERE

In [27]:
model = LogisticRegression()
model.fit(X_train, y_train)
print("Prediction:   {}".format(model.predict(X_test)))
print("Actual:       {}".format(y_test))
print("Score of model: {}".format(model.score(X_test, y_test)))
print("Score to beat: {}".format(1-y_test.sum()/len(y_test)))

Prediction:   [0 0 0 ..., 0 0 0]
Actual:       [0 0 0 ..., 1 0 0]
Score of model: 0.8838188838188838
Score to beat: 0.8838188838188838


In [20]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Prediction:   {}".format(model.predict(X_test)))
print("Actual:       {}".format(y_test))
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Score of model: {}".format(model.score(X_test, y_test)))
print("Score to beat: {}".format(1-y_test.sum()/len(y_test)))

Prediction:   [0 0 0 ..., 0 0 0]
Actual:       [0 0 0 ..., 1 0 0]
Model recall: 0.006042296072507553
Model precision: 0.18181818181818182
Score of model: 0.8813618813618813
Score to beat: 0.8838188838188838


In [25]:
model = GradientBoostingClassifier()

model.fit(X_train, y_train)
print("Prediction:   {}".format(model.predict(X_test)))
print("Actual:       {}".format(y_test))
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Score of model: {}".format(model.score(X_test, y_test)))
print("Score to beat: {}".format(1-y_test.sum()/len(y_test)))

Prediction:   [0 0 0 ..., 0 0 0]
Actual:       [0 0 0 ..., 1 0 0]
Model recall: 0.006042296072507553
Model precision: 0.18181818181818182
Score of model: 0.8838188838188838
Score to beat: 0.8838188838188838
