In [115]:
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [116]:
## read_data

user_df = pd.read_csv('takehome_users.csv', encoding = "ISO-8859-1")
eng_df = pd.read_csv('takehome_user_engagement.csv', parse_dates = ["time_stamp"])

In [117]:
user_df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [118]:
eng_df.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [119]:
user_df = user_df.rename(columns = {'object_id' : 'user_id'})

In [120]:
eng_df = eng_df.set_index('time_stamp')

In [121]:
eng_df.user_id.value_counts()

3623     606
906      600
1811     593
7590     590
8068     585
        ... 
6763       1
3773       1
5822       1
10040      1
2047       1
Name: user_id, Length: 8823, dtype: int64

In [122]:
users = eng_df.user_id.unique()
adopted = []

for i in users:
    tmp_df = eng_df[eng_df.user_id == i]
    tmp_df = tmp_df.resample("1D").count()
    tmp_df = tmp_df.rolling(window = 7).sum()
    tmp_df = tmp_df.dropna()
    adopted.append(any(tmp_df['visited'].values >= 3))

In [123]:
## adding adopted feature into df
users_adopted = list(zip(users, adopted))

adopt_df = pd.DataFrame(users_adopted)
adopt_df.columns = ['user_id', 'adopted']

# merging to user_df
final_df = user_df.merge(adopt_df, on = 'user_id', how = 'left')

## making boolean feature to numeric
final_df['adopted'] = final_df['adopted'].replace([True, False, np.nan], [0, 1, 0])
final_df.dropna(subset = ['adopted'], inplace = True)

In [124]:
final_df['adopted'] = final_df.adopted.astype(int)

In [125]:
# transforming invited_by_user_id
trfm_func = lambda row : 0 if np.isnan(row) else 1
final_df['invited_by_user'] = final_df['invited_by_user_id'].apply(trfm_func)

In [126]:
final_df.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,invited_by_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1,1
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,1,1
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,1,1
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,1,1


In [127]:
## selecting only needed features
final_df = final_df[["adopted", 'invited_by_user', 'creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip']]

In [128]:
final_df.head()

Unnamed: 0,adopted,invited_by_user,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip
0,1,1,GUEST_INVITE,1,0
1,0,1,ORG_INVITE,0,0
2,1,1,ORG_INVITE,0,0
3,1,1,GUEST_INVITE,0,0
4,1,1,GUEST_INVITE,0,0


In [129]:
### import scikit-learn libraries

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## building the pipeline
X = final_df[final_df.columns[1:]]
y = final_df[final_df.columns[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.6, random_state = 42)

pipeline = Pipeline(steps = [("encoder", OneHotEncoder()), ('rf', RandomForestClassifier(random_state = 42))])

params = {"rf__max_depth": [5, 10, 20],
          "rf__n_estimators": [50, 100, 200]}

cv = GridSearchCV(pipeline, param_grid = params, cv = 3)
cv.fit(X_train, y_train)

print("Top paramteters for our model are {}".format(cv.best_params_))
print("Training accuracy for the best model is {}".format(cv.best_score_ * 100))

Top paramteters for our model are {'rf__max_depth': 5, 'rf__n_estimators': 50}
Training accuracy for the best model is 68.22916666666666


In [130]:
## test set score
y_pred = cv.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)
print("Model accuracy is {}".format(test_acc * 100))

Model accuracy is 67.33333333333333


In [131]:
## finding feature importances

X_tmp = pd.get_dummies(X_test)
pipeline.fit(X_tmp, y_test)

feats = pipeline.named_steps["rf"].feature_importances_

feat_imps = dict(zip(X_tmp.columns, feats))
feat_imps = sorted(feat_imps.items(), key = lambda x : x[1], reverse = True)
feat_imps

[('opted_in_to_mailing_list', 0.05072025756777686),
 ('creation_source_SIGNUP', 0.007185561914506304),
 ('creation_source_ORG_INVITE', 0.005030922756315115),
 ('creation_source_PERSONAL_PROJECTS', 0.004329794288287732),
 ('enabled_for_marketing_drip', 0.003986245124958358),
 ('creation_source_GUEST_INVITE', 0.0032358688635802852),
 ('creation_source_SIGNUP_GOOGLE_AUTH', 0.0014538727506660232),
 ('invited_by_user', 0.0)]

Based on the feature importances we have selected the following features for building the model.


1.invited_by_user - if a user was referred by another user (custom feature) <br>
2.creation_source - how the account was created (stock feature) <br>
3.opted_in_to_mailing_list - whether user has opted into receiving marketing emails (stock feature) <br>
4.enabled_for_marketing_drip - whether they are on the regular marketing email drip (stock feature) <br>
