In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing

Goals:
Define an adopted user as someone who has logged in on 3 days of a 7 days period.
Identify which factors best predict user adoption.
Write a 1 page summary on results, along with any graphs or code to help explain my approach.

In [2]:
#check columns of available data
users_df = pd.read_csv('takehome_users.csv', header=0, index_col=0, encoding='latin-1')
engagement_df = pd.read_csv('takehome_user_engagement.csv', header=0, encoding='latin-1')

print(users_df.head())
print(engagement_df.head())

                 creation_time               name                       email  \
object_id                                                                       
1          2014-04-22 03:53:30     Clausen August    AugustCClausen@yahoo.com   
2          2013-11-15 03:45:04      Poole Matthew      MatthewPoole@gustr.com   
3          2013-03-19 23:14:52  Bottrill Mitchell  MitchellBottrill@gustr.com   
4          2013-05-21 08:09:28    Clausen Nicklas   NicklasSClausen@yahoo.com   
5          2013-01-17 10:14:20          Raw Grace          GraceRaw@yahoo.com   

          creation_source  last_session_creation_time  \
object_id                                               
1            GUEST_INVITE                1.398139e+09   
2              ORG_INVITE                1.396238e+09   
3              ORG_INVITE                1.363735e+09   
4            GUEST_INVITE                1.369210e+09   
5            GUEST_INVITE                1.358850e+09   

           opted_in_to_mailing_l

In [3]:
#prep engagement table for merge
users_df['creation_time'] = pd.to_datetime(users_df['creation_time'])
engagement_df['time_stamp'] = pd.to_datetime(engagement_df['time_stamp'])

users_df = users_df.sort_values(by='creation_time', ascending=True)
engagement_df = engagement_df.sort_values(by='time_stamp', ascending=True)

engagement_df_agg = engagement_df.groupby(by='user_id').min()
engagement_df_agg['visited'] = engagement_df.groupby(by='user_id').sum()['visited']
engagement_df_agg.rename(columns={'time_stamp':'creation_time'}, inplace=True)
engagement_df_agg.reset_index(inplace=True)
print(engagement_df_agg.head(10))

   user_id       creation_time  visited
0        1 2014-04-22 03:53:30        1
1        2 2013-11-15 03:45:04       14
2        3 2013-03-19 23:14:52        1
3        4 2013-05-22 08:09:28        1
4        5 2013-01-22 10:14:20        1
5        6 2013-12-19 03:37:06        1
6        7 2012-12-20 13:24:32        1
7       10 2013-01-16 22:08:03      284
8       11 2013-12-27 03:55:54        1
9       13 2014-03-30 16:19:38        1


In [4]:
#need to figure out who is considered to be an adopted user (3 visits within one 7-day period)
engagement_df_agg['adopted'] = False
potentially_adopted = engagement_df_agg['user_id'][engagement_df_agg['visited']>=3]

week = datetime.timedelta(weeks=1)

for user_id in potentially_adopted:
    dates = engagement_df['time_stamp'][engagement_df['user_id']==user_id]
    
    for i in dates[:-2].index:
        if(dates[i+2]-dates[i] >= week):
            engagement_df_agg['adopted'][engagement_df_agg['user_id']==user_id] = True
            break

print(engagement_df_agg[engagement_df_agg['adopted']==True].head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  engagement_df_agg['adopted'][engagement_df_agg['user_id']==user_id] = True


    user_id       creation_time  visited  adopted
1         2 2013-11-15 03:45:04       14     True
7        10 2013-01-16 22:08:03      284     True
13       20 2014-03-11 11:46:38        7     True
24       33 2014-03-11 06:29:09       18     True
28       42 2012-11-13 19:05:07      342     True


In [12]:
#merge tables
joined_df = users_df.merge(engagement_df_agg, on='creation_time')
print(joined_df.head())

        creation_time              name                       email  \
0 2012-05-31 08:20:06    Spikes Danille  DanilleJSpikes@hotmail.com   
1 2012-05-31 15:47:36     Spears Arthur     ArthurJSpears@gmail.com   
2 2012-05-31 17:19:37   Jørgensen Sofie    SofieNJrgensen@yahoo.com   
3 2012-05-31 21:58:33   Faulkner Hayden    HaydenFaulkner@gmail.com   
4 2012-06-01 00:17:30  Morrison Natasha   NatashaMorrison@gustr.com   

      creation_source  last_session_creation_time  opted_in_to_mailing_list  \
0          ORG_INVITE                1.338452e+09                         0   
1  SIGNUP_GOOGLE_AUTH                1.352822e+09                         1   
2          ORG_INVITE                1.338485e+09                         0   
3  SIGNUP_GOOGLE_AUTH                1.399932e+09                         0   
4          ORG_INVITE                1.339719e+09                         0   

   enabled_for_marketing_drip  org_id  invited_by_user_id  user_id  visited  \
0                  

In [13]:
#prep for modeling
joined_df = pd.get_dummies(joined_df, columns=['creation_source'])
y = joined_df['adopted']
X = joined_df
X['year'] = X['creation_time'].apply(lambda x: x.year)
X['month'] = X['creation_time'].apply(lambda x: x.month)
X['day'] = X['creation_time'].apply(lambda x: x.day)
X['weekday'] = X['creation_time'].apply(lambda x: x.weekday())
X['hour'] = X['creation_time'].apply(lambda x: x.hour)
X['minute'] = X['creation_time'].apply(lambda x: x.minute)
X['second'] = X['creation_time'].apply(lambda x: x.second)
X['invited_by_user_id'] = X['invited_by_user_id'].notna() #just want to know if they were invited by someone else or not

#X['last_session_creation_time'] = X['last_session_creation_time'].notna()
X['last_session_creation_time'] = X['last_session_creation_time'].fillna(1)
#X['last_session_creation_time'] = X['last_session_creation_time'].apply(lambda x: np.log(x))

X = X.drop(columns=['adopted', 'name', 'email', 'creation_time', 'user_id', 'visited', 'org_id'])

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
scaler = preprocessing.StandardScaler().fit_transform(X_train)

In [15]:
#train and test model
model = LogisticRegression(penalty='l2', max_iter=1000, random_state=1020)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(accuracy_score(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))

for i in range(len(X.columns)):
    print(X.columns[i], model.coef_[0][i])
    
print(min(model.coef_[0]), max(model.coef_[0]))

0.7435661764705882
[[809 279]
 [  0   0]]
last_session_creation_time -0.008169888862001753
opted_in_to_mailing_list 0.04762551734077864
enabled_for_marketing_drip 0.051438916878316365
invited_by_user_id 0.1555957347047609
creation_source_GUEST_INVITE 0.2746192466636301
creation_source_ORG_INVITE -0.11902351195896527
creation_source_PERSONAL_PROJECTS 0.11671468369528643
creation_source_SIGNUP -0.07889222051892769
creation_source_SIGNUP_GOOGLE_AUTH -0.19301694523915897
year -0.0005126788579250498
month 0.020231763602281315
day -0.006630439744021794
weekday 0.005656719178833185
hour -0.0028678618676720538
minute -0.0031733417787054904
second 0.0011763570113700159
-0.19301694523915897 0.2746192466636301
