In [21]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

In [44]:
user = pd.read_csv('../takehome_users.csv', encoding='latin-1')
user_engagement = pd.read_csv("../takehome_user_engagement.csv", encoding='latin-1')

In [7]:
user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [8]:
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [9]:
user_engagement.shape

(207917, 3)

Note that 'adopted user' is who has logged into the product on three separate days in at least one seven-day period. We first change the time_stamp column of user_engagement table to "%Y-%m-%d" format. 

In [45]:
user_engagement['time_stamp'] = user_engagement['time_stamp'].apply(lambda x: dt.strptime(x, "%Y-%m-%d %H:%M:%S").date())


In [52]:
user_engagement

Unnamed: 0,time_stamp,user_id,visited
0,2012-05-31,1693,1
1,2012-05-31,3428,1
2,2012-05-31,9899,1
3,2012-05-31,10012,1
4,2012-06-01,1995,1
...,...,...,...
207912,2014-06-04,11885,1
207913,2014-06-04,11895,1
207914,2014-06-04,11906,1
207915,2014-06-04,11924,1


In [53]:
user_engagement = user_engagement.sort_values(['user_id', 'time_stamp'])

In [65]:
user_engagement['time_stamp'] = pd.to_datetime(user_engagement['time_stamp'])

In [67]:
def rolling_count(df, frequency):
    return df.rolling(frequency, on = 'time_stamp')['user_id'].count()
user_engagement['adopted'] = user_engagement.groupby('user_id', as_index = False, group_keys = False).apply(rolling_count, '7D')

The "adopted users" are the ones who have adopted value >= 3. We now have a list of "adopted users" as follows, which contains the users have 'adopted' >= 3.

In [69]:
user_engagement[user_engagement['adopted'] >= 3]

Unnamed: 0,time_stamp,user_id,visited,adopted
137777,2014-02-09,2,1,3.0
139947,2014-02-13,2,1,3.0
18541,2013-02-19,10,1,3.0
20374,2013-03-02,10,1,3.0
20870,2013-03-05,10,1,3.0
...,...,...,...,...
201798,2014-05-23,11988,1,6.0
202447,2014-05-24,11988,1,6.0
203659,2014-05-26,11988,1,5.0
204242,2014-05-27,11988,1,5.0


We label these adopted user as 1, and 0 otherwise.

In [71]:
user_adopted = user_engagement.groupby('user_id')['adopted'].max().reset_index()
user_adopted['adopted'] = user_adopted['adopted'].apply(lambda x: 1 if x >= 3 else 0)

In [73]:
user_adopted.head()

Unnamed: 0,user_id,adopted
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0


We now join the user_adopted table with user table to see which factors affect the user adoption.

In [74]:
df = pd.merge(user, user_adopted, how = 'outer', left_on = 'object_id', right_on = 'user_id')

In [75]:
df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_id,adopted
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,1.0,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,2.0,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,3.0,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,4.0,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,5.0,0.0


We check the correlation of adopted column with other columns.

In [76]:
corr = df.apply(lambda x : pd.factorize(x)[0]).corr(method = 'pearson')
corr.iloc[:,-1:]

Unnamed: 0,adopted
object_id,0.009812
creation_time,0.009478
name,0.006222
email,0.009997
creation_source,-0.027753
last_session_creation_time,0.563438
opted_in_to_mailing_list,-0.012156
enabled_for_marketing_drip,0.011439
org_id,0.029197
invited_by_user_id,0.027659


The factors that has most correlation with 'user adoption' is 'last_session_creation_time'. The other factors have no significant correlation.