# Relax Data Challenge

In [1]:
# Import necessary libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
import itertools
from sklearn import metrics
from datetime import date
from scipy import stats
from scipy.stats import norm



In [2]:
user_engagements = pd.read_csv('takehome_user_engagement.csv')
users = pd.read_csv('takehome_users.csv', encoding='latin-1')

In [3]:
user_engagements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
time_stamp    207917 non-null object
user_id       207917 non-null int64
visited       207917 non-null int64
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [4]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


### Problem statement:

Defining an "adopted user" as a user who has logged into the product on three separate days in at least one seven-day period, identify which factors predict future user adoption.

To start this problem, I'll start by creating the target column adopted_user. 

In [6]:
seven_day_delta = pd.Timedelta('7 days')

In [7]:
seven_day_delta

Timedelta('7 days 00:00:00')

In [8]:
user_engagements.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [9]:
user_engagements['time_stamp'] = pd.to_datetime(user_engagements['time_stamp']) - seven_day_delta

In [10]:
user_weekly_engagement = pd.Series([])
days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']
for day in days:
    k = user_engagements.groupby(['user_id', pd.Grouper(key='time_stamp', freq='W-' + day)])['visited'].count().reset_index().sort_values('user_id')
    k = k.groupby('user_id')['visited'].max()
    if len(user_weekly_engagement) == 0:
        user_weekly_engagement = k
    else:
        user_weekly_engagement = pd.DataFrame([user_weekly_engagement, k]).max()

In [11]:
users['adopted_user'] = users['object_id'].apply(lambda x: 1 if x in user_weekly_engagement and user_weekly_engagement[x] >= 3 else 0)

In [12]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
adopted_user                  12000 non-null int64
dtypes: float64(2), int64(5), object(4)
memory usage: 1.0+ MB


In [13]:
users['adopted_user'].mean()*100

13.350000000000001

On an average,13.35% of the users are adopted.

In [14]:
#engagement of users by creation source
users_by_creation_src = users[['creation_source','adopted_user']].groupby('creation_source')['adopted_user'].mean()
users_by_creation_src

creation_source
GUEST_INVITE          0.166436
ORG_INVITE            0.129995
PERSONAL_PROJECTS     0.077688
SIGNUP                0.140393
SIGNUP_GOOGLE_AUTH    0.167509
Name: adopted_user, dtype: float64

From above, it is clear that the users signed up from google have more adoption rate.

In [15]:
#engagement of users by mailing opt in
users[['opted_in_to_mailing_list','adopted_user']].groupby('opted_in_to_mailing_list')['adopted_user'].mean()

opted_in_to_mailing_list
0    0.131912
1    0.138277
Name: adopted_user, dtype: float64

The % of adopted users who have opted in to mailing list is higher than % of adopted users in the given sample.

In [16]:
users['invited_by_user_id']=pd.Series(users['invited_by_user_id'],dtype=pd.Int64Dtype()).fillna(0)

### Feature Engineering:

In [17]:
users['creation_time'] = pd.to_datetime(users['creation_time'])
users['month'] = users['creation_time'].apply(lambda x: x.month)
users['year'] = users['creation_time'].apply(lambda x: x.year)

In [18]:
avg = users['last_session_creation_time'].mean()
users['last_session_creation_time'].fillna(avg, inplace=True)

In [19]:
users.columns

Index(['object_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id',
       'adopted_user', 'month', 'year'],
      dtype='object')

In [20]:
users = users.drop(['object_id','creation_time','name','org_id','invited_by_user_id'], axis=1)

In [21]:
users = pd.get_dummies(users, prefix='is')

### Machine learning:

In [22]:
X=users.drop('adopted_user', axis=1)
y=users['adopted_user']

In [23]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size=0.85, test_size=0.15, stratify=y)

In [24]:
rf_clf= RandomForestClassifier()
rf_clf.fit(Xtrain, ytrain)
rf_clf.score(Xtest, ytest)

0.96

In [25]:
clf = GradientBoostingClassifier()
clf.fit(Xtrain, ytrain)
clf.score(Xtest, ytest)

0.9627777777777777

In [26]:
feat_importance = clf.feature_importances_
#sort the feature by importance, from the most important ones to least important ones
feat_imp_sort_ix=feat_importance.argsort()[::-1]
#the most important features are
Xtrain.columns[feat_imp_sort_ix[0:10]]

Index(['last_session_creation_time', 'year', 'month',
       'is_MaximilianGloeckner@yahoo.com', 'is_EveFetherston@hotmail.com',
       'is_WilliamMills@hotmail.com', 'is_MorganBriggs@yahoo.com',
       'is_LucasKuhn@cuvox.de', 'is_EricFarber@yahoo.com',
       'is_FrankMauer@cuvox.de'],
      dtype='object')

### Final Observation:
    
- On an average,there are only 13.35% of the users adopted.
- From above analysis, it can be observed that the three most important features are 'last_session_creation_time', 'year',
  'month' and all other variables do not show any correlation with the tarhet variable.

    