# Relax Inc. Take-Home Challenge

## Data Cleaning and Pre-processing

First, let's look at 'takehome_user_engagement.csv' and 'takehome_users.csv' and the data they provide.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import uuid

In [2]:
# Load the login log; the preview below shows one row per recorded visit
# (time_stamp, user_id, visited).
engagement_csv = 'takehome_user_engagement.csv'
user_engagement = pd.read_csv(engagement_csv)
user_engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [3]:
# Re-index the login log by calendar day (time of day discarded), in
# chronological order, so that time-based rolling windows can be applied later.
# The raw `time_stamp` column is not kept.
login_ts = pd.to_datetime(user_engagement['time_stamp'])
user_engagement = (
    user_engagement
    .drop(columns=['time_stamp'])
    .assign(date=pd.to_datetime(login_ts.dt.date))
    .set_index('date')
    .sort_index()
)

In [4]:
# Check the re-indexed frame: index type/range, remaining columns, dtypes and null counts.
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207917 entries, 2012-05-31 to 2014-06-06
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  207917 non-null  int64
 1   visited  207917 non-null  int64
dtypes: int64(2)
memory usage: 4.8 MB


In [5]:
# Label each user as "adopted" (1) or not (0).
# Rule implemented here: a user is adopted when some rolling 7-day window
# contains at least MIN_ACTIVE_DAYS distinct login days.
# NOTE(review): confirm this matches the challenge's definition of adoption.
ADOPTION_WINDOW = '7D'
MIN_ACTIVE_DAYS = 3

# Keep a single row per (user, day): the window count below counts rows, so
# several logins on the same day would otherwise be counted as several days.
daily_logins = (
    user_engagement
    .reset_index()
    .drop_duplicates(subset=['user_id', 'date'])
    .set_index('date')
    .sort_index()  # time-based rolling needs a monotonic date index per user
)

# For every (user, day): number of login days in the window ending on that day.
user_engagement_7d = (
    daily_logins
    .groupby('user_id')
    .rolling(ADOPTION_WINDOW)['visited']
    .count()
    .to_frame('visit count 7day')
)

# A user's best window decides the label; the key is renamed to `object_id`
# so it lines up with the id column of the users table.
user_adoption = (
    user_engagement_7d
    .groupby('user_id')['visit count 7day']
    .max()
    .ge(MIN_ACTIVE_DAYS)
    .astype('int64')
    .rename('adopted')
    .reset_index()
    .rename(columns={'user_id': 'object_id'})
)

In [6]:
# Preview the adoption labels: one row per user id, `adopted` is 0 or 1.
user_adoption.head()

Unnamed: 0,object_id,adopted
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0


In [7]:
# Load the user table. The file is read as ISO-8859-1 rather than the default
# encoding (presumably because of non-ASCII characters in names/emails — TODO confirm).
users_csv = 'takehome_users.csv'
users = pd.read_csv(users_csv, encoding="ISO-8859-1")
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [8]:
# Inspect dtypes and non-null counts to see which columns have missing values.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [9]:
# Summary statistics for the numeric columns of the user table.
users.describe()

Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
count,12000.0,8823.0,12000.0,12000.0,12000.0,6417.0
mean,6000.5,1379279000.0,0.2495,0.149333,141.884583,5962.957145
std,3464.24595,19531160.0,0.432742,0.356432,124.056723,3383.761968
min,1.0,1338452000.0,0.0,0.0,0.0,3.0
25%,3000.75,1363195000.0,0.0,0.0,29.0,3058.0
50%,6000.5,1382888000.0,0.0,0.0,108.0,5954.0
75%,9000.25,1398443000.0,0.0,0.0,238.25,8817.0
max,12000.0,1402067000.0,1.0,1.0,416.0,11999.0


In [10]:
# Parse the account creation time and replace every missing value with 0.
# Per the info() output, the columns with nulls are `last_session_creation_time`
# and `invited_by_user_id`, so 0 acts as the "missing" marker for both.
users = (
    users
    .assign(creation_time=pd.to_datetime(users['creation_time']))
    .fillna(0)
)

In [11]:
# Attach the adoption label to each user record.
# This is an inner join, so users with no rows in the engagement log are
# dropped here (12000 users are loaded, 8823 observations reach the model).
df = pd.merge(users, user_adoption, how='inner', on='object_id')

In [12]:
# Remove rows that are exact duplicates across every column.
df = df.drop_duplicates()

In [13]:
# One-hot encode the categorical / flag columns, dropping the first level of
# each so the dummies are not collinear with the regression intercept.
# `dtype` is pinned to a numeric type: newer pandas releases default to bool
# dummies, and a design matrix mixing bool and float columns is converted to
# object dtype, which statsmodels refuses to fit.
categorical_columns = [
    'creation_source',
    'opted_in_to_mailing_list',
    'enabled_for_marketing_drip',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True, dtype='uint8')

In [14]:
# Split the modelling table into the target (`adopted`) and the feature matrix.
# Identifier, timestamp and free-text columns are excluded from the features.
# NOTE(review): `org_id` and `invited_by_user_id` remain in X as plain numbers
# although they look like identifiers, and `last_session_creation_time` also
# remains — confirm that these are intended as predictors.
excluded_columns = ['object_id', 'creation_time', 'name', 'email', 'adopted']
y = df['adopted']
X = df.drop(columns=excluded_columns)

In [15]:
import statsmodels.api as sm

# Fit ordinary least squares of the 0/1 adoption label on the features
# (i.e. a linear probability model) and print the coefficient table.
# NOTE(review): the target is binary, so a logistic model may be a better fit.
X = sm.add_constant(X)  # statsmodels does not add an intercept on its own
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

  import pandas.util.testing as tm


                            OLS Regression Results                            
Dep. Variable:                adopted   R-squared:                       0.164
Model:                            OLS   Adj. R-squared:                  0.164
Method:                 Least Squares   F-statistic:                     192.7
Date:                Sun, 13 Sep 2020   Prob (F-statistic):               0.00
Time:                        15:50:26   Log-Likelihood:                -3316.2
No. Observations:                8823   AIC:                             6652.
Df Residuals:                    8813   BIC:                             6723.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
cons

Based on the coefficients and p-values above, it seems that `org_id` and `creation_source` are the important features.