# Relax Inc Challenge

Find the best prediction for whether a user will be "adopted" or not.

Defining  an  "adopted  user"   as  a  user  who   has  logged  into  the  product  on  three  separate
days  in  at  least  one  seven day  period ,  identify  which  factors  predict  future  user
adoption .

## EXAMINE AND CLEAN DATA

In [1]:
#Import necessary modules
import pandas as pd

In [2]:
engagement = pd.read_csv('takehome_user_engagement.csv')

In [3]:
#had error code
#UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe6 in position 11: invalid continuation byte
#Fixed by adding encoding='latin-1'

users = pd.read_csv('takehome_users.csv', encoding='latin-1')

In [4]:
#Examine the user data and user data types
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
users.dtypes

object_id                       int64
creation_time                  object
name                           object
email                          object
creation_source                object
last_session_creation_time    float64
opted_in_to_mailing_list        int64
enabled_for_marketing_drip      int64
org_id                          int64
invited_by_user_id            float64
dtype: object

In [6]:
#Examine the data and the data types
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [7]:
engagement.dtypes

time_stamp    object
user_id        int64
visited        int64
dtype: object

In [8]:
#In order to merge data we need to match the user ID names
#We will cahnge the name under the 'users' data profile

users = users.rename({'object_id':'user_id'}, axis=1)

In [9]:
#Change time_stamp to date time for easier wrangling
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])

In [10]:
print('Given data set is over a', engagement['time_stamp'].max() - engagement['time_stamp'].min(), 'time period.')


Given data set is over a 736 days 06:38:44 time period.


In [11]:
#Since data spans a wide amount of time we need to construct a rolling count over a 7 day time period

def get_rolling_count(df, freq):
    return df.rolling(freq, on='time_stamp')['visited'].count()

engagement['visited_7days'] = engagement.groupby('user_id', as_index=False, group_keys=False).apply(get_rolling_count, freq='7D')

engagement.head()

Unnamed: 0,time_stamp,user_id,visited,visited_7days
0,2014-04-22 03:53:30,1,1,1.0
1,2013-11-15 03:45:04,2,1,1.0
2,2013-11-29 03:45:04,2,1,1.0
3,2013-12-09 03:45:04,2,1,1.0
4,2013-12-25 03:45:04,2,1,1.0


In [12]:
#Would prefer the data type to be int as compared to float
engagement['visited_7days'] = engagement['visited_7days'].astype('int64')

#Now that we have their data split into 7 day windows we will need to find the timestamps
#That have a count of at least 3
adopted = pd.DataFrame()

adopted = engagement.loc[(engagement['visited_7days'] >= 3)]
adopted = adopted.drop_duplicates('user_id', keep = 'first')

adopted.head()



Unnamed: 0,time_stamp,user_id,visited,visited_7days
9,2014-02-09 03:45:04,2,1,3
27,2013-02-19 22:08:03,10,1,3
312,2014-03-13 11:46:38,20,1,3
331,2014-03-23 06:29:09,33,1,3
354,2012-12-26 19:05:07,42,1,3


In [18]:
# Use the id numbers to identify who has been adopted

users['adopted'] = users['user_id'].isin(adopted['user_id']).astype('int64')

In [19]:
df = users.merge(engagement, on = 'user_id', how = 'left')
df.head()

Unnamed: 0,user_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,time_stamp,visited,visited_7days
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0,2014-04-22 03:53:30,1.0,1.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1,2013-11-15 03:45:04,1.0,1.0
2,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1,2013-11-29 03:45:04,1.0,1.0
3,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1,2013-12-09 03:45:04,1.0,1.0
4,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1,2013-12-25 03:45:04,1.0,1.0


In [23]:
#Dropping email as it is not a predicting factor
#Dropping time stamp since it has been used as much as needed
#Dropping visited 7 days as it is equitable to adopted

df = df.drop(['email', 'time_stamp', 'visited_7days'], axis =1)
df.head()

KeyError: "['email' 'time_stamp' 'visited_7days'] not found in axis"

In [16]:
#Examine for missing Data
print(df.isnull().sum())

user_id                           0
creation_time                     0
name                              0
email                             0
creation_source                   0
last_session_creation_time     3177
opted_in_to_mailing_list          0
enabled_for_marketing_drip        0
org_id                            0
invited_by_user_id            92566
adopted                           0
time_stamp                     3177
visited                        3177
visited_7days                  3177
dtype: int64


In [17]:
#Replace missing data
df['last_session_creation_time'] = df['last_session_creation_time'].fillna(0)
df['visited'] = df['visited'].fillna(0)

### QUESTION: IS THERE A FASTER WAY TO DO WHAT IM DOING BELOW?

In [None]:
#Change whether they were invited by friend to yes/no 0/1 boolean type of variable
#for x in range(len(df)):
#    if df.iloc[x,9] > 0:
#        df.iloc[x,9] = 1
#    else:
#        df.iloc[x,9] = 0

In [None]:
#df.head()

## BEGIN MACHINE LEARNING

### Model: Random Forest