### Relax Inc. Data science challenge

In [15]:
import pandas as pd
import chardet
import time

In [16]:
users = pd.read_csv('C:/Users/sandhya.mukkamala/Downloads/1481069814_relax_challenge/relax_challenge/takehome_users.csv', encoding='ISO-8859-1', index_col='object_id')
engagement = pd.read_csv('C:/Users/sandhya.mukkamala/Downloads/1481069814_relax_challenge/relax_challenge/takehome_user_engagement.csv')

In [17]:
users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [18]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [19]:
engagement.visited.value_counts()

1    207917
Name: visited, dtype: int64

Looks like visited column has all 1's so dropping it

In [20]:
engagement.drop(['visited'], axis=1, inplace=True)

In [23]:
# Data cleaning
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp']) # convert to datetime
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 2 columns):
time_stamp    207917 non-null datetime64[ns]
user_id       207917 non-null int64
dtypes: datetime64[ns](1), int64(1)
memory usage: 3.2 MB


### Finding out if a user is a adopted user?

In [26]:
from datetime import datetime, timedelta

def custom(x):
    """
    Takes property object x
    converts x to list
    returns 1 if a user logged into the product on three separate
    days in at least one seven day period 
    """
    if len(x) >= 3:  # condition to eliminate if user has less than 3 time stamps
        x = [i for i in x]  # convert property object to list
        x.sort()  # sort the dates in increasing order
        x = [x[i+1] - x[i] for i in range(len(x)-2)]  # compute cumulative difference of current and next day
        # sum i, i+1, i+2 terms and check if it`s less than 7 which gives if it`s in a seven day period
        x = [1 for i in range(len(x)-2) if x[i] + x[i+1] + x[i+2] <= timedelta(days=7)]
        # condition to check if there`s a 1 in x, return 1 if true
        if 1 in x:
            return 1

df1 = engagement.groupby('user_id').agg(custom)  # group by user_id and aggregate using custom function
df1.fillna(0, inplace=True)  # fill null values with 0
df1.columns = ['adopted_user']
df1.head()

Unnamed: 0_level_0,adopted_user
user_id,Unnamed: 1_level_1
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0


In [27]:
print('Total number of users:', len(users))
print('Number of adopted users:', len(df1[df1['adopted_user'] == 1]))

Total number of users: 12000
Number of adopted users: 1322


In [28]:
final = users.join(df1, how='left')
final.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,0.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0.0


### Data Preprocessing

- We don`t need name, email.
- Fill adopted_user null values with 0, because those could be considered not adopted users
- Let`s also fill invited_by_user_id null values with 0 and check how it performs. 0 because we can consider 0 to be invited by none.
- Drop null rows
- Make a column 'usage' which is difference between last_session_creation_time and creation_time
- Map strings to ints

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn import decomposition
from sklearn.preprocessing import MinMaxScaler

In [31]:
x = final.drop(['name', 'email'], axis=1)

# mapping creation source to integers
creation = {'PERSONAL_PROJECTS': 1,
            'GUEST_INVITE': 2,
            'ORG_INVITE': 3,
            'SIGNUP': 4,
            'SIGNUP_GOOGLE_AUTH': 5}
x['creation_source'] = x['creation_source'].map(lambda x: creation[x])

# fill '0' if not invited by anyone
x['invited_by_user_id'].fillna(0, inplace=True)

# drop null rows, just for sanity check
x.dropna(axis=0, inplace=True)

# unix timestamp to datetime string
x['last_session_creation_time'] = x['last_session_creation_time'].map(lambda x: datetime.
                                            fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))
# string to datetime
x['last_session_creation_time'] = pd.to_datetime(x['last_session_creation_time'])
# creation time string to datetime
x['creation_time'] = pd.to_datetime(x['creation_time'])

# create a column usage
x['usage'] = x['last_session_creation_time'] - x['creation_time']
# drop the time columns
x.drop(['creation_time', 'last_session_creation_time'], axis=1, inplace=True)
# if last usage date is less than created date then difference is less than '0'
# So drop 'usage' values less than '0 days'
x = x[x['usage'] >= timedelta(days=0)]

# convert back to unix timestamp, because it is easy to operate on
x['usage'] = x['usage'].map(lambda x: x.total_seconds())

complete = x

### Decision Tree classifier

In [35]:
# including "usage" feature
y = complete['adopted_user']
x = complete.drop(['adopted_user'], axis=1)
scaler = MinMaxScaler()
scaler.fit(x)
scaler.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, stratify=y)
tree = DecisionTreeClassifier()

tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

# scores
print('******Feature importance******')
for idx, val in enumerate(tree.feature_importances_):
    print("{:30}{:3f}".format(x.columns[idx], val))
print('-----------------------------------------------')

print("{:30}{:3f}".format('F1 score', f1_score(y_test, y_pred)))
print("{:30}{:3f}".format('Test accuracy', accuracy_score(y_test, y_pred)))
print()

print('******Confusion Matrix******')
print(confusion_matrix(y_test, y_pred))

******Feature importance******
creation_source               0.013607
opted_in_to_mailing_list      0.005977
enabled_for_marketing_drip    0.004631
org_id                        0.063386
invited_by_user_id            0.034692
usage                         0.877707
-----------------------------------------------
F1 score                      0.873832
Test accuracy                 0.943544

******Confusion Matrix******
[[1431   46]
 [  62  374]]


In [37]:
# without including "usage" feature
y = complete['adopted_user']
# drop 'usage' feature
x = complete.drop(['adopted_user', 'usage'], axis=1)
# use MinMaxScaler to scale values to [0,1]
scaler = MinMaxScaler()
scaler.fit(x)
scaler.fit_transform(x)

# 67% train and 33% test data
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, stratify=y)
tree = DecisionTreeClassifier()

tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

# scores
print('******Feature importance******')
for idx, val in enumerate(tree.feature_importances_):
    print("{:30}{:3f}".format(x.columns[idx], val))


print("{:30}{:3f}".format('F1 score', f1_score(y_test, y_pred)))
print("{:30}{:3f}".format('Test accuracy', accuracy_score(y_test, y_pred)))
print()

print('******Confusion Matrix******')
print(confusion_matrix(y_test, y_pred))

******Feature importance******
creation_source               0.059038
opted_in_to_mailing_list      0.067704
enabled_for_marketing_drip    0.063666
org_id                        0.476988
invited_by_user_id            0.332604
F1 score                      0.207071
Test accuracy                 0.671720

******Confusion Matrix******
[[1203  274]
 [ 354   82]]


In [39]:
# including only usage feature
y = complete['adopted_user']
x = complete.drop(['adopted_user', 'creation_source', 'opted_in_to_mailing_list', 
                   'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id'], axis=1)
scaler = MinMaxScaler()
scaler.fit(x)
scaler.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, stratify=y)
tree = DecisionTreeClassifier()

tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

# scores
print('******Feature importance******')
for idx, val in enumerate(tree.feature_importances_):
    print("{:30}{:3f}".format(x.columns[idx], val))
print('-----------------------------------------------')

print("{:30}{:3f}".format('F1 score', f1_score(y_test, y_pred)))
print("{:30}{:3f}".format('Test accuracy', accuracy_score(y_test, y_pred)))
print()

print('******Confusion Matrix******')
print(confusion_matrix(y_test, y_pred))

******Feature importance******
usage                         1.000000
-----------------------------------------------
F1 score                      0.884661
Test accuracy                 0.949294

******Confusion Matrix******
[[1444   33]
 [  64  372]]


### Conclusion

#### usage history is a better feature in predicting adopted user

Imporant features are

 1. usage
 2. org_id - seems reasonable
 3. invited_by_user_id - may be 0 (no reference) is used to classify that is why it is given more importance
 4. creation_source
 5. opted_in_to_mailing_list
 6. enabled_for_marketing_drip