In [1]:
import pandas as pd
import numpy as np

users = pd.read_csv('takehome_users.csv', encoding='latin-1',parse_dates=True)
engage = pd.read_csv('takehome_user_engagement.csv', parse_dates=True)

In [2]:
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [3]:
engage.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [4]:
import datetime

engage.time_stamp = pd.to_datetime(engage.time_stamp)
engage = engage.set_index('time_stamp', drop= True)

In [5]:
from datetime import timedelta

def label_adopted(x):    
    df_temp = engage.loc[engage['user_id'] == x] 
    df_temp = df_temp.resample('D').mean().dropna()
    adopted = 0
    for i in range(len(df_temp)-2): 
        if df_temp.index[i + 2] - df_temp.index[i] <= timedelta(days=7):
            adopted = 1
            break
        else:
            adopted = 0
    return adopted



In [6]:
users['adopted_user'] = users['object_id'].apply(label_adopted)

In [7]:
print(sum(users['adopted_user']))
print(sum(users.adopted_user)/len(users.adopted_user))

1656
0.138


In [8]:
users.creation_time = pd.to_datetime(users.creation_time)

In [9]:
users['last_session_creation_time'] = users['last_session_creation_time'].map(lambda data: 
                                    datetime.datetime.fromtimestamp(int(data)).strftime('%Y-%m-%d %H:%M:%S'),
                                                                              na_action='ignore')

In [10]:
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'])

In [11]:
users['usage_length'] = users['last_session_creation_time'] - users['creation_time']

In [12]:
users['usage_length'] = [x.total_seconds() for x in users['usage_length']]

In [13]:
users.isnull().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
adopted_user                     0
usage_length                  3177
dtype: int64

In [14]:
users.invited_by_user_id = users.invited_by_user_id.fillna(0)

In [15]:
users.isnull().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id               0
adopted_user                     0
usage_length                  3177
dtype: int64

In [16]:
feature_df = users.iloc[:,4:]
feature_df = feature_df.drop('last_session_creation_time', axis=1)
feature_df['usage_length'] = feature_df['usage_length'].fillna(0)

In [17]:
from sklearn.preprocessing import LabelEncoder

In [19]:
gle = LabelEncoder()
creation_labels = gle.fit_transform(users['creation_source'])
feature_df.creation_source = creation_labels

org_id_labels = gle.fit_transform(users['org_id'])
feature_df.org_id = org_id_labels

invited_labels = gle.fit_transform(users['invited_by_user_id'])
feature_df.org_id = invited_labels

In [20]:
feature_df.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user,usage_length
0,0,1,0,2325,10803.0,0,19800.0
1,1,0,0,56,316.0,1,11770200.0
2,1,0,0,298,1525.0,0,19800.0
3,0,0,0,1104,5151.0,0,106200.0
4,0,0,0,1127,5240.0,0,451800.0


In [24]:
from sklearn.model_selection import train_test_split

data = feature_df.drop('adopted_user', axis=1)
labels = feature_df.adopted_user

X_train, y_train, X_test, y_test = train_test_split(data, labels, test_size=0.25, random_state=42)

In [22]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, train_test_split
from sklearn.linear_model import LogisticRegression, Ridge,Lasso, ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score, f1_score,classification_report
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

In [25]:
lr=LogisticRegression()
lr.fit(X_train,X_test)
pred_tr=lr.predict(X_train)
pred_lr=lr.predict(y_train)
print(confusion_matrix(y_test,pred_lr))
score_lr= accuracy_score(y_test,pred_lr)
score_tr= accuracy_score(X_test,pred_tr)
print("Accuracy Score is: ", score_lr, score_tr)
print("F1 Score is: ", f1_score(y_test,pred_lr))
print(classification_report(y_test, pred_lr))

[[1719  861]
 [  56  364]]
Accuracy Score is:  0.6943333333333334 0.6966666666666667
F1 Score is:  0.44255319148936173
              precision    recall  f1-score   support

           0       0.97      0.67      0.79      2580
           1       0.30      0.87      0.44       420

    accuracy                           0.69      3000
   macro avg       0.63      0.77      0.62      3000
weighted avg       0.87      0.69      0.74      3000



In [28]:
lr=LogisticRegression(penalty = 'l2',solver = 'liblinear')
lr.fit(X_train,X_test)
pred_tr=lr.predict(X_train)
pred_lr=lr.predict(y_train)
print(confusion_matrix(y_test,pred_lr))
score_lr= accuracy_score(y_test,pred_lr)
score_tr= accuracy_score(X_test,pred_tr)
print("Accuracy Score is: ", score_lr, score_tr)
print("F1 Score is: ", f1_score(y_test,pred_lr))
print(classification_report(y_test, pred_lr))

[[1718  862]
 [  56  364]]
Accuracy Score is:  0.694 0.6968888888888889
F1 Score is:  0.4422843256379101
              precision    recall  f1-score   support

           0       0.97      0.67      0.79      2580
           1       0.30      0.87      0.44       420

    accuracy                           0.69      3000
   macro avg       0.63      0.77      0.62      3000
weighted avg       0.87      0.69      0.74      3000



In [29]:
dt=DecisionTreeClassifier()
dt.fit(X_train,X_test)
pred_tr=dt.predict(X_train)
pred_dt=dt.predict(y_train)
confusion_matrix(y_test,pred_dt)
confusion_matrix(X_test,pred_tr)

array([[7756,    8],
       [  39, 1197]])

In [36]:
rf=RandomForestClassifier()
rf.fit(X_train,X_test)
pred_rf=rf.predict(y_train)
confusion_matrix(y_test,pred_rf)
rf.score(y_train, y_test)


0.9706666666666667

In [41]:
y_pred = rf.predict(y_train)

print(classification_report(y_test, y_pred))

cm= confusion_matrix(y_test,y_pred)
print('confusion matrix:')
print(cm)
score_lr= accuracy_score(y_test,y_pred)
print("Accuracy Score is: ", score_lr)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2580
           1       0.90      0.89      0.89       420

    accuracy                           0.97      3000
   macro avg       0.94      0.94      0.94      3000
weighted avg       0.97      0.97      0.97      3000

confusion matrix:
[[2538   42]
 [  46  374]]
Accuracy Score is:  0.9706666666666667


In [37]:
parameters = {'n_estimators':[10,50,100,150,200],'criterion' : ("gini", "entropy"), 'max_depth':[3,5,10,20,30]}
rc1=RandomForestClassifier()
rc= GridSearchCV(rc1, parameters,cv=5)
rc.fit(X_train,X_test)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [3, 5, 10, 20, 30],
                         'n_estimators': [10, 50, 100, 150, 200]})

In [38]:
pre2=rc.predict(y_train)
score_lr= accuracy_score(y_test,pre2)
print("Accuracy Score is: ", score_lr)
cm1=confusion_matrix(y_test,y_pred)
print(cm1)

Accuracy Score is:  0.977
[[2537   43]
 [  47  373]]


In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2580
           1       0.90      0.89      0.89       420

    accuracy                           0.97      3000
   macro avg       0.94      0.94      0.94      3000
weighted avg       0.97      0.97      0.97      3000

