<a href="https://colab.research.google.com/github/saranyapichandi96/task_5/blob/main/predict_user_adoption.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import all the libraries and modules used throughout the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
# json_normalize is exposed at the pandas top level; the older
# pandas.io.json import path has been deprecated since pandas 1.0
from pandas import json_normalize
from datetime import datetime, timedelta
import scipy.stats
import matplotlib.dates as mdates
import plotly.graph_objects as go
plt.style.use('bmh')

In [None]:
# Peek at the header row of each raw file. Printing the file object itself only
# shows its repr (e.g. <_io.TextIOWrapper ...>), not any of the data.
for path in ('data/takehome_users.csv', 'data/takehome_user_engagement.csv'):
    # cp1252 is the encoding the notebook uses when loading these files
    with open(path, encoding='cp1252') as f:
        print(f.readline().rstrip())

In [None]:
# Load both raw tables, parsing their timestamp columns into datetimes on read.
users = pd.read_csv(
    'data/takehome_users.csv',
    encoding="cp1252",
    parse_dates=['creation_time'],
)
user_eng = pd.read_csv(
    'data/takehome_user_engagement.csv',
    encoding="cp1252",
    parse_dates=['time_stamp'],
)

In [None]:
# Column dtypes and non-null counts for the users table
users.info()

In [None]:
# Summary statistics for the users table, transposed so each column is a row
users.describe().T


In [None]:
# Column dtypes and non-null counts for the engagement table
user_eng.info()

In [None]:
# Summary statistics for the engagement table, transposed so each column is a row
user_eng.describe().T

In [None]:
# First rows of the users table
users.head()

In [None]:
# First rows of the engagement table
user_eng.head()

In [None]:
# last_session_creation_time is interpreted as seconds since the epoch; convert it to datetimes
users['last_session_creation_time'] = pd.to_datetime(users['last_session_creation_time'], unit='s')
# A bare expression in the middle of a cell is never displayed, so check the
# resulting dtype explicitly instead of just evaluating `.dtypes`
assert pd.api.types.is_datetime64_any_dtype(users['last_session_creation_time'])
users.head(3)

In [None]:
# Earliest and latest last-session timestamps (displayed as a tuple)
users['last_session_creation_time'].min(), users['last_session_creation_time'].max()


In [None]:
# Work on a copy of the engagement log and add a midnight-truncated 'date'
# column derived from each row's time_stamp.
df = user_eng.assign(date=lambda frame: pd.to_datetime(frame.time_stamp.dt.date))
def rolling_count(df_group, frequency):
    """
    Count the rows of ``df_group`` that fall inside a trailing time window.

    Parameters
    ----------
    df_group : DataFrame with a datetime 'date' column and a 'user_id' column.
    frequency : offset string for the window length, e.g. '7D'.

    Returns
    -------
    Series of counts labelled with the original row index, so the result can be
    assigned straight back onto the source frame.
    """
    # Time-based rolling requires a monotonic 'on' column and raises ValueError
    # otherwise. A stable sort leaves already-ordered input untouched and keeps
    # the relative order of rows that share a date.
    ordered = df_group.sort_values('date', kind='mergesort')
    return ordered.rolling(frequency, on='date')['user_id'].count()
# Per-user trailing 7-day count of engagement rows, written back onto df by index.
# NOTE(review): this relies on groupby.apply handing the grouping column ('user_id')
# to rolling_count and on the result keeping the original row index — behaviour that
# varies across pandas versions; verify on the installed version.
df['visits_7_days'] = df.groupby('user_id', as_index=False, group_keys=False).apply(rolling_count, '7D')
df.describe().T

In [None]:
# Engagement rows whose trailing 7-day window contains at least 3 visits
df[df.visits_7_days >= 3.0]

In [None]:
# Build the per-user adoption label. `user_adopted` is used by this and the following
# cells but was never defined in the notebook, so a fresh Restart & Run All stopped
# here with a NameError.
# A user is labelled adopted (1) when any trailing 7-day window holds at least 3 visits.
# NOTE(review): the 3-visit threshold mirrors the visits_7_days >= 3 filter used in the
# notebook; confirm it matches the intended definition of adoption.
user_adopted = (
    df.groupby('user_id')['visits_7_days']
      .max()
      .ge(3)
      .astype(int)
      .rename('adopted_user')
      .rename_axis('object_id')  # key name used when joining onto `users`
      .reset_index()
)
user_adopted.adopted_user.value_counts()

In [None]:
# Index the labels by object_id so they can be joined onto `users`.
# The guard keeps the cell re-runnable: an unconditional in-place set_index
# raises KeyError the second time because the column has already become the index.
if 'object_id' in user_adopted.columns:
    user_adopted = user_adopted.set_index("object_id")

In [None]:
# Left-join the adoption label onto the users table; users without a matching
# object_id in user_adopted get NaN for adopted_user.
df_users = users.join(user_adopted, on = 'object_id', how='left')
df_users.head()

In [None]:
# Column dtypes and non-null counts after the join
df_users.info()

In [None]:
# Users with no adoption label after the left join never reached the adoption
# threshold, so label them 0.
# The result is assigned back rather than calling fillna(inplace=True) on a selected
# column: that chained in-place form is not guaranteed to write through to the frame
# on newer pandas versions.
df_users['adopted_user'] = df_users['adopted_user'].fillna(0).astype(int)

# Missing last_session_creation_time values are left as NaT: filling a datetime
# column with 0 would mix integers into it, and the column is dropped before modelling.
df_users.describe().T

In [None]:
# Let's see if we can extract useful information from the email variable, such as its domain.

# The vectorised .str accessor yields NaN for a missing or '@'-less email instead of
# raising, and returns the same value as x.split('@')[1] for every well-formed address.
df_users['email_domain'] = df_users['email'].str.split('@').str[1]
df_users['email_domain'].value_counts()

In [None]:
# Also check which creation_source values occur when invited_by_user_id is NOT null
df_users[df_users.invited_by_user_id.notnull()].creation_source.unique()
# Output recorded from an earlier run: array(['GUEST_INVITE', 'ORG_INVITE'], dtype=object)

In [None]:
# Feature clean-up
# - There are too many email domains, and most of them look fake, so the column is
#   dropped entirely. The name and object_id columns are dropped as well.
# - invited_by_user_id: NULL values become 0, because the column is only non-null
#   when creation_source is GUEST_INVITE or ORG_INVITE anyway.
# - creation_time: replaced by days_since_creation, the account age in days measured
#   against the latest timestamp in the engagement log.
# - last_session_creation_time: removed as well, since it was in a sense used to
#   create the adopted_user column.

df_users = df_users.drop(columns=['object_id', 'name', 'email', 'email_domain'])
# Assign the filled column back: fillna(inplace=True) on a selected column is not
# guaranteed to modify the parent frame on newer pandas versions.
df_users['invited_by_user_id'] = df_users['invited_by_user_id'].fillna(0)
df_users['days_since_creation'] = (user_eng.time_stamp.max() - df_users.creation_time).dt.days
df_users = df_users.drop(columns=['creation_time', 'last_session_creation_time'])


In [None]:
#Let's OneHotEncode the creation_source column.

In [None]:
# One-hot encode creation_source. dtype=int keeps the indicator columns numeric on
# every pandas version (newer releases default to bool), so they still appear in
# describe() and behave as 0/1 in the histograms and the model.
df_users = pd.get_dummies(df_users, columns=['creation_source'], dtype=int)
df_users.describe().T

In [None]:
# One histogram per binary feature, with the adopted / non-adopted groups overlaid.
binary_features = [
    'opted_in_to_mailing_list',
    'enabled_for_marketing_drip',
    'creation_source_GUEST_INVITE',
    'creation_source_ORG_INVITE',
    'creation_source_PERSONAL_PROJECTS',
    'creation_source_SIGNUP',
    'creation_source_SIGNUP_GOOGLE_AUTH',
]
for feature in binary_features:
    grid = sns.FacetGrid(df_users, hue="adopted_user", height=3, aspect=1.5)
    grid.map(plt.hist, feature, alpha=.5, bins=20)
    grid.add_legend()

In [None]:
# Distribution of account age, split by adoption status.
# sns.distplot is deprecated and has no `hue` parameter (the original call raised a
# TypeError); histplot takes the frame plus column names and supports hue directly.
sns.histplot(data=df_users, x='days_since_creation', hue='adopted_user', bins=20);

In [None]:
# Let's fit a Random Forest classifier and inspect its feature importances. Because random forests are built from decision trees, we don't need to scale any features.
# Importing necessary packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score,\
precision_score, recall_score, f1_score
def cv_optimize(model, parameters, Xtrain, ytrain, n_folds = 5):
    """
    Tune ``model`` with a cross-validated grid search on the training data.

    Parameters
    ----------
    model : estimator passed to GridSearchCV.
    parameters : parameter grid passed to GridSearchCV as ``param_grid``.
    Xtrain, ytrain : training features and target used for the search.
    n_folds : number of cross-validation folds (default 5).

    Returns
    -------
    The search's ``best_estimator_``, i.e. the candidate with the best mean
    accuracy across the folds.

    Prints the tuning time and the best parameter combination as a side effect.
    """
    clf = GridSearchCV(model, param_grid = parameters, cv = n_folds, scoring = 'accuracy')
    t0 = time.time()
    clf.fit(Xtrain, ytrain)
    time_fit = time.time() - t0 
    print('\n\n\n=============================',type(model).__name__,'=================================\n')
    print("It takes %.3f seconds for tuning " % (time_fit))
    # This line was indented one space too far, which made the whole cell fail
    # with an IndentationError
    print("BEST PARAMS", clf.best_params_)
    best = clf.best_estimator_
    return best
    
def do_classify(model, parameters, df, targetname, scale = True, cols_to_transform = 'numeric', 
                featurenames = 'all', train_size = 0.8, random_state = None):
    """
    Tune, fit and evaluate a classifier on a random train/test split of ``df``.

    Parameters
    ----------
    model : estimator to tune; handed to cv_optimize.
    parameters : parameter grid for the grid search.
    df : DataFrame holding both the features and the target column.
    targetname : name of the target column in ``df``.
    scale, cols_to_transform : accepted for interface compatibility; not used
        anywhere in the function body.
    featurenames : 'all' to use every column except the target, otherwise a
        column selection passed to ``df[...]``.
    train_size : fraction of rows used for training (default 0.8).
    random_state : optional seed forwarded to train_test_split so the split can
        be reproduced; None (the default) keeps the previous unseeded behaviour.

    Returns
    -------
    (model, Xtrain, ytrain, Xtest, ytest) — the fitted best estimator and the
    split it was trained and evaluated on.

    Prints fit time, train/test accuracy, precision, recall, AUC, the confusion
    matrix and the classification report for the test set.
    NOTE(review): precision/recall use sklearn's defaults and AUC reads column 1
    of predict_proba, so this presumably expects a binary target — confirm.
    """
    # Creating the X and y variables for our model.
    # The isinstance check avoids an ambiguous element-wise comparison when
    # featurenames is an array-like of column names rather than the string 'all'.
    if isinstance(featurenames, str) and featurenames == 'all':
        X = df.drop([targetname], axis = 1)
    else:
        X = df[featurenames]

    y = df[targetname]

    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size = train_size,
                                                    random_state = random_state)

    model = cv_optimize(model, parameters, Xtrain, ytrain)
    t0 = time.time()
    # This line was indented with three spaces instead of four, which made the
    # whole cell fail with an IndentationError
    model = model.fit(Xtrain, ytrain)
    time_fit = time.time() - t0 
    print("It takes %.3f seconds for fitting" % (time_fit))
    training_accuracy = model.score(Xtrain, ytrain)
    test_accuracy = model.score(Xtest, ytest)

    # Predict once and reuse the result for every metric below
    test_pred = model.predict(Xtest)
    precision = precision_score(ytest, test_pred)
    recall = recall_score(ytest, test_pred)
    AUC = roc_auc_score(ytest, model.predict_proba(Xtest)[:,1])

    print("Accuracy on training data: {:0.2f}".format(training_accuracy))
    print("Accuracy on test data:     {:0.2f}".format(test_accuracy))
    print("Precision on test data:    {:0.2f}".format(precision))
    print("Recall on test data:       {:0.2f}".format(recall))
    print("AUC on test data:          {:0.2f}".format(AUC))
    print("=======Confusion Matrix=========")
    print(confusion_matrix(ytest, test_pred))
    print("=======Classification report=======")
    print(classification_report(ytest, test_pred))
    print("="*100)
    print("="*100)
    print("="*100)
    return model, Xtrain, ytrain, Xtest, ytest
# Random Forest model
# class_weight='balanced' handles the unbalanced target; a fixed random_state makes
# the fitted forest (and therefore the feature importances) reproducible.
model_rf = RandomForestClassifier(class_weight='balanced', random_state=42)
parameters_rf = {
                 'n_estimators': [10, 25, 50, 75, 100],
                 'criterion': ["gini", "entropy"],
                 'max_depth': [3, 6, 10, 12],
                 # 'auto' was only an alias for 'sqrt' on classifiers and was removed in
                 # scikit-learn 1.3, where it fails validation; 'log2' is a real alternative
                 'max_features': ['sqrt', 'log2']
                }
model_rf, Xtrain, ytrain, Xtest, ytest = do_classify(model_rf, parameters_rf, 
                                                                  df_users, targetname = 'adopted_user')


In [None]:
# Feature importances of the fitted forest, indexed by feature name.
# Sorted ascending once so the most important feature ends up at the top of the
# horizontal bar chart (the earlier descending sort was immediately overridden).
feat_imp = (
    pd.DataFrame({'feature': Xtrain.columns, 'importance': model_rf.feature_importances_})
      .set_index('feature', drop=True)
      .sort_values(by='importance')
)
_ = feat_imp.plot.barh(title = 'Random Forest feature importance', figsize = (12,7))

In [None]:
The top 5 important features seem to be:

days_since_creation
org_id
invited_by_user_id
creation_source_PERSONAL_PROJECTS
opted_in_to_mailing_list
Future possible work: We could also add a feature that measures the time between the creation of the account and the user's first login.

 