In [17]:
import pandas as pd
import numpy as np

In [28]:
# Importing the files
df_user = pd.read_csv("takehome_users.csv", encoding="ISO-8859-1")
df_eng = pd.read_csv("takehome_user_engagement.csv", parse_dates=["time_stamp"])

df_user = df_user.rename({"object_id":"user_id"}, axis=1)

In [29]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   user_id                     12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [30]:
df_eng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   time_stamp  207917 non-null  datetime64[ns]
 1   user_id     207917 non-null  int64         
 2   visited     207917 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [31]:
# defining an 'adopted user' #
df_agg = df_eng.set_index("time_stamp")

users = df_agg["user_id"].unique()
adoption = []

for i in users:
    id_filter = df_agg["user_id"] == i
    df_filter = df_agg[id_filter].resample("1D").count()
    df_filter = df_filter.rolling(window=7).sum()
    df_filter = df_filter.dropna()
    adoption.append(any(df_filter["visited"].values >= 7))

In [32]:
# applying 'adopted_user' logic onto df #
user_adoption = list(zip(users, adoption))

df_adopt = pd.DataFrame(user_adoption)

df_adopt
df_adopt.columns = ["user_id", "adopted_user"]

df_adopt
df = df_user.merge(df_adopt, on="user_id", how="left")

In [33]:
# mapping 'adopted_user' #
df.loc[:, "adopted_user"] = df["adopted_user"].map({False:0, True:1, np.nan:0})
df.dropna(subset=["adopted_user"], inplace=True)
df["adopted_user"] = df["adopted_user"].astype(int)


In [34]:
# mapping 'invited_by_user' #
invite = lambda row: 0 if np.isnan(row) else 1
df["invited_by_user"] = df["invited_by_user_id"].apply(invite)

In [35]:
# final df #
df = df[["adopted_user", "invited_by_user", "creation_source", \
         "opted_in_to_mailing_list", "enabled_for_marketing_drip"]]

In [36]:
df.head()

Unnamed: 0,adopted_user,invited_by_user,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip
0,0,1,GUEST_INVITE,1,0
1,0,1,ORG_INVITE,0,0
2,0,1,ORG_INVITE,0,0
3,0,1,GUEST_INVITE,0,0
4,0,1,GUEST_INVITE,0,0


In [37]:
#model building
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# machine learning pipeline #
X = df[df.columns[1:]]
y = df[df.columns[0]]

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.6, random_state=42)

pipeline = Pipeline(steps=[("encoder", OneHotEncoder()), \
                           ("rf", RandomForestClassifier(random_state = 42))])

params = {"rf__n_estimators" : [50, 75, 100],
          "rf__max_depth" : [5, 10, 15]}

cv = GridSearchCV(pipeline, param_grid=params, cv=3)
cv.fit(X_train, y_train)

print(f"Best parameters: {cv.best_params_}")
print(f"Training accuracy score from tuned model: \
       {cv.best_score_*100:.1f}%")

Best parameters: {'rf__max_depth': 5, 'rf__n_estimators': 50}
Training accuracy score from tuned model:        94.8%


In [38]:
y_pred = cv.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {test_accuracy*100:.2f}%")

Model accuracy: 94.82%


In [39]:
X_ohe = pd.get_dummies(X_test)
pipeline.fit(X_ohe, y_test)

fe = pipeline.named_steps["rf"].feature_importances_

feature_importance = zip(X_ohe.columns, fe)
feature_importance = sorted(feature_importance, key=lambda x:x[1], reverse=True)

for i, j in feature_importance:
    print(f"Weight: {j:.3f} | Feature: {i}")

Weight: 0.127 | Feature: creation_source_PERSONAL_PROJECTS
Weight: 0.110 | Feature: creation_source_GUEST_INVITE
Weight: 0.084 | Feature: enabled_for_marketing_drip
Weight: 0.074 | Feature: creation_source_ORG_INVITE
Weight: 0.055 | Feature: invited_by_user
Weight: 0.031 | Feature: creation_source_SIGNUP
Weight: 0.005 | Feature: creation_source_SIGNUP_GOOGLE_AUTH
Weight: 0.000 | Feature: opted_in_to_mailing_list


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


From the raw data, we have utilised the following features

1. invited_by_user - if a user was referred by another user (custom feature)
2. creation_source - how the account was created (stock feature)
3. opted_in_to_mailing_list - whether user has opted into receiving marketing emails (stock feature)
4. enabled_for_marketing_drip - whether they are on the regular marketing email drip (stock feature)

The model proposed here is having an accuracy comparable to the cross-validation traning score(~ 94%).
This suggests that the pipeline feature realiable in determining the predictors.


#### Suggestions:
We could focus more on the personalised experience for the users as personal workspace and guest invite rank highest on how user interacted.
The marketing drip needs to kept rolling as this has shown results.
The user opting it to the mailing-list has shown to be a least effective predcitor, hence the newsletter call-to-action won't result in any significant sales increase. 