In [89]:
import pywget
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.utils import resample

import h2o
from h2o.frame import H2OFrame
from h2o.estimators import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch

import helper 

from jupyterthemes import jtplot
# NOTE(review): presumably restores the default matplotlib styling before plotting — confirm against the jupyterthemes docs.
jtplot.reset()
# jtplot.style(theme='onedork')

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Read the raw conversion records and preview the first five rows.
df = pd.read_csv('conversion_data.csv')
df.head(n=5)

# Exploratory Analysis

In [None]:
# A cell renders only its last expression, so every summary is displayed
# explicitly; otherwise `describe()` and the missing-value counts are lost.
display(df.describe())          # summary statistics of the numeric columns
df.info()                       # prints column dtypes and non-null counts
display(df.isna().sum(axis=0))  # missing values per column
display(df.nunique())           # distinct values per column

# How features affect target

In [87]:
# Overall conversion rate: converted rows divided by all rows.
# The aggregations are named by string; passing `np.sum` to `agg` is
# deprecated in newer pandas versions.
cr = df['converted'].agg(['count', 'sum'])
cr['conversion_rate'] = cr['sum'] / cr['count']
cr

count              316200.000000
sum                 10200.000000
conversion_rate         0.032258
Name: converted, dtype: float64

We have a very imbalanced classification problem!

In [None]:
# Conversion rate for each country. `converted` is a 0/1 flag, so its
# per-group mean is the conversion rate.
temp = df.groupby('country')['converted'].mean()

fig, ax = plt.subplots()
ax.plot(temp.index, temp.values, 'o-')
# The x axis holds countries here, not page counts.
ax.set_xlabel('country')
ax.set_ylabel('conversion rate')

In [None]:
# Distribution of age, with axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df['age'])
ax.set_xlabel('age')
ax.set_ylabel('count')

In [None]:
# Keep only the rows whose age is 60 or below.
df = df[df['age'] <= 60]

In [None]:
# Conversion rate for each age. `converted` is a 0/1 flag, so its per-group
# mean is the conversion rate.
temp = df.groupby('age')['converted'].mean()

fig, ax = plt.subplots()
ax.plot(temp.index, temp.values, 'o-')
# The x axis holds ages here, not page counts.
ax.set_xlabel('age')
ax.set_ylabel('conversion rate')

In [None]:
# Conversion rate split by the `new_user` flag.
df['converted'].groupby(df['new_user']).mean()

In [None]:
# Conversion rate split by traffic source.
df['converted'].groupby(df['source']).mean()

In [None]:
# Conversion rate split by the number of pages visited.
df['converted'].groupby(df['total_pages_visited']).mean()

The effects of the different features are quite clear. We should be able to build a model with good prediction accuracy.

In [None]:
# Pearson correlation between the numeric columns, target included.
columns = ['age', 'new_user', 'total_pages_visited', 'converted']
corr = df.loc[:, columns].corr(method='pearson')
corr

In [None]:
# Colour scale fixed to [-1, 1] and centred on zero so the sign of each correlation is visible.
sns.heatmap(data=corr, vmin=-1, vmax=1, center=0)

In [None]:
# df.plot.bar(x='source', y='converted')

In [None]:
# Bar height is seaborn's default estimate of `converted` within each source.
sns.barplot(data=df, x='source', y='converted')

In [None]:
# Preview the frame again before encoding it for modelling.
df.head()

In [None]:
# One-hot encode the frame with pandas' default column selection.
data = pd.get_dummies(data=df)
data.head(n=5)

In [None]:
# Model inputs are all encoded columns except the label.
target = 'converted'
features = [col for col in data.columns if col != target]
features

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the rare
# positive class equally represented in both parts, and the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[features], data[target],
    test_size=0.2, stratify=data[target], random_state=42)

In [None]:
# Scratch check: 3% of the row count.
len(df) * 0.03

In [None]:
# Baseline random forest with default hyper-parameters on the imbalanced data.
# The fixed seed makes the fitted forest, and every score derived from it,
# reproducible across runs.
clf = RandomForestClassifier(random_state=42)  # class_weight='balanced'
clf.fit(X_train, y_train)

In [None]:
# Score the baseline model on the held-out rows.
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1.
report = classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['not_converted', 'converted'],
)
print(report)

# Raw counts of true versus predicted labels, kept for plotting.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)


In [None]:
# `plot_confusion_matrix` is not defined or imported anywhere in the cells
# above, so the matrix is drawn with seaborn instead. Rows are true labels and
# columns are predicted labels, matching sklearn's `confusion_matrix` layout.
class_names = ['not_converted', 'converted']
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('predicted label')
ax.set_ylabel('true label')

The accuracy on the test set is 98%. The baseline accuracy (always predicting "not converted") is:

In [None]:
# Baseline accuracy: the score obtained by always predicting "not converted",
# i.e. the share of negative rows. The vectorised mean replaces a Python-level
# `sum` over the Series.
1 - df['converted'].mean()

# fit on balanced data set

In [None]:
# Down-sample the not-converted rows to the number of converted rows.
# Boolean masks replace the lambdas, whose parameter name shadowed the global `df`.
data0 = data.loc[data['converted'] == 0]
data1 = data.loc[data['converted'] == 1]
len0, len1 = data0.shape[0], data1.shape[0]

# Sample without replacement; the fixed seed makes the subset reproducible.
data0_resampled = resample(data0, replace=False, n_samples=len1, random_state=42)

data_downsampled = pd.concat([data0_resampled, data1])

In [None]:
# 80/20 split of the balanced data; stratified and seeded so both parts keep
# the same class ratio and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_downsampled[features], data_downsampled[target],
    test_size=0.2, stratify=data_downsampled[target], random_state=42)

# Class counts in the training part.
y_train.value_counts().sort_index()

In [None]:
# Default random forest on the balanced data; seeded so the scores below are
# reproducible.
clf1 = RandomForestClassifier(random_state=42)  # class_weight='balanced'
clf1.fit(X_train, y_train)

# Accuracy on the training part, then on the held-out part.
print(clf1.score(X_train, y_train))
print(clf1.score(X_test, y_test))

## use cross validation to optimize hyper-parameters

In [None]:
# 5-fold grid search over forest size (300, 400, 500 trees) and tree depth.
# The base estimator is seeded so score differences between grid points come
# from the hyper-parameters rather than from forest randomness.
rf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': np.arange(300, 600, 100),
              'max_depth': [4, 5, 6]}
# refit=True retrains the best configuration on the whole training part.
clf_best = GridSearchCV(rf, param_grid, cv=5, refit=True, n_jobs=-1)
clf_best.fit(X_train, y_train)

In [None]:
# Cross-validation results, best-ranked configuration first.
pd.DataFrame(clf_best.cv_results_).sort_values(by='rank_test_score')

In [None]:
# check the classification score of the model trained with balanced data
clf_best.score(X_test, y_test)
y_test_pred = clf_best.predict(X_test)
report = classification_report(y_test, y_test_pred, labels=[0, 1], target_names=['not_converted', 'converted'])
print(report)

In [None]:
# Evaluate on the original, imbalanced data, leaving out the rows the model was
# trained on; scoring those rows as well would inflate the metrics.
# NOTE(review): this relies on the row index being unique and carried over
# unchanged from `data` into `X_train` — confirm.
holdout = data.drop(index=X_train.index)
y_all = holdout[target]
X_all = holdout[features]
y_all_pred = clf_best.predict(X_all)
report = classification_report(y_all, y_all_pred, labels=[0, 1], target_names=['not_converted', 'converted'])
print(report)

So it seems that down-sampling to obtain a balanced dataset does NOT necessarily improve accuracy, not even for the minority class.

# Feature Importance

In [None]:
# Feature importances of the refitted best forest, smallest first.
best_rf = clf_best.best_estimator_
importance = pd.Series(best_rf.feature_importances_, index=features).sort_values()

helper.plot_barh(importance)

The most important feature is total_pages_visited. However, it is difficult for the company to cause people to visit more pages. Also, people probably visit more pages to buy things anyway, so it is probably caused by the action of a purchase, rather than causing a purchase.

Now I will remove `total_pages_visited` since it is not really helpful to the company.

The first split of the decision tree is whether or not it is a new user. Thus, the site should try to incentivize older users to come back. The next split of the tree is on country, and the tree is split on the user not being from China. Since China has a large population, the site should try to increase the number of Chinese visitors. The third level is split on age. Basically, the site works better for young users (less than about 30 years of age). Thus, the site should advertise towards young people to maximize the conversion rate of their target audience.

# use random forest in H2O package 

In [None]:
# Connect to an H2O server (h2o.init starts a local one if none is reachable).
h2o.init()

In [None]:
# Only the last expression of a cell is rendered, so the preview is displayed
# explicitly before the dtypes.
display(df.head())
df.dtypes

In [None]:
# Convert the pandas frame to an H2OFrame and mark the two 0/1 columns as
# categorical (factor) columns.
h2o_df = H2OFrame(df)
for col in ('new_user', 'converted'):
    h2o_df[col] = h2o_df[col].asfactor()

h2o_df.summary()

In [None]:
# 80/20 train/test split; the seed makes the random row assignment reproducible.
train, test = h2o_df.split_frame(ratios=[0.80], seed=42)
# Note: this rebinds `target` and `features`, which the sklearn cells also use.
target = 'converted'
features = [col for col in h2o_df.columns if col != target]

In [None]:
# H2O random forest: 200 trees of depth 6, with class balancing enabled.
rf_params = dict(balance_classes=True, ntrees=200, max_depth=6)
model = H2ORandomForestEstimator(**rf_params)

# The held-out frame is passed as the validation frame.
model.train(x=features, y=target, training_frame=train, validation_frame=test)

In [None]:
# Built-in H2O variable-importance plot for the trained forest.
model.varimp_plot()

In [None]:
# Shows `importance` as it stands before the next cell rebinds it.
# NOTE(review): presumably still the sklearn importances, for comparison — confirm.
importance

In [None]:
# H2O variable importances as a pandas frame keyed by variable name.
varimp = model.varimp(use_pandas=True)
importance = varimp.set_index('variable')
helper.plot_barh(importance['percentage'])


In [None]:
# Predict on the test frame first, then the training frame.
y_test_pred, y_train_pred = (model.predict(frame[features]) for frame in (test, train))
y_train_pred.head()

In [None]:
# Pull the predicted class and the true labels back into pandas for sklearn's
# metrics. `classification_report` is already imported in the first cell, so
# the redundant mid-notebook import is dropped.
y_test_pred_class = y_test_pred.as_data_frame()['predict']
y_test = test.as_data_frame()[target]
y_train = train.as_data_frame()[target]
print(classification_report(y_test, y_test_pred_class, labels=[0, 1], target_names=['not converted', 'converted']))

In [None]:
# Inspect the column dtypes after the H2O frame is converted back to pandas.
h2o_df.as_data_frame().dtypes

In [None]:
# Scores from the `p1` column of the H2O predictions.
y_train_pred_prob = y_train_pred.as_data_frame()['p1']
y_test_pred_prob = y_test_pred.as_data_frame()['p1']

# ROC curve points for each split, with label 1 as the positive class.
fpr_train, tpr_train, thresholds_train = roc_curve(
    y_train, y_train_pred_prob, pos_label=1)
fpr_test, tpr_test, thresholds_test = roc_curve(
    y_test, y_test_pred_prob, pos_label=1)

# Train (red) against test (blue).
fig, ax = plt.subplots()
ax.plot(fpr_train, tpr_train, 'r-', label='train')
ax.plot(fpr_test, tpr_test, 'b-', label='test')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.legend()

# Displayed as (test AUC, train AUC).
(roc_auc_score(y_test, y_test_pred_prob),
 roc_auc_score(y_train, y_train_pred_prob))

In [None]:
# `thresholds`, `fpr` and `tpr` are never defined; the ROC cell above produces
# `thresholds_test`, `fpr_test` and `tpr_test`, so the test-set curves are
# plotted here.
# The first threshold returned by `roc_curve` is an artificial value above
# every score ("predict nothing"), so it is skipped to keep the x axis within
# the range of the predicted probabilities.
fig, ax = plt.subplots()
ax.plot(thresholds_test[1:], fpr_test[1:], 'r-', label='fpr')
ax.plot(thresholds_test[1:], tpr_test[1:], 'g-', label='tpr')
ax.set_xlabel('threshold')
ax.set_ylabel('TPR, FPR')
ax.legend()