In [1]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import scipy.stats as ss
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.feature_selection import chi2, f_classif
seed = 999

In [2]:
# Read the A/B test results keyed by user_id and shorten the verbose
# 'operative_system' column name to 'os' for the rest of the analysis.
testdata = (
    pd.read_csv('test_results.csv', index_col='user_id')
    .rename(columns={'operative_system': 'os'})
)
testdata.head()

Unnamed: 0_level_0,timestamp,source,device,os,test,price,converted
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
604839,2015-05-08 03:38:34,ads_facebook,mobile,iOS,0,39,0
624057,2015-05-10 21:08:46,seo-google,mobile,android,0,39,0
317970,2015-04-04 15:01:23,ads-bing,mobile,android,0,39,0
685636,2015-05-07 07:26:01,direct_traffic,mobile,iOS,1,59,0
820854,2015-05-24 11:04:40,ads_facebook,web,mac,0,39,0


In [3]:
# Summary statistics of the numeric columns (test, price, converted).
testdata.describe()

Unnamed: 0,test,price,converted
count,316800.0,316800.0,316800.0
mean,0.360079,46.205051,0.018333
std,0.480024,9.601487,0.134154
min,0.0,39.0,0.0
25%,0.0,39.0,0.0
50%,0.0,39.0,0.0
75%,1.0,59.0,0.0
max,1.0,59.0,1.0


In [4]:
# Column dtypes and non-null counts, to check for missing values.
testdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 316800 entries, 604839 to 832372
Data columns (total 7 columns):
timestamp    316800 non-null object
source       316800 non-null object
device       316800 non-null object
os           316800 non-null object
test         316800 non-null int64
price        316800 non-null int64
converted    316800 non-null int64
dtypes: int64(3), object(4)
memory usage: 19.3+ MB


In [5]:
# Work on a copy of the data without the timestamp column, which for some
# reason cannot be processed with pandas to_datetime.
X = testdata.drop('timestamp', axis=1)

# One LabelEncoder per categorical column.
src_label_encoder = LabelEncoder()
os_label_encoder = LabelEncoder()
device_label_encoder = LabelEncoder()

# Replace the string values of source, os and device with integer codes.
for column, encoder in [('source', src_label_encoder),
                        ('os', os_label_encoder),
                        ('device', device_label_encoder)]:
    X[column] = encoder.fit_transform(testdata[column])

In [6]:
# Preview the label-encoded frame.
X.head()

Unnamed: 0_level_0,source,device,os,test,price,converted
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
604839,3,0,1,0,39,0
624057,8,0,0,0,39,0
317970,0,0,0,0,39,0
685636,5,0,1,1,59,0
820854,3,1,3,0,39,0


## Answer 1
Should the company sell its software for 39 or 59?

Sales revenue is given by # of units sold times unit price

In [7]:
# Revenue contributed by each row: converted flag times the price shown,
# computed separately within each value of the test flag.
revenues = testdata.groupby(testdata['test']).apply(
    lambda group: group['converted'] * group['price']
)
# Split into the control (test == 0) and test (test == 1) samples.
ctrl_revenues, test_revenues = revenues[0], revenues[1]

In [8]:
def group_stats(df):
    """Summarise one experiment group.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single group; must contain 'converted' and 'price' columns.

    Returns
    -------
    pandas.Series
        'n_users' (number of rows), 'converted' (mean of the converted
        column) and 'mean_revenue' (mean of converted * price).
    """
    n_users = len(df)
    conversion_rate = df['converted'].mean()
    revenue_per_row = df['converted'] * df['price']
    summary = {'n_users': n_users,
               'converted': conversion_rate,
               'mean_revenue': revenue_per_row.mean()}
    return pd.Series(summary)
# Row count, conversion rate and mean revenue for each value of the test flag.
testdata.groupby('test').apply(group_stats)

Unnamed: 0_level_0,converted,mean_revenue,n_users
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.019904,0.776734,202727.0
1,0.015543,0.916843,114073.0


Let's run a t-test to check whether the test group's average revenue is greater than the control group's average revenue.
* H0 : Mean of test group revenue is equal to mean of control group revenue.
* HA : Mean of test group revenue is greater than the mean of control group revenue.

In [9]:
# ttest_ind returns a two-tailed p-value (equal_var=False selects Welch's
# unequal-variance variant).
# Our HA is one-sided (test_mean > ctrl_mean). Halving the two-tailed p-value
# is only valid when the t statistic is positive, i.e. points in the direction
# of HA; if it were negative the one-sided p-value would be 1 - p/2 instead.
ttest_result = ss.ttest_ind(test_revenues, ctrl_revenues, equal_var=False)
half_p = ttest_result.pvalue / 2
half_p if ttest_result.statistic > 0 else 1 - half_p

7.7037493023391909e-09

The obtained p-value is less than 0.05, so we reject the null hypothesis in favour of the alternative hypothesis. From this we can conclude that the test group's average revenue is greater than the control group's, assuming users were assigned to the test and control groups randomly, as claimed.

## Answer 2

In [10]:
# Test data after label encoding (source, device and os are integer codes).
X.head()

Unnamed: 0_level_0,source,device,os,test,price,converted
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
604839,3,0,1,0,39,0
624057,8,0,0,0,39,0
317970,0,0,0,0,39,0
685636,5,0,1,1,59,0
820854,3,1,3,0,39,0


Run a chi-square test to see which features impact the conversion rate the most.

In [11]:
# NOTE(review): sklearn's chi2 is documented for non-negative features such as
# counts or frequencies. Here it receives arbitrary integer label codes (and the
# raw price), so the scores may depend on how the categories happen to be
# encoded -- TODO confirm with a contingency-table chi-square test.
col_names = ['source', 'device', 'os', 'price']
ch2_scores, pvalues = chi2(X[col_names], X['converted'])
chi2_table = pd.DataFrame({'chi_score': ch2_scores, 'p_values': pvalues}, index=col_names)
# Smallest p-value first.
chi2_table.sort_values('p_values')

Unnamed: 0,chi_score,p_values
price,150.992849,1.051844e-34
os,7.642955,0.005699447
source,2.373391,0.1234187
device,0.72949,0.3930485


So, from the above test it can be seen that price impacts the conversion rate the most, followed by OS.

In [12]:
# How much does price affect the conversion?
# Share of each value of 'converted' (0 / 1) within every price point.
conv_by_price = (
    testdata
    .groupby('price')['converted']
    .apply(lambda flags: flags.value_counts(normalize=True))
    .unstack()
)
conv_by_price

Unnamed: 0_level_0,0,1
price,Unnamed: 1_level_1,Unnamed: 2_level_1
39,0.980111,0.019889
59,0.98443,0.01557


In [13]:
# How OS affects the conversion.
# Share of each value of 'converted' (0 / 1) within every operating system.
conv_by_os = (
    testdata
    .groupby('os')['converted']
    .apply(lambda flags: flags.value_counts(normalize=True))
    .unstack()
)
conv_by_os

Unnamed: 0_level_0,0,1
os,Unnamed: 1_level_1,Unnamed: 2_level_1
android,0.985067,0.014933
iOS,0.977678,0.022322
linux,0.991778,0.008222
mac,0.976002,0.023998
other,0.98704,0.01296
windows,0.983045,0.016955


## Model by decision tree
Build a decision tree to get feature importances. Since the question does not ask for a model that predicts conversion but for actionable insights that lead to higher conversion, a shallow decision tree can be fit on all the data. There is no need to evaluate the model's performance on a held-out test set.

In [14]:
testdata.head() # take a peek at the data

Unnamed: 0_level_0,timestamp,source,device,os,test,price,converted
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
604839,2015-05-08 03:38:34,ads_facebook,mobile,iOS,0,39,0
624057,2015-05-10 21:08:46,seo-google,mobile,android,0,39,0
317970,2015-04-04 15:01:23,ads-bing,mobile,android,0,39,0
685636,2015-05-07 07:26:01,direct_traffic,mobile,iOS,1,59,0
820854,2015-05-24 11:04:40,ads_facebook,web,mac,0,39,0


In [15]:
# Start again from the raw data, leaving out the timestamp and test columns.
X = testdata.drop(['timestamp', 'test'], axis=1)
X.head()

Unnamed: 0_level_0,source,device,os,price,converted
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
604839,ads_facebook,mobile,iOS,39,0
624057,seo-google,mobile,android,39,0
317970,ads-bing,mobile,android,39,0
685636,direct_traffic,mobile,iOS,59,0
820854,ads_facebook,web,mac,39,0


Let's get some insight on the features

In [16]:
# Row counts per traffic source.
X['source'].value_counts()

direct_traffic     60357
ads-google         59379
ads_facebook       53396
ads_other          29876
seo-google         23175
ads-bing           22873
seo_facebook       21205
friend_referral    20695
seo-other           9260
ads-yahoo           7583
seo-yahoo           6848
seo-bing            2153
Name: source, dtype: int64

In [17]:
# Row counts per device type.
X['device'].value_counts()

mobile    186471
web       130329
Name: device, dtype: int64

In [18]:
# Row counts per operating system.
X['os'].value_counts()

windows    100976
iOS         95465
android     74935
mac         25085
other       16204
linux        4135
Name: os, dtype: int64

In [19]:
# One-hot encode the string columns (source, device, os) with get_dummies;
# the numeric price and converted columns are left as they are.
X = pd.get_dummies(X)
X.head()

Unnamed: 0_level_0,price,converted,source_ads-bing,source_ads-google,source_ads-yahoo,source_ads_facebook,source_ads_other,source_direct_traffic,source_friend_referral,source_seo-bing,...,source_seo-yahoo,source_seo_facebook,device_mobile,device_web,os_android,os_iOS,os_linux,os_mac,os_other,os_windows
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
604839,39,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
624057,39,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
317970,39,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
685636,59,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,0,0
820854,39,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [20]:
# A categorical feature with K different values needs only K-1 indicator columns.
# get_dummies has a 'drop_first' parameter, but it cannot specify which level
# to drop, so the chosen reference levels are removed by hand.
reference_levels = ['source_ads_other', 'device_web', 'os_other']
X = X.drop(reference_levels, axis=1)
X.head()

Unnamed: 0_level_0,price,converted,source_ads-bing,source_ads-google,source_ads-yahoo,source_ads_facebook,source_direct_traffic,source_friend_referral,source_seo-bing,source_seo-google,source_seo-other,source_seo-yahoo,source_seo_facebook,device_mobile,os_android,os_iOS,os_linux,os_mac,os_windows
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
604839,39,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0
624057,39,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0
317970,39,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
685636,59,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0
820854,39,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0


In [21]:
# pop() removes 'converted' from X in place, so X afterwards holds only the
# predictor columns. X_train is a second name for that same frame, not a copy.
y_train = X.pop('converted')
X_train = X

In [22]:
# Shallow tree (depth 4) so that the resulting splits stay readable.
# random_state is set from the notebook-level seed so that repeated runs build
# the same tree; previously the seed was defined but never used.
dt = DecisionTreeClassifier(max_depth=4, random_state=seed)
dt.fit(X_train, y_train)
# Feature names are taken from the frame the tree was fitted on.
# out_file is named explicitly because its default differs between
# scikit-learn versions ("tree.dot" before 0.20, None afterwards).
export_graphviz(dt, out_file='tree.dot', feature_names=X_train.columns,
                proportion=True, leaves_parallel=True)



<img src='tree.png'></img>

From the above tree diagram we can conclude that the path to the leaf node with the highest conversion rate (the third leaf from the right, with conversion = 0.05) is:
1. Source 'friend_referral' = True
2. price <= 49, i.e. price = 39
3. os 'iOS' = True

In [23]:
# Feature importances of the fitted tree, largest first.
pd.Series(dt.feature_importances_, index=X_train.columns).sort_values(ascending=False)

source_friend_referral    0.582278
os_iOS                    0.136605
os_mac                    0.093907
source_direct_traffic     0.059002
source_ads_facebook       0.041034
source_ads-google         0.038543
price                     0.020118
os_linux                  0.018002
source_ads-bing           0.010511
source_ads-yahoo          0.000000
os_windows                0.000000
source_seo-bing           0.000000
source_seo-other          0.000000
source_seo-yahoo          0.000000
source_seo_facebook       0.000000
device_mobile             0.000000
os_android                0.000000
source_seo-google         0.000000
dtype: float64

## Actionable Insights
* Friend referral, iOS and Mac seem to influence the conversion rate.
* If we want to increase the price, which impacts the conversion rate negatively, we must compensate through the other two factors:
    * launch a special marketing program targeting Apple users
    * run a program which rewards users who invite their acquaintances to use the software
* Linux users do not seem to convert as much as others. Maybe the development team should look into compatibility and other issues of the software w.r.t. Linux.