# Capstone Project - Google Predictive Analytics

## Part_5: Modeling 

### Performs some more EDA and Modeling

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')
%matplotlib inline

In [3]:
df = pd.read_csv("../capstone_data/kaggle_data/df_combined_tables.csv")

In [4]:
df.head()

Unnamed: 0,channelGrouping,date,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,browser,deviceCategory,operatingSystem,...,campaign,isTrueDirect,keyword,referralPath,source,bounces,hits,newVisits,pageviews,transactionRevenue
0,Organic Search,1970-01-01 00:00:00.020160902,1.13166e+18,1.13166e+28,1472830385,1,1472830385,Chrome,desktop,Windows,...,(not set),-999,(not provided),-999,google,1,1,1,1,0.0
1,Organic Search,1970-01-01 00:00:00.020160902,3.77306e+17,3.77306e+27,1472880147,1,1472880147,Firefox,desktop,Macintosh,...,(not set),-999,(not provided),-999,google,1,1,1,1,0.0
2,Organic Search,1970-01-01 00:00:00.020160902,3.895546e+18,3.895546e+28,1472865386,1,1472865386,Chrome,desktop,Windows,...,(not set),-999,(not provided),-999,google,1,1,1,1,0.0
3,Organic Search,1970-01-01 00:00:00.020160902,4.763447e+18,4.763447e+28,1472881213,1,1472881213,UC Browser,desktop,Linux,...,(not set),-999,google + online,-999,google,1,1,1,1,0.0
4,Organic Search,1970-01-01 00:00:00.020160902,2.729444e+16,2.729444e+26,1472822600,2,1472822600,Chrome,mobile,Android,...,(not set),True,(not provided),-999,google,1,1,0,1,0.0


In [5]:
# going ahead to drop city because it shows quite a bit of 'not available in demo dataset' 
# but the continent is present which should be sufficient. Also isTrueDirect, keyword and referralPath would be dropped 
# because of too many nan values and some of the content with other columns could be used to explain these columns. 
df = df.drop(['city', 'isTrueDirect', 'keyword', "referralPath", 'campaign', 'date'], axis=1)

In [6]:
# I will be dropping some more categorical varibles due to memory errors when getting dummies. 
# We are working with over 900,000 records. 
df = df.drop(['operatingSystem', "networkDomain"], axis=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903653 entries, 0 to 903652
Data columns (total 16 columns):
channelGrouping       903653 non-null object
fullVisitorId         903653 non-null float64
sessionId             903653 non-null float64
visitId               903653 non-null int64
visitNumber           903653 non-null int64
visitStartTime        903653 non-null int64
browser               903653 non-null object
deviceCategory        903653 non-null object
continent             903653 non-null object
adContent             903653 non-null object
source                903653 non-null object
bounces               903653 non-null int64
hits                  903653 non-null int64
newVisits             903653 non-null int64
pageviews             903653 non-null int64
transactionRevenue    903653 non-null float64
dtypes: float64(3), int64(7), object(6)
memory usage: 110.3+ MB


In [8]:
df.head()

Unnamed: 0,channelGrouping,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,browser,deviceCategory,continent,adContent,source,bounces,hits,newVisits,pageviews,transactionRevenue
0,Organic Search,1.13166e+18,1.13166e+28,1472830385,1,1472830385,Chrome,desktop,Asia,-999,google,1,1,1,1,0.0
1,Organic Search,3.77306e+17,3.77306e+27,1472880147,1,1472880147,Firefox,desktop,Oceania,-999,google,1,1,1,1,0.0
2,Organic Search,3.895546e+18,3.895546e+28,1472865386,1,1472865386,Chrome,desktop,Europe,-999,google,1,1,1,1,0.0
3,Organic Search,4.763447e+18,4.763447e+28,1472881213,1,1472881213,UC Browser,desktop,Asia,-999,google,1,1,1,1,0.0
4,Organic Search,2.729444e+16,2.729444e+26,1472822600,2,1472822600,Chrome,mobile,Europe,-999,google,1,1,0,1,0.0


In [9]:
df_categorical = list(df.select_dtypes(include=['object']).columns)

In [10]:
df[df_categorical].head(10)

Unnamed: 0,channelGrouping,browser,deviceCategory,continent,adContent,source
0,Organic Search,Chrome,desktop,Asia,-999,google
1,Organic Search,Firefox,desktop,Oceania,-999,google
2,Organic Search,Chrome,desktop,Europe,-999,google
3,Organic Search,UC Browser,desktop,Asia,-999,google
4,Organic Search,Chrome,mobile,Europe,-999,google
5,Organic Search,Chrome,desktop,Europe,-999,google
6,Organic Search,Chrome,desktop,Asia,-999,google
7,Organic Search,Chrome,desktop,Oceania,-999,google
8,Organic Search,Internet Explorer,desktop,Europe,-999,google
9,Organic Search,Firefox,desktop,Europe,-999,google


In [11]:
df_dummies = pd.get_dummies(df[df_categorical])
df_dummies.head()

Unnamed: 0,channelGrouping_(Other),channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,channelGrouping_Organic Search,channelGrouping_Paid Search,channelGrouping_Referral,channelGrouping_Social,browser_(not set),browser_0,...,source_web.whatsapp.com,source_wheretoget.it,source_wunderbin.corp.google.com,source_x20web.corp.google.com,source_xbidprodmirror.corp.google.com,source_yahoo,source_yahoo.com,source_yandex,source_youtube.com,source_yt-go-12345.googleplex.com
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_dummies.shape

(903653, 496)

In [13]:
numerical_features = list(df._get_numeric_data().columns)

In [14]:
df[numerical_features].head()

Unnamed: 0,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,bounces,hits,newVisits,pageviews,transactionRevenue
0,1.13166e+18,1.13166e+28,1472830385,1,1472830385,1,1,1,1,0.0
1,3.77306e+17,3.77306e+27,1472880147,1,1472880147,1,1,1,1,0.0
2,3.895546e+18,3.895546e+28,1472865386,1,1472865386,1,1,1,1,0.0
3,4.763447e+18,4.763447e+28,1472881213,1,1472881213,1,1,1,1,0.0
4,2.729444e+16,2.729444e+26,1472822600,2,1472822600,1,1,0,1,0.0


In [15]:
df[numerical_features].shape

(903653, 10)

##### Combining categorical and numerical features

In [16]:
X_df = pd.concat([df_dummies, df[numerical_features]], axis=1)

In [17]:
X_df = X_df.drop('transactionRevenue', axis=1)

In [18]:
X_df.head()

Unnamed: 0,channelGrouping_(Other),channelGrouping_Affiliates,channelGrouping_Direct,channelGrouping_Display,channelGrouping_Organic Search,channelGrouping_Paid Search,channelGrouping_Referral,channelGrouping_Social,browser_(not set),browser_0,...,source_yt-go-12345.googleplex.com,fullVisitorId,sessionId,visitId,visitNumber,visitStartTime,bounces,hits,newVisits,pageviews
0,0,0,0,0,1,0,0,0,0,0,...,0,1.13166e+18,1.13166e+28,1472830385,1,1472830385,1,1,1,1
1,0,0,0,0,1,0,0,0,0,0,...,0,3.77306e+17,3.77306e+27,1472880147,1,1472880147,1,1,1,1
2,0,0,0,0,1,0,0,0,0,0,...,0,3.895546e+18,3.895546e+28,1472865386,1,1472865386,1,1,1,1
3,0,0,0,0,1,0,0,0,0,0,...,0,4.763447e+18,4.763447e+28,1472881213,1,1472881213,1,1,1,1
4,0,0,0,0,1,0,0,0,0,0,...,0,2.729444e+16,2.729444e+26,1472822600,2,1472822600,1,1,0,1


In [19]:
y= df.transactionRevenue

In [20]:
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: transactionRevenue, dtype: float64

#### Train_test_split and model with Linear Regression

In [21]:
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV, LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [22]:
ss = StandardScaler()
ss.fit(X_df)

X_s = ss.transform(X_df)

X_s

array([[-0.01152441, -0.1359686 , -0.4336323 , ..., -0.37302941,
         0.5341481 , -0.40559985],
       [-0.01152441, -0.1359686 , -0.4336323 , ..., -0.37302941,
         0.5341481 , -0.40559985],
       [-0.01152441, -0.1359686 , -0.4336323 , ..., -0.37302941,
         0.5341481 , -0.40559985],
       ...,
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  2.01250841,
         0.5341481 ,  2.44137601],
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  2.01250841,
         0.5341481 ,  2.58372481],
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  2.73854166,
         0.5341481 ,  3.86486395]])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_s, y, random_state=42)

In [24]:
# X_train.corrwith(y_train).sort_values()

In [25]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [26]:
lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.02798355943529396, -1.1352755306492406e+26)

In [27]:
# pipe = Pipeline([
#     #('pf', PolynomialFeatures()),
#     ('ss', StandardScaler()),
#     ('lr', LinearRegression())
# ])

# pipe.fit(X_train, y_train)

# print(pipe.score(X_train, y_train))
# print(pipe.score(X_test, y_test))

##### Using Ridge and lasso regressing

In [28]:
# rcv = RidgeCV(alphas=np.logspace(0, 5,100), cv = 3)

# rcv.fit(X_train, y_train)

# rcv.score(X_train, y_train), rcv.score(X_test, y_test)

In [27]:
lcv = LassoCV(cv=3)

lcv.fit(X_train, y_train)

lcv.score(X_train, y_train), lcv.score(X_test, y_test)

(0.028095641353674154, 0.029466466384250323)

In [29]:
variables = X_df.columns
coef_values = lcv.coef_

variables_dictionary = {
    'variables': variables,
    'coefficient': coef_values
}

In [31]:
pd.DataFrame(variables_dictionary).set_index('variables').sort_values('coefficient', ascending=False).head(20)

Unnamed: 0_level_0,coefficient
variables,Unnamed: 1_level_1
pageviews,7.297421
visitNumber,2.244989
bounces,1.914733
hits,1.501596
source_dfa,0.920334
source_youtube.com,0.710913
source_mall.googleplex.com,0.6755
visitId,0.607395
source_mail.google.com,0.499084
deviceCategory_desktop,0.388821
