 Objective:
 1. Explore the given dataset and extract insights.
 2. Build a model to predict revenue per customer.

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# json_normalize is exposed at the pandas top level since pandas 1.0; the
# pandas.io.json location has been deprecated since then, so it is only used
# as a fallback for older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# List the competition input files available to the notebook.
print(os.listdir("../input"))

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity='all'

Loading the files: some of the columns are stored in JSON format, so we flatten them into regular tabular columns while reading the CSV.

In [None]:
def load_dataset(path,nrows=None):
    """Read a visits CSV and flatten its JSON columns into plain columns.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pd.read_csv``.
    nrows : int, optional
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in file order, followed by one column per JSON
        field named ``"<json column>.<field>"`` (nested fields are joined
        with further dots).
    """
    cols=['channelGrouping', 'date', 'device', 'fullVisitorId', 'geoNetwork',
        'socialEngagementType', 'totals', 'trafficSource', 'visitId','customDimensions',
        'visitNumber', 'visitStartTime']
    json_cols=['device','geoNetwork', 'totals', 'trafficSource']
    df = pd.read_csv(path,
                     converters={column: json.loads for column in json_cols},
                     dtype={'fullVisitorId': 'str'},  # keep leading zeros of the id
                     nrows=nrows, usecols=cols)

    # pandas.json_normalize is the supported entry point since pandas 1.0;
    # fall back to the old location only when it is missing.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    # Collect the flattened pieces and concatenate once instead of merging the
    # growing frame once per JSON column.
    parts = [df.drop(columns=json_cols)]
    for column in json_cols:
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        column_as_df.index = df.index  # align rows with the source frame
        parts.append(column_as_df)
    return pd.concat(parts, axis=1)

In [None]:
%%time
# Load and flatten the training CSV; the cell magic above reports how long it takes.
train=load_dataset('../input/train_v2.csv')

In [None]:
%%time
# Load and flatten the test CSV; the cell magic above reports how long it takes.
test=load_dataset('../input/test_v2.csv')

In [None]:
# Understanding the data and features in the train and test sets.
# The training set has 59 features while the test set has 58:
# 'trafficSource.campaignCode' is missing from the test set.
train.shape
test.shape

train.head(5)
test.head(5)
train.columns.difference(test.columns)
test.columns.difference(train.columns)
train.dtypes

# Drop the train-only column. errors='ignore' keeps the cell re-runnable:
# a plain `del` raises KeyError once the column is already gone.
train = train.drop(columns=['trafficSource.campaignCode'], errors='ignore')

In [None]:
# Tag every row with the split it came from via a constant 'set' column.
for frame, split_name in ((train, 'Train'), (test, 'Test')):
    frame['set'] = split_name

In [None]:
# Columns that hold one and the same value on every row cannot help the model, so drop them.
# Three safeguards compared with a naive per-frame `nunique() == 1` sweep:
#   * 'set' (the Train/Test marker) is constant within each frame by construction
#     and must survive, otherwise the combined data cannot be split back later.
#   * dropna=False: a column that is "one value or missing" still separates rows,
#     so it is not treated as constant.
#   * the list is computed once on train and applied to both frames so that their
#     columns stay aligned for the concatenation that follows.
protected_cols = {'set'}
constant_cols = [
    col for col in train.columns
    if col not in protected_cols and train[col].nunique(dropna=False) == 1
]
print(f'Dropping {len(constant_cols)} constant columns: {constant_cols}')

train = train.drop(columns=constant_cols)
test = test.drop(columns=constant_cols, errors='ignore')

In [None]:
# Combine the train and test sets for preprocessing and exploratory analysis;
# they are split back apart afterwards.
train.shape
test.shape
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# row labels of train and test repeat, which makes label-based alignment ambiguous.
# sort=False keeps the column order of the inputs.
data = pd.concat([train, test], axis=0, ignore_index=True, sort=False)

In [None]:
# Show the combined frame's columns and dimensions, then remove 'customDimensions'.
data.columns
data.shape
del data['customDimensions']

In [None]:
# For each column, report the percentage of missing values and, for object
# columns, the percentage of the placeholder 'not available in demo dataset'.
total_rows = data.shape[0]
for col in data.columns:
    column = data[col]
    n_missing = column.isnull().sum()
    if n_missing > 0:
        rate = n_missing * 100 / total_rows
        print(f'Column {col} has {rate:.4f}% missing values.')
    if column.dtype == 'object':
        n_placeholder = (column == 'not available in demo dataset').sum()
        if n_placeholder > 0:
            rate = n_placeholder * 100 / total_rows
            print(f'Column {col} has {rate:.4f}% values not available in dataset.')

In [None]:
# 'date' is read as yyyymmdd values; convert it to a real datetime and derive
# year, month, day of month and weekday for the revenue breakdowns below.
import datetime

# One vectorised parse of the whole column instead of calling pd.to_datetime
# once per row. The dtype guard makes the cell safe to re-run: an already
# converted column no longer matches the '%Y%m%d' format.
if not pd.api.types.is_datetime64_any_dtype(data['date']):
    data['date'] = pd.to_datetime(data['date'].astype(str), format='%Y%m%d')
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
data['weekday'] = data['date'].dt.weekday  # Monday=0 ... Sunday=6

In [None]:
# Analysing the response variable.
# Almost 98.9 percent of the target values are null, i.e. the distribution is heavily imbalanced.
revenue_raw = data['totals.transactionRevenue']
not_null = revenue_raw.notnull().sum()
print(f'No of records generating the transactions : {not_null}')
# This is a share of all rows (0..1), not a count, so label it as such.
null_val = round(revenue_raw.isnull().sum() / data.shape[0], 4)
print(f'Fraction of records with Null transaction : {null_val}')

# Parse the revenue to numbers first, then treat "no transaction" as 0.
# Filling with the string '0' would leave the column as text until a later cast;
# int64 is spelled out so large revenue values cannot overflow a narrower default int.
data['totals.transactionRevenue'] = pd.to_numeric(revenue_raw).fillna(0).astype('int64')

In [None]:
# Left: number of visits per channel - most visits come from Organic Search and Social.
# Right: mean transaction revenue per visit for each channel - Referral has the highest
# average, while the high-traffic channels generate comparatively little revenue.
#
# plt.subplots creates the figure and both axes in one call. Calling
# plt.subplots_adjust() before plt.figure() only adjusted (and left behind) an
# implicit empty figure and had no effect on the real one.
fig, (ax_visits, ax_revenue) = plt.subplots(1, 2, figsize=(15, 5))

data['channelGrouping'].value_counts().plot(kind='bar', color='orange', ax=ax_visits)

# Make sure the revenue is an explicit 64-bit integer before averaging.
data['totals.transactionRevenue'] = data['totals.transactionRevenue'].astype('int64')
channelwise_renew = (
    pd.pivot_table(data=data, index='channelGrouping', values='totals.transactionRevenue', aggfunc='mean')
    .reset_index()
    .sort_values(by='totals.transactionRevenue', ascending=False)
)
sns.barplot(data=channelwise_renew, x='channelGrouping', y='totals.transactionRevenue',
            color='orange', ax=ax_revenue)
ax_revenue.tick_params(axis='x', rotation=90)
plt.show();


In [None]:
# Per channel: the percentage of visits that generated revenue versus those that did not
# (the two percentages of a channel add up to 100).
#
# The shares are computed from an explicit boolean flag. The earlier approach renamed
# value_counts() results via Series.rename(cols=...), which is not a valid argument,
# and relied on result names that differ between pandas versions.
flags = pd.DataFrame({
    'index': data['channelGrouping'].values,
    'Revenue': (data['totals.transactionRevenue'] != 0).values,
})
# The mean of a boolean column is the fraction of True values.
reve_cnt = flags.groupby('index')['Revenue'].mean().mul(100).to_frame()
reve_cnt['NoRevenue'] = 100 - reve_cnt['Revenue']
reve_cnt = reve_cnt.reset_index()

# One figure with three panels; the axes are created together with the figure so no
# stray empty figure is produced by an early plt.subplots_adjust() call.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.barplot(data=reve_cnt, x='index', y='NoRevenue', color='orange', ax=axes[0])
sns.barplot(data=reve_cnt, x='index', y='Revenue', color='red', ax=axes[1])
# Third panel overlays both shares for a direct comparison.
sns.barplot(data=reve_cnt, x='index', y='NoRevenue', color='orange', ax=axes[2])
sns.barplot(data=reve_cnt, x='index', y='Revenue', color='red', ax=axes[2])
for panel in axes:
    panel.tick_params(axis='x', rotation=90)
    panel.set_xlabel('Channel')
plt.show();


In [None]:
# Mean revenue per year (left) and per year and channel (right).
# Revenue decreases from 2016 to 2018, and the same holds for the individual channels.
#
# Figure and axes are created together; a plt.subplots_adjust() issued before
# plt.figure() would only act on an implicit empty figure.
fig, (ax_year, ax_channel) = plt.subplots(1, 2, figsize=(15, 8))
data.groupby('year')['totals.transactionRevenue'].mean().plot(kind='bar', color='orange', ax=ax_year)
year_revenew = pd.pivot_table(data=data, index=['year', 'channelGrouping'],
                              values='totals.transactionRevenue', aggfunc='mean').reset_index()
sns.barplot(data=year_revenew, x='year', y='totals.transactionRevenue',
            hue='channelGrouping', ax=ax_channel)
plt.show();

In [None]:
# Mean revenue per month (left, all years pooled) and per month and year (right).
# There are spikes in the revenue, and a few large transactions inflate the variance in
# some months. In 2016 and 2017 revenue appears to fall gradually from August to November
# and then recover slowly.
#
# Figure and axes are created together; a plt.subplots_adjust() issued before
# plt.figure() would only act on an implicit empty figure.
fig, (ax_bar, ax) = plt.subplots(1, 2, figsize=(15, 8))
year_revenew = pd.pivot_table(data=data, index=['month', 'year'],
                              values='totals.transactionRevenue', aggfunc='mean').reset_index()
sns.barplot(data=year_revenew, x='month', y='totals.transactionRevenue', color='orange', ax=ax_bar)
sns.lineplot(data=year_revenew, x='month', y='totals.transactionRevenue',
             hue='year', markers=True, style='year', ax=ax)
# Pass the on/off flag positionally: the keyword `b` was renamed to `visible`
# in Matplotlib 3.5, while the positional form works on both sides of that change.
ax.grid(True, which='major', color='w', linewidth=1.0)
ax.set_xticks(range(0, 13))
plt.show();

In [None]:
# Mean transaction revenue per calendar day.
# Spikes show up in April 2017 and again in July 2018 - possibly due to offers; still to be checked.
plt.figure(figsize=(15,8))
daily_mean_revenue = data.groupby('date')['totals.transactionRevenue'].mean()
asp = daily_mean_revenue.plot()

In [None]:
# Mean transaction revenue for each weekday value.
weekday_means = data.pivot_table(index='weekday', values='totals.transactionRevenue', aggfunc='mean')
year_revenew = weekday_means.reset_index()
sns.lineplot(x='weekday', y='totals.transactionRevenue', data=year_revenew)
plt.show();

In [None]:
# Remove the 'visitId' and 'visitNumber' columns.
# 'visitStartTime' is kept on purpose - it may still be useful.
data.columns
data.drop(columns=['visitId', 'visitNumber'], inplace=True)

In [None]:
# Mean transaction revenue per browser, restricted to browsers with a positive mean.
# Many browsers are not specified because cookies are disabled.
# Firefox and Chrome bring in most of the revenue, so focusing on them could help increase it.
browser_mean = data.groupby('device.browser')['totals.transactionRevenue'].mean()
browz = browser_mean.sort_index().to_frame()
paying_browsers = browz.loc[browz['totals.transactionRevenue'] > 0]
paying_browsers.sort_values(by='totals.transactionRevenue', ascending=False).plot(kind='bar')

In [None]:
# Mean revenue by device category (left) and by channel split per device category (right).
# Most of the revenue comes from desktop, although the revenue amounts contain outliers.
# Grouped by channel, the Display channel accessed from desktop stands out.
#
# Figure and axes are created together; a plt.subplots_adjust() issued before
# plt.figure() would only act on an implicit empty figure.
fig, (ax_device, ax_channel) = plt.subplots(1, 2, figsize=(15, 6))
chanel_device = pd.pivot_table(data=data, index=['channelGrouping', 'device.deviceCategory'],
                               values='totals.transactionRevenue', aggfunc='mean').reset_index()
sns.barplot(data=chanel_device, x='device.deviceCategory', y='totals.transactionRevenue', ax=ax_device)
ax_device.set_xlabel('DeviceCategory')
data['device.deviceCategory'].value_counts()
sns.barplot(data=chanel_device, x='channelGrouping', y='totals.transactionRevenue',
            hue='device.deviceCategory', ax=ax_channel)
ax_channel.tick_params(axis='x', rotation=90)
plt.show();

In [None]:
# Visit counts for mobile versus non-mobile devices, then the mean revenue of each
# group rounded to two decimals. Revenue through mobile is low compared to desktop.
data['device.isMobile'].value_counts()
mobile_mean = data.groupby('device.isMobile')['totals.transactionRevenue'].mean()
mobile_mean.round(2).plot(kind='bar')

In [None]:
# Visits per operating system (left) and mean revenue per operating system, coloured by
# device category (middle) and by channel (right).
# Windows and Mac account for most of the revenue, but the plots also show variance and outliers.
#
# Figure and axes are created together; a plt.subplots_adjust() issued before
# plt.figure() would only act on an implicit empty figure.
fig, (ax_count, ax_device, ax_channel) = plt.subplots(1, 3, figsize=(20, 6))
data['device.operatingSystem'].value_counts().plot(kind='bar', color='orange', ax=ax_count)

op_device = pd.pivot_table(data=data,
                           index=['device.deviceCategory', 'device.operatingSystem', 'channelGrouping'],
                           values='totals.transactionRevenue', aggfunc='mean').reset_index()
# Only combinations with a positive mean revenue are plotted.
op_device1 = op_device[op_device['totals.transactionRevenue'] > 0]

sns.barplot(data=op_device1, x='device.operatingSystem', y='totals.transactionRevenue',
            hue='device.deviceCategory', dodge=False, ax=ax_device)
ax_device.tick_params(axis='x', rotation=90)

sns.barplot(data=op_device1, x='device.operatingSystem', y='totals.transactionRevenue',
            hue='channelGrouping', dodge=False, ax=ax_channel)
ax_channel.legend(loc='upper left')
ax_channel.tick_params(axis='x', rotation=90)
plt.show();

In [None]:
# Mean revenue per city: the 50 cities with the highest positive mean.
plt.figure(figsize=(24,6))
city_rev = data.groupby('geoNetwork.city')['totals.transactionRevenue'].mean().reset_index()
paying_cities = city_rev['totals.transactionRevenue'] > 0
city_rev1 = city_rev.loc[paying_cities].sort_values(by='totals.transactionRevenue', ascending=False)
sns.barplot(x='geoNetwork.city', y='totals.transactionRevenue', data=city_rev1.head(50), color='orange')
plt.xticks(rotation=90)
plt.show();

In [None]:
# Mean revenue per continent (left) and for the 15 countries with the highest positive
# mean (right). The highest average revenue comes from the Americas.
#
# Figure and axes are created together; a plt.subplots_adjust() issued before
# plt.figure() would only act on an implicit empty figure.
fig, (ax_continent, ax_country) = plt.subplots(1, 2, figsize=(15, 6))
data.groupby('geoNetwork.continent')['totals.transactionRevenue'].mean().plot(
    kind='bar', color='orange', ax=ax_continent)

country_rev = pd.DataFrame(data.groupby('geoNetwork.country')['totals.transactionRevenue'].mean()).reset_index()
country_rev = country_rev[country_rev['totals.transactionRevenue'] > 0].sort_values(
    by='totals.transactionRevenue', ascending=False)
sns.barplot(data=country_rev.head(15), x='geoNetwork.country', y='totals.transactionRevenue',
            color='orange', ax=ax_country)
ax_country.tick_params(axis='x', rotation=90)
plt.show();

In [None]:
# Mean revenue per metro area: the 30 metros with the highest positive mean.
plt.figure(figsize=(15,6))
metro_mean = data.groupby('geoNetwork.metro')['totals.transactionRevenue'].mean().reset_index()
metro_rev = metro_mean.loc[metro_mean['totals.transactionRevenue'] > 0]
metro_rev = metro_rev.sort_values(by='totals.transactionRevenue', ascending=False)
sns.barplot(x='geoNetwork.metro', y='totals.transactionRevenue', data=metro_rev.head(30), color='orange')
plt.xticks(rotation=90)
plt.show();

In [None]:
# Mean revenue per region: the 40 regions with the highest positive mean.
plt.figure(figsize=(15,6))
region_mean = data.pivot_table(index=['geoNetwork.region'], values='totals.transactionRevenue',
                               aggfunc='mean').reset_index()
net_reg = region_mean.loc[region_mean['totals.transactionRevenue'] > 0]
net_reg = net_reg.sort_values(by='totals.transactionRevenue', ascending=False)
sns.barplot(x='geoNetwork.region', y='totals.transactionRevenue', data=net_reg.head(40), color='orange')
plt.xticks(rotation=90)
plt.show();

In [None]:
# Store totals.hits as a nullable integer, with missing values defaulting to 1.
# The column may still hold strings (its values come straight from json.loads), so it is
# parsed with pd.to_numeric first; filling an int into a text column and casting the
# resulting mixed object column is fragile across pandas versions.
data['totals.hits'] = pd.to_numeric(data['totals.hits']).fillna(1).astype('Int64')

In [None]:
# Mean revenue for each hit count (only hit counts with a positive mean are drawn).
# There is a visible relation: more hits go along with a higher chance of a transaction.
plt.figure(figsize=(10,5))
hits_rev = (
    data.pivot_table(index='totals.hits', values='totals.transactionRevenue', aggfunc='mean')
    .reset_index()
    .sort_values(by='totals.hits', ascending=False)
)
positive_mean = hits_rev['totals.transactionRevenue'] > 0
hits_rev1 = hits_rev.loc[positive_mean].sort_values(by='totals.hits', ascending=False)
sns.scatterplot(x='totals.hits', y='totals.transactionRevenue', data=hits_rev1)
plt.xticks(rotation=90)
plt.ylim(-10,1.683176e+08)
plt.show();

In [None]:
# Summary of the mean-revenue-per-hit-count table: smallest and largest group mean,
# followed by the 0%, 50%, 75% and 90% quantiles of the table's columns.
hits_rev['totals.transactionRevenue'].min()
hits_rev['totals.transactionRevenue'].max()
hits_rev.quantile([0.0,0.50,0.75,0.9])

In [None]:
# Store totals.pageviews as a nullable integer, with missing values defaulting to 1.
# Parse to numbers before filling so the fill value and the existing values (possibly
# strings from the JSON payload) share one type before the integer cast.
data['totals.pageviews'] = pd.to_numeric(data['totals.pageviews']).fillna(1).astype('Int64')
data['totals.pageviews'].dtypes


In [None]:
# Mean revenue for each pageview count.
# Transaction revenue also increases as the number of pageviews grows.
plt.figure(figsize=(10,5))
page_rev = (
    data.pivot_table(index='totals.pageviews', values='totals.transactionRevenue', aggfunc='mean')
    .reset_index()
    .sort_values(by='totals.pageviews', ascending=False)
)
sns.scatterplot(x='totals.pageviews', y='totals.transactionRevenue', data=page_rev)
plt.xticks(rotation=90)
plt.ylim(-10,1.683176e+08)
plt.show();

In [None]:
# The higher the sessionQualityDim, the higher the chance of a transaction and of revenue.
# Missing values become 0. The column is parsed with pd.to_numeric before filling so the
# integer cast never sees a mix of text and numbers.
data['totals.sessionQualityDim'] = (
    pd.to_numeric(data['totals.sessionQualityDim']).fillna(0).astype('Int64')
)
plt.figure(figsize=(10,5))
session_rev = pd.pivot_table(data=data, index='totals.sessionQualityDim',
                             values='totals.transactionRevenue', aggfunc='mean').reset_index()
sns.scatterplot(data=session_rev, x='totals.sessionQualityDim', y='totals.transactionRevenue')

In [None]:
# Fill missing totals.timeOnSite with 5 (noted by the author as the column's mode) and
# plot the mean revenue for each time-on-site value.
# Parse to numbers and fill with a number: filling with the string '5' and casting a
# text column straight to 'Int64' is not supported on every pandas version.
TIME_ON_SITE_FILL = 5
data['totals.timeOnSite'] = (
    pd.to_numeric(data['totals.timeOnSite']).fillna(TIME_ON_SITE_FILL).astype('Int64')
)
plt.figure(figsize=(10,5))
site_rev = pd.pivot_table(data=data, index='totals.timeOnSite',
                          values='totals.transactionRevenue', aggfunc='mean').reset_index()
sns.scatterplot(data=site_rev, x='totals.timeOnSite', y='totals.transactionRevenue')

In [None]:
# Mean totalTransactionRevenue per channel - the Referral channel generates the most.
# Missing values become 0; parse to numbers first so the integer cast does not operate
# on a column that mixes text with the numeric fill value.
data['totals.totalTransactionRevenue'] = (
    pd.to_numeric(data['totals.totalTransactionRevenue']).fillna(0).astype('Int64')
)
data.groupby('channelGrouping')['totals.totalTransactionRevenue'].mean().plot(kind='bar', color='orange')

In [None]:
# Distribution of the per-visit transaction count, then the mean revenue for each
# transaction count split by channel.
data['totals.transactions'].value_counts()
# Missing means "no transaction". Parse to numbers and fill with the number 0 rather than
# the string '0', so the nullable-integer cast works regardless of pandas version.
data['totals.transactions'] = pd.to_numeric(data['totals.transactions']).fillna(0).astype('Int64')
plt.figure(figsize=(10,5))
total_trans = pd.pivot_table(data=data, index=['totals.transactions', 'channelGrouping'],
                             values='totals.transactionRevenue', aggfunc='mean').reset_index()
sns.barplot(data=total_trans, x='totals.transactions', y='totals.transactionRevenue',
            hue='channelGrouping', dodge=False)
plt.legend(loc='upper right')
plt.show();

In [None]:
# Number of non-null 'totals.transactions' entries per channel, largest first.
# NOTE(review): aggfunc='count' counts rows, not transactions - confirm that a sum
# was not intended here.
visit_trans = (
    data.pivot_table(index='channelGrouping', values='totals.transactions', aggfunc='count')
    .reset_index()
    .sort_values(by='totals.transactions', ascending=False)
)
sns.barplot(x='channelGrouping', y='totals.transactions', data=visit_trans, color='orange')
plt.xticks(rotation=90)
plt.show();

In [None]:
# Mean transaction revenue for each transaction count.
transactions_mean = data.pivot_table(index='totals.transactions',
                                     values='totals.transactionRevenue', aggfunc='mean')
visit_trans1 = transactions_mean.reset_index()
sns.barplot(x='totals.transactions', y='totals.transactionRevenue', data=visit_trans1, color='orange')

In [None]:
# Traffic source: mean revenue per ad content (left) and per ad content and channel (right),
# keeping only entries with a positive mean. Revenue tied to ad content comes via Paid Search.
fig = plt.figure(figsize=(16,5))
fig.subplots_adjust(hspace=5)

plt.subplot(1,2,1)
add_rev = (
    data.groupby('trafficSource.adContent')['totals.transactionRevenue'].mean()
    .reset_index()
    .sort_values(by='totals.transactionRevenue', ascending=False)
)
add_rev1 = add_rev.loc[add_rev['totals.transactionRevenue'] > 0]
sns.barplot(x='trafficSource.adContent', y='totals.transactionRevenue', data=add_rev1, color='blue')
plt.xticks(rotation=90)

plt.subplot(1,2,2)
add_chan_rev = (
    data.groupby(['trafficSource.adContent', 'channelGrouping'])['totals.transactionRevenue'].mean()
    .reset_index()
    .sort_values(by='totals.transactionRevenue', ascending=False)
)
add_chan_rev1 = add_chan_rev.loc[add_chan_rev['totals.transactionRevenue'] > 0]
sns.barplot(x='trafficSource.adContent', y='totals.transactionRevenue', data=add_chan_rev1,
            hue='channelGrouping', dodge=False)
plt.xticks(rotation=90)
plt.legend(loc='upper right')
plt.show();

In [None]:
# Count of each ad network type divided by the total number of rows.
data['trafficSource.adwordsClickInfo.adNetworkType'].value_counts()/data.shape[0]
# Mean transaction revenue for each (ad network type, channel) pair.
data.groupby(['trafficSource.adwordsClickInfo.adNetworkType','channelGrouping'])['totals.transactionRevenue'].mean()


In [None]:
# Rows without an ad network type get the explicit label 'No Network'.
network_col = 'trafficSource.adwordsClickInfo.adNetworkType'
data[network_col] = data[network_col].fillna('No Network')

In [None]:
# Remove the gclId column, show the ad network type counts, and mark a missing
# AdWords page with the string '99'.
data.drop(columns=['trafficSource.adwordsClickInfo.gclId'], inplace=True)
data['trafficSource.adwordsClickInfo.adNetworkType'].value_counts()
page_col = 'trafficSource.adwordsClickInfo.page'
data[page_col] = data[page_col].fillna('99')

In [None]:
# Counts and number of missing values for the ad slot, then the mean revenue per slot.
# Ads in the Top slot have more impact on revenue.
slot_col = 'trafficSource.adwordsClickInfo.slot'
data[slot_col].value_counts()
data[slot_col].isnull().sum()
data.groupby(slot_col)['totals.transactionRevenue'].mean().plot(kind='bar')

In [None]:
# Mean transaction revenue per search keyword, highest first.
key_word = (
    data.groupby('trafficSource.keyword')['totals.transactionRevenue'].mean()
    .reset_index()
    .sort_values(by='totals.transactionRevenue', ascending=False)
)


In [None]:
# Number of missing medium values, then the mean revenue per medium coloured by channel.
plt.figure(figsize=(10,5))
data['trafficSource.medium'].isnull().sum()
med = (
    data.groupby(['trafficSource.medium', 'channelGrouping'])['totals.transactionRevenue'].mean()
    .reset_index()
    .sort_values(by='totals.transactionRevenue', ascending=False)
)
sns.barplot(x='trafficSource.medium', y='totals.transactionRevenue', data=med,
            hue='channelGrouping', dodge=False)
plt.legend(loc='upper right')

In [None]:
# Rows without a referral path get the explicit label 'No Reference Path'.
referral_col = 'trafficSource.referralPath'
data[referral_col] = data[referral_col].fillna('No Reference Path')

In [None]:
# Visit counts per traffic source, followed by the number of missing values left in each column.
data['trafficSource.source'].value_counts()
data.isnull().sum()
# Candidate for removal, currently kept:
#del data['trafficSource.adContent']