In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt

import seaborn as sns
color = sns.color_palette()

%matplotlib inline
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

from IPython.display import HTML, display
import tabulate
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()

<h3>Objective of the notebook:</h3>

In this notebook, we will:
* perform analysis on data, 
* perform missing data imputation and 
* perform simple EDA  

<h3>Objective of the competition:</h3>

In this competition, we’re challenged to analyze a Google Merchandise Store (also known as GStore, where Google swag is sold) customer dataset to predict revenue per customer.

<h3>File Descriptions</h3>

* train.csv - the training set - contains the same data as the BigQuery rstudio_train_set.

* test.csv - the test set - contains the same data as the BigQuery rstudio_test_set.

<h3>Data Fields</h3>

* fullVisitorId- A unique identifier for each user of the Google Merchandise Store.

* channelGrouping - The channel via which the user came to the Store.

* date - The date on which the user visited the Store.

* device - The specifications for the device used to access the Store.

* geoNetwork - This section contains information about the geography of the user.

* sessionId - A unique identifier for this visit to the store.

* socialEngagementType - Engagement type, either "Socially Engaged" or "Not Socially Engaged".

* totals - This section contains aggregate values across the session.

* trafficSource - This section contains information about the Traffic Source from which the session originated.

* visitId - An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId.

* visitNumber - The session number for this user. If this is the first session, then this is set to 1.

* visitStartTime - The timestamp (expressed as POSIX time).

***_Credit Note_*** - Using code from the kernel [Quick start: read csv and flatten json fields](https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields) by [Julián Peller1](https://www.kaggle.com/julian3833)

In [None]:
def load_df(csv_path='../input/train.csv', nrows=None):
    """Read a competition CSV and flatten its JSON-encoded columns.

    The columns 'device', 'geoNetwork', 'totals' and 'trafficSource' are parsed
    with ``json.loads`` while reading, expanded into one column per (nested)
    key, and renamed to ``"<column>.<key>"``. The original JSON columns are
    dropped from the result.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Forwarded to ``pd.read_csv``; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The flattened frame. Its file name and shape are also printed.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    # `pandas.io.json.json_normalize` has been deprecated since pandas 1.0 in
    # favour of the top-level `pd.json_normalize`. Prefer the top-level name and
    # only fall back to the legacy location on pandas versions that lack it.
    try:
        normalize = pd.json_normalize
    except AttributeError:
        from pandas.io.json import json_normalize as normalize

    df = pd.read_csv(csv_path,
                     converters={column: json.loads for column in JSON_COLUMNS},
                     dtype={'fullVisitorId': 'str'},  # Important!! read IDs as text, not as numbers
                     nrows=nrows)

    for column in JSON_COLUMNS:
        # Hand json_normalize a plain list of dicts (its documented input)
        # instead of a Series, so the result always gets a fresh 0..n-1 index
        # that lines up with the default index produced by read_csv above.
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df

In [None]:
%%time
# Flatten both competition files with load_df; the training path comes from
# load_df's default `csv_path` argument.
train_df = load_df()
test_df = load_df("../input/test.csv")

In [None]:
# Report the (rows, columns) shape of each flattened frame.
for label, frame in (('size of training data : ', train_df),
                     ('size of testing data  : ', test_df)):
    print(label, frame.shape)

**Train data snippet:**

In [None]:
# Preview the first rows of the flattened training frame.
train_df.head()

**Test data snippet:**

In [None]:
# Preview the first rows of the flattened test frame.
test_df.head()

<h2>Missing values assessment</h2>

In this section we will display the counts and percentages of missing values and impute them, building on the previous kernel [Simple Missing Data Analysis - GACRP](https://www.kaggle.com/nakapoor/simple-missing-data-analysis-gacrp)

<h4>The statistics below show that there are 8 columns with more than 97% missing values</h4>

The next task is to analyse each attribute with missing values and recommend how to impute them.

In [None]:
# Per-column count and percentage of nulls in the training frame, each sorted
# with the most-missing columns first, then placed side by side.
null_flags = train_df.isnull()
total = null_flags.sum().sort_values(ascending=False)
percent = (null_flags.sum() / null_flags.count() * 100).sort_values(ascending=False)
del null_flags  # the boolean mask is as large as the frame; release it right away

missing_application_train_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_application_train_data.head(20)



<h3> <span style="color:brown">Feature # 1 : trafficSource.campaignCode </span></h3>

***Recommendation:*** Remove this feature from the train DataFrame.

In [None]:
# Drop the column without failing when it is already gone: a plain `del`
# raises KeyError if this cell is executed a second time.
train_df = train_df.drop('trafficSource.campaignCode', axis=1, errors='ignore')

<h3> <span style="color:brown"> Feature # 2 : totals.transactionRevenue : </span></h3>

This is the target attribute.

**Recommendation:** 5332 data points have valid numerical values; all remaining data points can be populated with the value 0.


In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form acts on an intermediate object and
# leaves train_df unchanged when pandas Copy-on-Write is active.
train_df['totals.transactionRevenue'] = train_df['totals.transactionRevenue'].fillna(0)

<h3> <span style="color:brown">Feature # 3 : trafficSource.adwordsClickInfo.page : </span></h3>

***Recommendation:*** _In my experience I never go beyond 3 or 4 pages of search results, so this feature is a good candidate for binning_ into 3 classes: 1) missing values (-99999), 2) ads on the first page, 3) any other page. (The cell below only applies the missing-value fill.)


In [None]:
# Mark missing ad-page values with the sentinel -99999 in both frames.
# The result is assigned back explicitly; fillna(inplace=True) on a selected
# column is a chained operation that does not update the frame under pandas
# Copy-on-Write.
page_col = 'trafficSource.adwordsClickInfo.page'
for frame in (train_df, test_df):
    frame[page_col] = frame[page_col].fillna(-99999)
print(train_df[page_col].value_counts())
print(test_df[page_col].value_counts())

<h3> <span style="color:brown">Feature # 4 : trafficSource.adwordsClickInfo.adNetworkType : </span></h3>

***Recommendation:*** Bin the whole data into 2 bins: ***1)*** Google Search and ***2)*** Others

In [None]:
# Collapse the ad network type into two classes: 'Google Search' and 'Others'.
# Series.where keeps entries equal to 'Google Search' and replaces everything
# else (NaN included, since NaN never compares equal) with 'Others', so no
# separate fillna step is needed. The former chained fillna(inplace=True) does
# not update the frame under pandas Copy-on-Write.
network_col = 'trafficSource.adwordsClickInfo.adNetworkType'
for frame in (train_df, test_df):
    frame[network_col] = frame[network_col].where(frame[network_col] == 'Google Search', 'Others')
print(train_df[network_col].value_counts())
print(test_df[network_col].value_counts())

<h3> <span style="color:brown">Feature # 5 : trafficSource.adwordsClickInfo.slot : </span></h3>

***Recommendation:*** Replace all NaN's and `Google Display Network` values with a single `NA` class.

In [None]:
# Map both missing values and 'Google Display Network' to the single class
# 'NA'. The same rule is applied to train and test so the two frames end up
# with the same category set (previously only test_df had the
# 'Google Display Network' replacement). Results are assigned back explicitly
# because chained fillna(inplace=True) does not update the frame under pandas
# Copy-on-Write.
slot_col = 'trafficSource.adwordsClickInfo.slot'
for frame in (train_df, test_df):
    frame[slot_col] = frame[slot_col].fillna('NA').replace('Google Display Network', 'NA')
print(train_df[slot_col].value_counts())
print(test_df[slot_col].value_counts())

<h3> <span style="color:brown">Feature # 6 : trafficSource.adwordsClickInfo.isVideoAd : </span></h3>

***Recommendation:*** In the train and test DataFrames, replace all NaN's with True.

In [None]:
# Fill missing isVideoAd flags with True in both frames. The result is
# assigned back explicitly; fillna(inplace=True) on a selected column is a
# chained operation that does not update the frame under pandas Copy-on-Write.
video_col = 'trafficSource.adwordsClickInfo.isVideoAd'
for frame in (train_df, test_df):
    frame[video_col] = frame[video_col].fillna(True)
print(train_df[video_col].value_counts())
print(test_df[video_col].value_counts())

<h3> <span style="color:brown"> Feature # 7 : trafficSource.isTrueDirect : </span></h3>



***Recommendation:*** In the train and test DataFrames, replace all NaN's with False.



In [None]:
# Fill missing isTrueDirect flags with False in both frames. The result is
# assigned back explicitly; fillna(inplace=True) on a selected column is a
# chained operation that does not update the frame under pandas Copy-on-Write.
direct_col = 'trafficSource.isTrueDirect'
for frame in (train_df, test_df):
    frame[direct_col] = frame[direct_col].fillna(False)
print(train_df[direct_col].value_counts())
print(test_df[direct_col].value_counts())


<h3> <span style="color:brown">Feature # 8 : totals.bounces : </span></h3>

***Recommendation:*** Replace all NaN's with _0_

In [None]:
# Fill missing bounce counts with 0 in both frames. The result is assigned
# back explicitly; fillna(inplace=True) on a selected column is a chained
# operation that does not update the frame under pandas Copy-on-Write.
bounces_col = 'totals.bounces'
for frame in (train_df, test_df):
    frame[bounces_col] = frame[bounces_col].fillna(0)
print(train_df[bounces_col].value_counts())
print(test_df[bounces_col].value_counts())

<h3> <span style="color:brown">Feature # 9 : totals.newVisits : </span></h3>

***Recommendation:*** Replace all NaN's with _0_

In [None]:
# Fill missing new-visit flags with 0 in both frames. The result is assigned
# back explicitly; fillna(inplace=True) on a selected column is a chained
# operation that does not update the frame under pandas Copy-on-Write.
new_visits_col = 'totals.newVisits'
for frame in (train_df, test_df):
    frame[new_visits_col] = frame[new_visits_col].fillna(0)
print(train_df[new_visits_col].value_counts())
print(test_df[new_visits_col].value_counts())

<h3> Revisiting the missing values</h3>

In [None]:
# Rebuild the null-count / null-percentage table after imputation and show
# only the columns that still contain missing values.
still_null = train_df.isnull()
total = still_null.sum().sort_values(ascending=False)
percent = (still_null.sum() / still_null.count() * 100).sort_values(ascending=False)
del still_null  # frame-sized boolean mask, no longer needed

missing_application_train_data = pd.concat(
    [total, percent], axis=1, keys=['Total', 'Percent']
)
has_missing = missing_application_train_data['Percent'] > 0
missing_application_train_data.loc[has_missing]


In [None]:
# Discard the missing-value summary table built in the previous cell.
del missing_application_train_data

<h3> Pre processing and data check </h3>

<h5> Remove constant features </h5>

In [None]:
# Number of distinct values per column (NaN counted as a value), largest first.
feats_counts = train_df.nunique(dropna=False).sort_values(ascending=False)
values = feats_counts.values

# One bar per column, bar height = its distinct-value count.
trace1 = go.Bar(x=feats_counts.index, y=values)
data = [trace1]

# `domain` confines the x axis to the left half of the figure.
layout = go.Layout(
    title="# of unique values in each column in dataframe",
    xaxis={'title': 'Features Names', 'domain': [0, 0.5]},
    yaxis={'title': '# of unique constant values'},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Campaign code')


We found that there are **19 constant features** in the data set, as shown in the plot above and named below. We can go ahead and remove them for faster processing.

In [None]:
# Columns whose distinct-value count (computed on train_df above) is exactly 1.
constant_features = feats_counts.loc[feats_counts == 1].index.tolist()
print(constant_features)
train_df.drop(constant_features, axis=1, inplace=True)
# The list is derived from train_df only, so tolerate columns that test_df
# does not have instead of aborting the cell with a KeyError.
test_df.drop(constant_features, axis=1, inplace=True, errors='ignore')
del constant_features

In [None]:
# Recompute the per-column distinct-value counts (NaN counted as a value),
# largest first, and print them as a check after the constant-column drop.
feats_counts = train_df.nunique(dropna = False).sort_values(ascending = False)
print(feats_counts)
# Remove the temporary Series from the notebook namespace.
del feats_counts

<h3>Target variable analysis</h3>

In [None]:
# Make the target numeric before aggregating.
train_df["totals.transactionRevenue"] = train_df["totals.transactionRevenue"].astype('float')

# Total revenue per visitor, sorted ascending and re-indexed 0..n-1 so the
# positional index can serve as the x axis; log1p compresses the range.
# (An alternative that first divides the revenue by 1000000 was left disabled
# by the original author.)
per_visitor_revenue = train_df.groupby("fullVisitorId")["totals.transactionRevenue"].sum()
revnSum = np.log1p(
    per_visitor_revenue.sort_values(ascending=True).reset_index(drop=True)
)

plt.figure(figsize=(8, 6))
plt.scatter(revnSum.index, revnSum.values)
plt.xlabel('index', fontsize=12)
plt.ylabel('TransactionRevenue for visitors', fontsize=12)
plt.show()

We know that the total number of visitors is 723803, and from the above plot we can see that there are more than 70000 visitors with transactionRevenue = 0. Let's plot the percentage of visitors having no transactionRevenue vs. transactionRevenue > 0.

In [None]:
# Binary flag per row: 1 when the row carries positive revenue, else 0.
has_revenue = train_df['totals.transactionRevenue'] > 0.0
train_df['if_TransRev'] = np.where(has_revenue, 1, 0)

# Share of each flag value, in percent.
# NOTE(review): the shares are computed over rows of train_df, not over unique
# fullVisitorId values - confirm that the "visitors" wording in the labels is intended.
feats_counts = train_df['if_TransRev'].value_counts()
values = (feats_counts / feats_counts.sum()) * 100

trace1 = go.Bar(x=feats_counts.index, y=values)
data = [trace1]

# `domain` confines the x axis to the left half of the figure.
layout = go.Layout(
    title="% of visitors with transaction vs Non transaction",
    xaxis={'title': 'TransactionRevenue', 'domain': [0, 0.5]},
    yaxis={'title': '% of visitors'},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Campaign code')


***This shows that this is a highly imbalanced problem:*** visitors ***not contributing towards the revenue*** are ***98.73%***, whereas the visitors ***contributing towards the revenue*** are only ***1.27%***.

In [None]:
#train_df.to_csv("filterTrain_df.csv", index=False)

<h3>Analysis of transactions that contribute towards revenue</h3>

a) Contribution of Mobile ads where customer visit is converted into revenue

b) When customer made a new visit and the visit is converted into revenue

c) Contribution of video Ads where customer visit is converted into revenue

d) Contribution of direct visits where customer visit is converted into revenue

In [None]:
# Rows with positive revenue. `.copy()` makes this an independent frame, so
# the column assignments below do not write into a slice of train_df (the
# original form triggers pandas' SettingWithCopy warning, which is only hidden
# by the chained_assignment option set in the first cell).
tempTrain_df = train_df[train_df['totals.transactionRevenue'] > 0.0].copy()

# Recode each flag column to 1 where the value compares equal to True, else 0.
# NOTE(review): if any of these columns holds strings such as "1" (possible
# for fields parsed from JSON), `== True` is False for them and the column
# becomes all zeros - confirm the dtypes before trusting the plots.
flag_columns = [
    'device.isMobile',
    'totals.newVisits',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'trafficSource.isTrueDirect',
    'totals.bounces',
]
for flag_column in flag_columns:
    tempTrain_df[flag_column] = np.where(tempTrain_df[flag_column] == True, 1, 0)
tempTrain_df.shape

In [None]:
def bar_chart(lables, values):
    """Return a plotly Bar trace with `lables` on x and `values` on y.

    The trace is excluded from the legend and drawn in a fixed dark colour.
    """
    marker_style = {'color': 'rgba(28,32,56,0.84)'}
    return go.Bar(x=lables, y=values, showlegend=False, marker=marker_style)

# Flag columns in the order their subplots appear (row by row, left to right);
# this order matches the subplot titles below.
binary_columns = [
    'device.isMobile',
    'trafficSource.adwordsClickInfo.isVideoAd',
    'totals.newVisits',
    'trafficSource.isTrueDirect',
    'totals.bounces',
]

# One bar trace per flag: percentage share of each value among revenue rows.
traces = []
for column in binary_columns:
    feats_counts = tempTrain_df[column].value_counts()
    traces.append(bar_chart(lables=feats_counts.index, values=(feats_counts / feats_counts.sum()) * 100))
trace1, trace2, trace3, trace4, trace5 = traces

fig = tools.make_subplots(
    rows=3, cols=2, vertical_spacing=0.06,
    subplot_titles=["Mobile Ads", "Video Ads", "New Visit", "Direct Visit", "if user bounced"],
)

# Fill the 3x2 grid row by row; the last cell (3, 2) stays empty.
grid_positions = [(1, 1), (1, 2), (2, 1), (2, 2), (3, 1)]
for trace, (grid_row, grid_col) in zip(traces, grid_positions):
    fig.append_trace(trace, grid_row, grid_col)

fig.layout.update(
    height=1200,
    width=800,
    paper_bgcolor='rgb(233,233,233)',
    title="Impact of 5 binary features on transactions with revenue",
)

py.iplot(fig, filename='plots_2')


<h3>Snippets from the above analysis : </h3>

* Only 9% of the transactions contributing to revenue are Mobile Ads.
* ~ 96% of the transactions contributing to revenue are Video ads. Note that missing `isVideoAd` values were imputed as True earlier in this notebook, so this share includes every session that had no value for this field.
* Only 38% of the transactions contributing to revenue are by new visitors.
* ~ 61% of the transactions contributing to revenue are by direct visits to the Google Store.
* Unsurprisingly, if a user bounced (entered the site and then left), the visit does not contribute any revenue.

