# Google Analytics Customer Revenue Prediction

In [None]:
# Import libraries to use

# Common imports
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split
#from preprocess import read_data, json_read

from datetime import datetime # To access datetime

import warnings                # To ignore the warning
warnings.filterwarnings("ignore")

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

## Auxiliary Functions

In [None]:
import pandas as pd
import numpy as np
import json
import os # it's a operational system library, to set some informations

def read_data(file_path, file_name, data_format):
    """
    Load a CSV file into a dataframe and report its location and shape.

    Parameters:
    -----------
    file_path: str 
               directory where the datafile is (a trailing path separator
               is optional)
    file_name: str
               file name of the datafile
    data_format: dict
                 column name -> dtype mapping, forwarded to ``pd.read_csv``
                 as its ``dtype`` argument
                 
    Return
    ------
    df: dataframe
        the dataframe loaded from the CSV file
    """
    # os.path.join only inserts a separator when file_path lacks one, so both
    # "../input/" and "../input" resolve to the same file (plain string
    # concatenation silently produced "../inputtrain.csv" for the latter).
    full_path = os.path.join(file_path, file_name)

    # Load the data
    df = pd.read_csv(full_path, dtype=data_format)

    # Report what was imported
    print("Loaded file at {}, and dataframe with shape {}".format(full_path, df.shape))

    return df


def json_read(df, field_name, extract_field, new_field_name):
    """
    Read semi-structured JSON data
    
    Each cell of ``df[field_name]`` is parsed with ``json.loads`` (and is
    expected to decode to a JSON object); the value stored under
    ``extract_field`` is written to ``df[new_field_name]``. Rows whose JSON
    object lacks ``extract_field`` receive NaN.
    
    Parameters:
    ----------
    df: dataframe
        the dataframe needed to process (modified in place)
    field_name: str
                column to read json data
    extract_field: str 
                   info to extract from json data format
    new_field_name: str
                    add a column for data extract from json data format
                    
    Return
    ------
    df: dataframe
        the same df object, with the extracted data added as a new column called new_field_name
    """
    # Pull out only the requested key instead of expanding every JSON key into
    # a temporary dataframe; a missing key becomes NaN rather than a KeyError.
    extracted = [json.loads(raw).get(extract_field, np.nan) for raw in df[field_name]]

    # Attach df's own index so the assignment is row-for-row even when df does
    # not carry a default 0..n-1 index (e.g. a filtered or re-indexed frame).
    df[new_field_name] = pd.Series(extracted, index=df.index)
    return df

# 1. Load Data

Load training set:

In [None]:
# Only the first `chunksize` rows are explored, so read exactly that many with
# `nrows` instead of opening a chunk iterator and abandoning it after one step
# (which leaves the reader open until it is garbage-collected).
chunksize = 600000
# fullVisitorId is an identifier, not a number: read it as text so the leading
# zeros required by the submission format are preserved.
df_train = pd.read_csv('../input/train_v2.csv', nrows=chunksize,
                       dtype={'fullVisitorId': str})

df_train.head()

In [None]:
# Check data type in each column
df_train.dtypes

In [None]:
df_train.describe()

In [None]:
# Check memory usage in MB
df_train.memory_usage(deep=True)* 1e-6

In [None]:
# Estimate total memory usage
usage = df_train.memory_usage(deep=True).sum() * 1e-6
print('Memory usage is {} Gb'.format(usage/1000))

## Data Fields

- **fullVisitorId**- A unique identifier for each user of the Google Merchandise Store.
- **channelGrouping** - The channel via which the user came to the Store.
- **date** - The date on which the user visited the Store.
- **device** - The specifications for the device used to access the Store.
- **geoNetwork** - This section contains information about the geography of the user.
- **sessionId** - A unique identifier for this visit to the store.
- **socialEngagementType** - Engagement type, either "Socially Engaged" or "Not Socially Engaged".
- **totals** - This section contains aggregate values across the session.
- **trafficSource** - This section contains information about the Traffic Source from which the session originated.
- **visitId** - An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId.
- **visitNumber** - The session number for this user. If this is the first session, then this is set to 1.
- **visitStartTime** - The timestamp (expressed as POSIX time).
- **hits** - This row and nested fields are populated for any and all types of hits. Provides a record of all page visits. (new in version 2 data set)
- **customDimensions** - This section contains any user-level or session-level custom dimensions that are set for a session. This is a repeated field and has an entry for each dimension that is set. (new in version 2 data set)


In [None]:
# Preview the first `chunksize` rows of the test set; `nrows` reads just those
# rows without leaving a half-consumed chunk iterator open.
chunksize = 100
# Keep fullVisitorId as text so leading zeros are not dropped.
df_test = pd.read_csv('../input/test_v2.csv', nrows=chunksize,
                      dtype={'fullVisitorId': str})

df_test.head()

In [None]:
del df_test


## What am I predicting?

The following is the sample submission.

In [None]:
df_submit = pd.read_csv('../input/sample_submission_v2.csv')
df_submit.head()

In [None]:
del df_submit

For each `fullVisitorId` in the test set, we must predict the **natural log** of their total revenue in `PredictedLogRevenue`. The submission file should contain a header and have the following format:

```
fullVisitorId,PredictedLogRevenue
0000000259678714014,0
0000049363351866189,0
0000053049821714864,0
etc.
```

We are predicting the natural log of the **sum of all transactions per user**. For every user in the test set, the target is:

$$ y_{user}=\sum_{i=1}^{n}transaction_{user_i}$$

$$ target_{user}=\ln(y_{user}+1)$$

where $n$ is the number of times a specific customer visited the GStore. 

**Why do we choose the above metric to gauge the performance?**

Since each transaction is a large value, if we use those values directly, then it will not be sensitive to the minor changes! Thus, we  apply natural log to $y_{user}$. The reason why we consider $y_{user}+1$ is because some users may not purchase anything and $y_{user}=0$. Then $\ln (y_{user})$ will not be well-defined! In contrast, $\ln (y_{user}+1) = 0$ when $y_{user} = 0$. 

Note that Kaggle has updated dataset, `train_v2.csv` and `test_v2.csv`, and the training dataset does NOT contain data for December 1st 2018 to January 31st 2019. You must identify the unique fullVisitorIds in the provided test_v2.csv and make predictions for them for those unseen months.

In [None]:
df_train.loc[0,'geoNetwork']

In [None]:
df_train.loc[10,'device']

Note that `deviceCategory` and `isMobile` are related actually. If `deviceCategory` is mobile or tablet, then `isMobile` is true. If `deviceCategory` is desktop, then `isMobile` is false. We will examine these two features later to see whether it's necessary to consider `deviceCategory` or not.

In [None]:
df_train.loc[101,'device']

In [None]:
df_train.loc[80,'totals']

In [None]:
df_train.loc[102,'trafficSource']

In **device, geoNetwork, totals, trafficSource** fields, they are semi-structured JSON data.

- **device**: browser, browserVersion, browserSize, operatingSystem, operatingSystemVersion, isMobile, mobileDeviceBranding, mobileDeviceModel, mobileInputSelector, mobileDeviceInfo, mobileDeviceMarketingName, flashVersion, language, screenColors, screenResolution, deviceCategory
- **totals**: visits, hits, pageviews, newVisits, bounces
- **geoNetwork**: continent, subContinent, country, region, metro, city, cityId, networkDomain, latitude, longitude, networkLocation
- **trafficSource**: campaign, source, medium, keyword, adwordsClickInfo, referralPath


In [None]:
# Extract revenue from transactionRevenue from totals field
field_name = 'totals' 
extract_field = 'transactionRevenue'
new_field_name = 'revenue'

json_read(df_train, field_name, extract_field, new_field_name).head()

In [None]:
# Estimate total memory usage
usage = df_train.memory_usage(deep=True).sum() * 1e-6
print('Memory usage is {} Gb'.format(usage/1000))

In [None]:
# Check missing values in revenue field
df_train['revenue'].isnull().sum()

There are 593602 missing values out of 600000. This is highly **imbalanced dataset**. We have to keep this in mind when we build model to do further analysis. We denote these missing values by 0 since they may be events that customers visit gstore without purchasing any products.

In [None]:
# Fill in missing data with zeros
df_train['revenue'] = df_train['revenue'].fillna(0)
df_train.head(7)

In [None]:
df_train['revenue'] = df_train['revenue'].astype('int64')
df_train.dtypes

In order to determine what factors affect whether customers purchase things or not when they visit gstore, we set 0 when revenue is 0 (not purchasing) and 1 when revenue is not 0 (purchasing) as shown in column `Buy`.

In [None]:
# Buy = 1 for visits with non-zero revenue (a purchase), 0 otherwise
df_train['Buy'] = (df_train['revenue'] != 0).astype('int64')
df_train.head()

In [None]:
# Estimate total memory usage
usage = df_train.memory_usage(deep=True).sum() * 1e-6
print('Memory usage is {} Gb'.format(usage/1000))

# 2. Data Explore, Clean Data and Feature Engineering

## 2.1 channelGrouping feature

In [None]:
df_train.channelGrouping.value_counts().plot(kind="bar",title="channelGrouping distribution",figsize=(8,8),rot=25,colormap='Paired')

From above figure, we can see that most of customers used `Organic Search` channel to come to gstore, and `Social` channel is the second one. But does it mean there are more customers purchasing goods in gstore from `Organic Search` channel? Let's find out below.

### channelGrouping vs. revenue

In [None]:
df_channel = df_train[['channelGrouping','revenue', 'Buy']]
# Per-channel summary statistics of revenue. A list of aggregation names is
# used because the dict-renaming form of agg() on a single column was removed
# from pandas (it raises SpecificationError on pandas >= 1.0); the resulting
# columns are still named mean / median / std / max / min.
df_channel.groupby('channelGrouping')['revenue'].agg(['mean', 'median', 'std', 'max', 'min'])

According to mean and median of revenue for each channelGrouping, it is clear that in each channelGrouping, there are many customers visiting gstore without purchasing! **The distribution of data in each channel is highly right-skewed**.

In [None]:
# Visualize the data by drawing boxplot grouped by a categorical variable:
sns.boxplot(x='revenue', y='channelGrouping', data=df_channel)

### Are those customers used specific channel most possible to buy goods in gstore?

In [None]:
# Number of visits for every (channel, Buy) pair, as a one-column frame.
# count().to_frame('Count') replaces the dict-renaming agg({'Count': 'count'}),
# which newer pandas versions reject.
Channel_Buy = df_channel.groupby(['channelGrouping', 'Buy'])['revenue'].count().to_frame('Count')
# Share (in %) of each Buy outcome within its channel: transform('sum')
# broadcasts the per-channel total back onto every row of that channel.
Channel_Buy['Relative Frequency'] = (
    100 * Channel_Buy['Count'] / Channel_Buy.groupby(level=0)['Count'].transform('sum')
)
Channel_Buy

From the above results, we can conclude that **customers from the Referral channel are the most likely to buy goods in gstore per visit**. 

### Did customers who purchased goods in gstore from specific channel buy more?

In order to answer this question, we show boxplot for categorical variable (channelGrouping) vs. numerical variable (revenue).

In [None]:
# boxplot for categorical variable (channelGrouping) vs. numerical variable (revenue)
# .copy() makes the purchasing-only subset an independent frame, so columns
# can be added to it later without chained-assignment ambiguity.
df_channel_Buy = df_channel[df_channel['Buy'] == 1].copy()
sns.boxplot(x='revenue', y='channelGrouping', data=df_channel_Buy)

From above figure, we cannot see the relation between channelGrouping and revenue due to large range of revenue in each channel. Thus, we do log transformation for revenue in the following.

In [None]:
# Transform revenue by log function
df_channel_Buy['log(revenue)'] = df_channel_Buy['revenue'].apply(np.log)
sns.boxplot(x='log(revenue)', y='channelGrouping', data=df_channel_Buy)

From above figure, we can see that **`revenue` has very weak relationship with `channelGrouping`** since median in each channel is close to each other. Although Affiliates has much different median, it is not reliable actually since only 2 visits with purchasing. 

In [None]:
# Delete useless dataframe
del df_channel_Buy, Channel_Buy, df_channel

## 2.2 date feature

There are two variables related to time and can be used in time dependent analyzes, or Time Series. Now we study the relation between date feature and counts of customers purchasing.


**1) Hypothesis Generation**

a. There will be more customers purchasing goods during the holiday season, say Nov. and Dec.
- Explanation - Purchasing rate will be higher during the holiday season.

b. There will be more customers purchasing goods during the weekend.
- Explanation - People need to work on weekdays so they may not have time to visit GStore.

c. There will be higher transaction per visit during the holiday season, say Nov. and Dec.
- Explanation - People will buy more goods when Thanksgiving, Christmas, and New Year coming.

**2) Feature Extraction**

We will extract further info from the `date`. We have seen earlier that the data type of `date` is int64. So first of all we have to change the data type to datetime format otherwise we can not extract features from it.

In [None]:
df_train['date'] = pd.to_datetime(df_train['date'],format="%Y%m%d") 

In [None]:
df_train.head()['date']

 Let's extract the year, month, day from the `date` to validate our hypothesis.

In [None]:
# Work on an independent copy: new columns are added below, and assigning
# into a slice of df_train is ambiguous (SettingWithCopy).
df_time = df_train[['date', 'revenue', 'Buy']].copy()
# `date` must already be datetime64 here (converted with pd.to_datetime above)
df_time['year'] = df_time['date'].dt.year
df_time['month'] = df_time['date'].dt.month
df_time['day'] = df_time['date'].dt.day

In [None]:
df_time.head()

We hypothesized that more customers purchase goods during the weekend than on weekdays. So, let's make a weekend variable to visualize the impact of the weekend on purchasing rate.

 - We will first extract the day of week from `date` and then based on the values we will assign whether the day is a weekend or not.

 - Values of 5 and 6 represents that the days are weekend.



In [None]:
df_time['DayOfWeek']=df_time['date'].dt.dayofweek
temp = df_time['date']

Let’s assign 1 if the day of week is a weekend and 0 if the day of week is not a weekend.

In [None]:
def applyer(row):
    """Return 1 if `row.dayofweek` is 5 or 6 (Saturday/Sunday), otherwise 0."""
    return 1 if row.dayofweek in (5, 6) else 0

temp2 = df_time['date'].apply(applyer)
df_time['weekend']=temp2

Let's look at the time series.

In [None]:
df_time.index = df_time['date'] # indexing the Datetime to get the time period on the x-axis.
df_time.head()

In [None]:
df_time_buy = df_time[df_time['Buy'] == 1]
df_time_buy.head()

**3) Exploratory Analysis**

Our first hypothesis was purchasing rate will be higher in holiday season.

In [None]:
# using an html hex string for color
color = '#0099ff'
df_time.groupby(['year','month']).size().plot.bar(rot = 55, color=color)
plt.ylabel('Visit Count')

As we can see in the above figure, **there are more customers visiting GStore in Oct. 2016**. Was there a specific event that caused more visits in Oct. 2016? And does it also imply a higher purchasing rate in that month? Let's examine that in the following.

In [None]:
df_time.groupby(['year','month'])['Buy'].mean().plot.bar(color=color)
plt.ylabel('Purchasing Rate')

As we can see in above figure, **there's no clear pattern about relation between time and purchasing rate**.

Next, we examine our second hypothesis, there will be more customers purchasing goods during the weekend.

In [None]:
df_time.groupby('weekend')['Buy'].mean().plot.bar(color=color)
plt.ylabel('Purchasing Rate')

In above figure, 0 denotes the weekday and 1 denotes the weekend. It is clear that our second hypothesis is not right. From the figure, we can see that **there are higher purchasing rate in weekdays**. Probably, that is because when people work or people go to office on weekdays, they have more chances to access the electric devices to visit the Google Merchandise Store.

In the end, we examine whether there will be higher transaction per visit during the holiday season, say Nov. and Dec. in the following:

In [None]:
# df_time_buy carries 'date' both as its index name and as a column, which
# makes groupby('date') ambiguous (a ValueError on current pandas). Dropping
# the index first leaves only the column to group on.
daily_transaction_per_visit_df = (
    df_time_buy[['date', 'revenue']]
    .reset_index(drop=True)
    .groupby('date')
    .mean()
)
fig, axes = plt.subplots(figsize=(20,10))
axes.set_title("Daily Transaction per Visit")
axes.set_ylabel("Transaction per Visit")
axes.set_xlabel("date")
axes.plot(daily_transaction_per_visit_df["revenue"])

From above result, we can see that the transaction per visit do not have strong peak at Nov. and Dec. so our last hypothesis is wrong. 

In [None]:
# Clean dataframe
del daily_transaction_per_visit_df, df_time_buy, df_time

## 2.3 device feature

The field `device` is stored in json format. It is necessary to extract its fields and analyze them. We use the `json_read` function to deserialize the json values.

In [None]:
# Show the raw JSON of one `device` entry. The column is selected by label,
# not by position, so the lookup does not depend on the CSV's column order.
df_train.loc[10, 'device']

There are many keys in device attribute with values "not available in demo dataset" so we should ignore these features. 

### What device do customers frequently use to visit the store?

In order to answer this question, we consider `deviceCategory` in the following. Note that `isMobile` and `deviceCategory` are related. In what follows, we also examine whether using `deviceCategory` can gain more insight, or whether one can use `isMobile` instead.

In [None]:
# Independent copy: json_read adds columns to this frame in place
df_device = df_train[['device', 'revenue', 'Buy']].copy()

In [None]:
# Extract deviceCategory from device field
field_name = 'device' 
extract_field = 'deviceCategory'
new_field_name = 'DeviceCategory'

json_read(df_device, field_name, extract_field, new_field_name).head()

In [None]:
# Check whether there's missing values in DeviceCategory column
pd.isna(df_device['DeviceCategory']).sum()

In [None]:
# using an html hex string for color
color = '#0099ff'
df_device.groupby(['DeviceCategory']).size().plot.bar(rot = 0, color=color)
plt.ylabel('Visit Count')

As we can see in the above bar plot, **most of customers visit the store via desktop device**. Also, visit counts for tablet is very small compared to mobile and desktop, and tablet and mobile can be regarded as mobile devices. Hence, we can group mobile and tablet features together, and we can classify the device is mobile device or not. This is nothing but `isMobile` in `device` attribute. Let's extract `isMobile` from `device` in the following:

In [None]:
# Extract isMobile from device field into a new IsMobile column
field_name = 'device' 
extract_field = 'isMobile'
new_field_name = 'IsMobile'

json_read(df_device, field_name, extract_field, new_field_name).head()

From above table, we can see that `IsMobile` is False if  `DeviceCategory` is "desktop". In contrast, `IsMobile` is True if  `DeviceCategory` is "mobile" or "tablet".

In [None]:
# Estimate total memory usage
usage = df_device.memory_usage(deep=True).sum() * 1e-6
print('Memory usage is {} Gb'.format(usage/1000))

In [None]:
# Check data type in IsMobile
df_device['IsMobile'].dtypes

### Is IsMobile related to purchasing rate?

In [None]:
df_device.groupby('IsMobile')['Buy'].mean().plot.bar(color=color, rot = 0)
plt.ylabel('Purchasing Rate')

From above result, we can see that customers using desktop are more willing to purchase in store. 

### Is IsMobile related to revenue per visit?

In [None]:
# Purchasing visits only; .copy() so a log(revenue) column can be added
# to this subset without chained-assignment ambiguity
df_device_Buy = df_device[df_device['Buy']==1].copy()
df_device_Buy.head()

In [None]:
# Transform revenue by log function
df_device_Buy['log(revenue)'] = df_device_Buy['revenue'].apply(np.log)

In [None]:
sns.boxplot(x='IsMobile', y='log(revenue)', data=df_device_Buy)

As we can see in the above box plot, the medians for mobile device and non-mobile device are very close to each other. This implies that revenue per visit is not correlated to `IsMobile` feature. 

In [None]:
del df_device_Buy, df_device

## 2.4 geoNetwork

The field `geoNetwork` is stored in json format. It is necessary to extract its fields and analyze them. We use the `json_read` function to deserialize the json values.

In [None]:
df_train.loc[21,'geoNetwork']

In [None]:
# Independent copy: json_read adds columns to this frame in place
df_geo = df_train[['geoNetwork', 'revenue', 'Buy']].copy()
df_geo.head()

In [None]:
# Estimate total memory usage
usage = df_geo.memory_usage(deep=True).sum() * 1e-6
print('Memory usage is {} Gb'.format(usage/1000))

### Which continent has the most visitors to GStore?

In [None]:
# Extract continent from geoNetwork field
field_name = 'geoNetwork' 
extract_field = 'continent'
new_field_name = 'continent'

json_read(df_geo, field_name, extract_field, new_field_name).head()

In [None]:
df_geo.groupby('continent').size().plot.bar(color=color, rot = 0)
plt.ylabel('visit counts')

From above bar plot, we can see that most of customers are from Americas. Then the second most is from Asia. And visit counts from Europe is close to Asia. Hence, we can conclude that most of customers are from these three continents, Americas, Asia, and Europe.

### Which continent has the highest purchasing rate?

In [None]:
df_geo.groupby('continent')['Buy'].mean().plot.bar(color=color, rot = 0)
plt.ylabel('Purchasing Rate')

From the above bar plot, we can see that the highest purchasing rate is in Americas. **This implies that customers from Americas have a higher probability of buying things in GStore.** 

In [None]:
# Extract country from geoNetwork field
field_name = 'geoNetwork' 
extract_field = 'country'
new_field_name = 'country'

json_read(df_geo, field_name, extract_field, new_field_name).head()

### Which country in Americas has the highest purchasing rate?

In [None]:
df_americas = df_geo[df_geo['continent'] == 'Americas'] 
df_americas.groupby('country')['Buy'].mean().sort_values(ascending=False).head(10)

Visitors from St. Lucia in Americas are the most probable to purchase goods in Gstore. But is it reasonable? Let's take a look at how many visits are from St. Lucia below.

In [None]:
# How many visits are from St. Lucia?
len(df_americas[df_americas['country']=='St. Lucia'])

It is clear that we do not have enough data from this country, so we cannot conclude that visitors from St. Lucia are the most probable to purchase goods in GStore. Let's examine the visit counts of the countries with a high purchasing rate (St. Lucia, Guadeloupe, Curaçao, and United States) in the following:

In [None]:
df_americas.groupby('country').size().sort_values(ascending=False).head(25)

As we can see in above results, there are less than 55 visits in St. Lucia, Guadeloupe, Curaçao so we cannot conclude whether customers from these countries are more willing to buy in GStore. In contrast, **United States has the highest visits and higher purchasing rate.** This is an important feature we can consider!

### Is continent related to revenue?

In [None]:
# Purchasing visits only; .copy() so a log(revenue) column can be added
# to this subset without chained-assignment ambiguity
df_geo_Buy = df_geo[df_geo['Buy']==1].copy()
df_geo_Buy.head()

In [None]:
# Transform revenue by log function
df_geo_Buy['log(revenue)'] = df_geo_Buy['revenue'].apply(np.log)

In [None]:
sns.boxplot(x='continent', y='log(revenue)', data=df_geo_Buy)

As we can see in above box plot, the median of `log(revenue)` for each continent is close, except for Africa. But the distribution of `log(revenue)` for Africa is highly right-skewed. Let's take a look at how many visits with purchasing in Africa in the following:

In [None]:
len(df_geo_Buy[df_geo_Buy['continent']== 'Africa'])

There are very few samples from Africa, so the results cannot really show that customers from Africa contribute higher revenues compared with customers from other continents. Therefore, when we predict revenues, we will not include the `continent` feature.

In [None]:
del df_americas, df_geo, df_geo_Buy

## 2.5 social Engagement Type

In this section, we study the relation between purchasing rate and `socialEngagementType`. Then we study whether `revenue` is related to `socialEngagementType`.

### Which social Engagement Type has more visits?

In [None]:
# using an html hex string for color
color = '#0099ff'
df_train.groupby(['socialEngagementType']).size().plot.bar(rot = 0, color=color)
plt.ylabel('Visits')

In [None]:
df_train['socialEngagementType'].count()

In this feature, samples in training set are all in the same category so it is not useful to include this feature into our model.

## 2.6 Traffic Source

In [None]:
df_train.loc[101,'trafficSource']

In [None]:
df_train.loc[102,'trafficSource']

In [None]:
df_train.loc[103,'trafficSource']

In above three instances, except for those not available info, the most useful feature is `medium` but it's already extracted in `channelGrouping`.

## 2.7 totals

Except for `transactionRevenue`, there are other useful features stored in json format in `totals` field like `visits`, `hits`, and `pageviews`.

In [None]:
df_train.loc[80,'totals']

In [None]:
df_train.loc[1001,'totals']

In [None]:
df_train.loc[302,'totals']

In [None]:
# Independent copy: columns are added to and dropped (inplace) from this
# frame below, which must not be done on a slice of df_train
df_totals = df_train[['totals', 'revenue', 'Buy']].copy()
df_totals.head()

In [None]:
# View memory usage, including the memory held by object (string) columns
df_totals.info(memory_usage='deep')

### visits feature in totals

In [None]:
# Extract visits from totals field
field_name = 'totals' 
extract_field = 'visits'
new_field_name = 'visits'

json_read(df_totals, field_name, extract_field, new_field_name).head()

In [None]:
df_totals.groupby('visits').size()

From above result, the values in visits attribute are all 1 so this feature cannot offer any insight we need. But **it implies that in this data set, each instance is related to distinct customer visiting GStore**. 

In [None]:
# Drop unuseful feature to release memory
df_totals.drop(columns='visits', inplace=True)
df_totals.head()

### hits and pageviews features in totals

In [None]:
# Extract hits from totals field
field_name = 'totals' 
extract_field = 'hits'
new_field_name = 'hits'

json_read(df_totals, field_name, extract_field, new_field_name).head()

In [None]:
df_totals.groupby('hits').size().head()

In [None]:
# Extract pageviews from totals field
field_name = 'totals' 
extract_field = 'pageviews'
new_field_name = 'pageviews'

json_read(df_totals, field_name, extract_field, new_field_name).head()

In [None]:
df_totals.groupby('pageviews').size().head()

In [None]:
# Check data types for values in each column
df_totals.dtypes

In [None]:
# Convert desired columns to numeric type
df_totals[['hits', 'pageviews']] = df_totals[['hits', 'pageviews']].apply(pd.to_numeric) 
df_totals.dtypes

### Take a look at correlations between revenue, Buy, hits, and pageviews

In [None]:
temp_df = df_totals[['hits','pageviews', 'Buy','revenue']]

# Calculate correlations
corr = temp_df.corr()

# Heatmap
sns.heatmap(corr)

From above heatmap, we can see that `hits` and `pageviews` have very weak relation to `revenue`. However, `hits` and `pageviews` have very highly positive correlations with `Buy`. 