In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import json
import os
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
%matplotlib inline

from plotly import tools
import plotly.offline as py
import plotly.graph_objs as go

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

pd.options.display.max_columns = None

In [None]:
import torch 
from torch import nn
import seaborn as sns

# Google Analytics Customer Revenue Prediction


## 1. Load Data and Format data

In [None]:
# Directory that holds the competition CSV files.
data_path = "../input/ga-customer-revenue-prediction/"
# Raw frames: the JSON columns are still unparsed strings at this point.
train_df, test_df = (
    pd.read_csv(data_path + csv_name) for csv_name in ("train.csv", "test.csv")
)

Extract the JSON data from the CSV

In [None]:
def load_df(csv_path='kaggle/train.csv', nrows=None):
    """Read a sessions CSV and flatten its JSON-encoded columns.

    The columns ``device``, ``geoNetwork``, ``totals`` and ``trafficSource``
    are parsed with ``json.loads`` while reading and then expanded so that
    every JSON key becomes its own column named ``"<column>.<key>"``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    nrows : int or None
        Number of rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The input frame with the four JSON columns replaced by their
        flattened sub-columns. ``fullVisitorId`` is read as a string.
    """
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    df = pd.read_csv(
        csv_path,
        converters={column: json.loads for column in JSON_COLUMNS},
        dtype={'fullVisitorId': 'str'},
        nrows=nrows,
    )
    for column in JSON_COLUMNS:
        # pd.json_normalize is the public entry point; the
        # pandas.io.json.json_normalize alias has been deprecated since pandas 1.0.
        column_as_df = pd.json_normalize(df[column].tolist())
        # Pin the flattened rows to df's index so the index merge below always
        # lines rows up one-to-one.
        column_as_df.index = df.index
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    return df
# Load the training CSV with its JSON columns flattened into "<column>.<key>" columns.
train = load_df(csv_path = data_path+ "train.csv")

In [None]:
# Preview the first rows of the flattened training frame.
train.head()

In [None]:
# Column dtypes and non-null counts of the flattened training frame.
train.info()

# 2. Exploratory Data Analysis (EDA)

Since the goal is to predict the total revenue per user, we first explore the total revenue grouped by user.

In [None]:
# Cast revenue to float, then total it per visitor.
revenue_col = "totals.transactionRevenue"
train[revenue_col] = train[revenue_col].astype('float')
gdf = train.groupby("fullVisitorId")[revenue_col].sum().reset_index()

# Sorted log1p of the per-visitor revenue, plotted against its rank.
sorted_log_revenue = np.sort(np.log1p(gdf[revenue_col].values))
plt.figure(figsize=(8,6))
plt.scatter(range(gdf.shape[0]), sorted_log_revenue)
plt.xlabel('index', fontsize=12)
plt.ylabel('TransactionRevenue', fontsize=12)
plt.show()

In [None]:
# gdf has one row per visitor (revenue summed over that visitor's sessions),
# so these counts are visitors, not transactions.
not_none_transaction = int((gdf["totals.transactionRevenue"] > 0).sum())
print("There are %d visitors and %d of them generated revenue, %.3f%% of visitors have revenue >0" % (
    len(gdf), not_none_transaction, 100 * not_none_transaction / len(gdf)))

# pd.notnull counts the session records whose revenue is present (i.e. NOT NaN).
not_none_records = pd.notnull(train["totals.transactionRevenue"]).sum()
total_records = len(train["totals.transactionRevenue"])
print("There are %d out of %d records with non-NaN revenue. %.3f%% of records are not NaN" % (
    not_none_records, total_records, 100 * not_none_records / total_records))

In [None]:
# Convert hits to numbers before aggregating: at this point the column has not
# been cast yet (the numeric cast happens later, in preprocessing), and summing
# string values would concatenate them instead of adding them.
# NOTE(review): assumes every totals.hits value parses as a number — confirm.
hits_per_session = pd.to_numeric(train["totals.hits"])
visit_cnt = hits_per_session.groupby(train["fullVisitorId"]).sum().reset_index()

# Fixed seed so the sampled plot is reproducible; the sample size is capped so
# that a small frame (e.g. loaded with nrows) does not raise.
df = visit_cnt.sample(min(1000, len(visit_cnt)), random_state=0)
plt.figure(figsize=(8,6))
plt.scatter(range(df.shape[0]), np.sort(df["totals.hits"]))
plt.xlabel('index', fontsize=12)
plt.ylabel('hits', fontsize=12)
plt.show()

In [None]:
# Histogram (50 bins) of the sampled per-visitor hit totals.
fig, ax = plt.subplots(1, 1, figsize=(20, 8))
sorted_hits = np.sort(df["totals.hits"])
ax.hist(sorted_hits, bins=50, orientation="vertical")

In [None]:
import datetime

def scatter_plot(cnt_srs, color):
    """Build a plotly Scatter trace from a series, in reversed order.

    cnt_srs: object exposing ``.index`` and ``.values`` (x and y of the trace).
    color:   marker colour passed to plotly.
    Returns the ``go.Scatter`` trace with its legend entry hidden.
    """
    marker_style = dict(color=color)
    return go.Scatter(
        x=cnt_srs.index[::-1],
        y=cnt_srs.values[::-1],
        showlegend=False,
        marker=marker_style,
    )

def _yyyymmdd_to_date(raw):
    """Turn a YYYYMMDD value (int or str) into a datetime.date."""
    digits = str(raw)
    return datetime.date(int(digits[:4]), int(digits[4:6]), int(digits[6:]))


train['date'] = train['date'].apply(_yyyymmdd_to_date)

# Per day: 'size' counts all sessions, 'count' only those with non-null revenue.
cnt_srs = (
    train.groupby('date')['totals.transactionRevenue']
    .agg(['size', 'count'])
    .rename(columns={'size': "count", 'count': "count of non-zero revenue"})
    .sort_index()
)
trace1 = scatter_plot(cnt_srs["count"], 'red')
trace2 = scatter_plot(cnt_srs["count of non-zero revenue"], 'blue')

# Two stacked panels sharing one figure, one trace per panel.
fig = tools.make_subplots(
    rows=2,
    cols=1,
    vertical_spacing=0.08,
    subplot_titles=["Date - Count", "Date - Non-zero Revenue count"],
)
for panel_row, panel_trace in enumerate((trace1, trace2), start=1):
    fig.append_trace(panel_trace, panel_row, 1)
fig['layout'].update(height=800, width=800, paper_bgcolor='rgb(233,233,233)', title="Date Plots")
py.iplot(fig, filename='date-plots')

# 3. Data Preprocessing

In [None]:
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the latter is chained assignment and is not guaranteed to
# modify `train` itself.
train['trafficSource.adwordsClickInfo.isVideoAd'] = train['trafficSource.adwordsClickInfo.isVideoAd'].fillna(True)
train['trafficSource.isTrueDirect'] = train['trafficSource.isTrueDirect'].fillna(False)

# Remove columns with only one distinct value (NaN counted as a value).
cols_to_drop = [col for col in train.columns if train[col].nunique(dropna=False) == 1]
train = train.drop(columns=cols_to_drop)

# Only one non-null value; errors='ignore' keeps the cell re-runnable once the
# column is already gone.
train = train.drop(columns=['trafficSource.campaignCode'], errors='ignore')

train.head()

In [None]:
# Numeric columns (the revenue target included at this stage).
num_cols = ['visitNumber', 'totals.hits', 'totals.pageviews', 'totals.bounces', 'totals.newVisits', 'totals.transactionRevenue']

# Missing -> 0, cast to float, then compress the range with log1p.
for col in num_cols:
    train[col] = np.log1p(train[col].fillna(0).astype(float))

In [None]:
# Ad-related columns whose missing values are replaced by 0.
ad_cols_to_fill = [
    'trafficSource.adContent',
    'trafficSource.keyword',
    'trafficSource.adwordsClickInfo.adNetworkType',
    'trafficSource.adwordsClickInfo.gclId',
    'trafficSource.adwordsClickInfo.page',
    'trafficSource.adwordsClickInfo.slot',
]
for ad_col in ad_cols_to_fill:
    train[ad_col] = train[ad_col].fillna(0)

In [None]:
# Preview the frame after the numeric transforms and missing-value fills.
train.head()

In [None]:
# Number of distinct browsers and device categories, shown as a tuple.
train['device.browser'].nunique(),train['device.deviceCategory'].nunique()

In [None]:
# New column -> the two source columns joined with '_' to form it.
crossed_features = {
    'browser_category': ('device.browser', 'device.deviceCategory'),
    'browser_operatingSystem': ('device.browser', 'device.operatingSystem'),
    'source_country': ('trafficSource.source', 'geoNetwork.country'),
}
for crossed_name, (left_col, right_col) in crossed_features.items():
    train[crossed_name] = train[left_col] + '_' + train[right_col]

In [None]:
# Columns that are never used as model inputs (identifiers, timestamps, the target, ...).
no_use = ["date", "fullVisitorId", "sessionId", "visitId", "visitStartTime", 'totals.transactionRevenue', 'trafficSource.referralPath']
# Everything that is neither numeric nor excluded is treated as categorical.
not_categorical = set(num_cols).union(no_use)
cat_cols = [col for col in train.columns if col not in not_categorical]

## Label Encoding to convert strings to labels

In [None]:
# Per categorical column: the size later used for its embedding table.
max_values = {}
for col in cat_cols:
    print(col)
    lbl = LabelEncoder()
    # Encode the string form of every value as an integer label.
    col_as_text = list(train[col].values.astype('str'))
    train[col] = lbl.fit_transform(col_as_text)
    # Empirically, a size slightly larger than the true maximum works better.
    max_values[col] = train[col].max() + 2

In [None]:
# Show the per-column sizes (largest encoded label + 2) computed during label encoding.
max_values

In [None]:
# Categorical columns embedded by the first branch of the network
# (used to build emb_dims1 and the dataset's cat_cols1).
cat_col_labels1 = ["channelGrouping", "device.deviceCategory", "device.operatingSystem", "geoNetwork.continent",
                   "geoNetwork.subContinent", "trafficSource.adContent", "trafficSource.adwordsClickInfo.adNetworkType",
                   "trafficSource.adwordsClickInfo.isVideoAd", "trafficSource.adwordsClickInfo.page", "trafficSource.adwordsClickInfo.slot",
                   "trafficSource.campaign", "trafficSource.medium", "geoNetwork.region"]

# Categorical columns embedded by the second branch (emb_dims2 / cat_cols2).
# NOTE(review): "geoNetwork.region" is listed in both groups, so it gets two
# separate embeddings — confirm this duplication is intended.
cat_col_labels2 = ["browser_category", "browser_operatingSystem", "source_country", "device.browser", "geoNetwork.city",
                   "trafficSource.source", "trafficSource.keyword", "trafficSource.adwordsClickInfo.gclId", "geoNetwork.networkDomain",
                   "geoNetwork.country", "geoNetwork.metro", "geoNetwork.region"]

# 4. Split dataset into trainset and validation set

In [None]:
import datetime

train = train.sort_values('date')

# The 'date' column holds datetime.date objects, and ordering them against a
# pd.Timestamp is rejected by recent pandas. Compare on a datetime64 view
# instead; pd.to_datetime is a no-op if the column is already datetime64.
split_date = pd.Timestamp(2017, 5, 31)
session_dates = pd.to_datetime(train["date"])

# Time-based hold-out: train on sessions up to the split date, validate on later ones.
x_train = train[session_dates <= split_date]
x_val = train[session_dates > split_date]

y_train = x_train['totals.transactionRevenue']
y_val = x_val['totals.transactionRevenue']

# Drop identifiers, timestamps and the target from the feature frames.
x_train = x_train.drop(no_use, axis=1)
x_val = x_val.drop(no_use, axis=1)

In [None]:
# Exclude the target from the numeric feature list. Rebuilding the list keeps
# the cell re-runnable; list.remove() raises ValueError on the second run.
num_cols = [col for col in num_cols if col != "totals.transactionRevenue"]
num_cols

In [None]:
def _embedding_shape(table_size):
    """Return (table size, embedding width): half the size rounded up, capped at 50."""
    return (table_size, min((table_size + 1) // 2, 50))


# One (num_embeddings, embedding_dim) pair per categorical column of each branch.
emb_dims1 = [_embedding_shape(max_values[i]) for i in cat_col_labels1]
emb_dims2 = [_embedding_shape(max_values[i]) for i in cat_col_labels2]

# 5. Modeling

# 5.1  LGBM model

In [None]:
import lightgbm as lgb
# Wrap the time-split feature frames and their targets as LightGBM datasets.
train_set = lgb.Dataset(x_train, y_train)
valid_set = lgb.Dataset(x_val, y_val)


In [None]:
# LightGBM training parameters.
# The original dict defined 'metric' twice; Python keeps only the last value,
# so 'auc' was the effective metric and the earlier entry was dead. It is now
# stated once.
# NOTE(review): y_train is the log1p-transformed revenue, not a 0/1 label —
# confirm that a binary objective is really what is intended here.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'learning_rate': 0.3,
    'verbose': 0,
    'num_leaves': 90,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 1,
    'feature_fraction': 0.9,
    'feature_fraction_seed': 1,
    'max_bin': 256,
    'max_depth': 15,
    'num_rounds': 200,
}

%time model_f1 = lgb.train(params, train_set=train_set,  valid_sets=valid_set, verbose_eval=5)

# 5.1 Wide and Deep Model

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [None]:
class TabularDataset(Dataset):
    """Dataset serving a target, numeric features and two categorical groups."""

    def __init__(self, x_data, y_data, cat_cols1, cat_cols2, num_cols):
        """
        x_data:    pandas data frame holding all feature columns;
        y_data:    the target, stored as a float32 column vector;
        cat_cols1: list of string, categorical columns of the first group (stored as int64);
        cat_cols2: list of string, categorical columns of the second group (stored as int64);
        num_cols:  list of string, numeric columns (stored as float32).
        """
        self.cat_cols1 = cat_cols1
        self.cat_cols2 = cat_cols2
        self.num_cols = num_cols

        self.n = len(x_data)
        self.y = y_data.astype(np.float32).to_numpy().reshape(-1, 1)

        self.num_X = self._to_array(x_data, num_cols, np.float32)
        self.cat_X1 = self._to_array(x_data, cat_cols1, np.int64)
        self.cat_X2 = self._to_array(x_data, cat_cols2, np.int64)

    @staticmethod
    def _to_array(frame, columns, dtype):
        """Select `columns` from `frame` and return them as an array of `dtype`."""
        return frame[columns].astype(dtype).to_numpy()

    def print_data(self):
        """Return the stored arrays as (num_X, cat_X1, cat_X2, y)."""
        return self.num_X, self.cat_X1, self.cat_X2, self.y

    def __len__(self):
        """
        total number of samples
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of data: [target, numeric, categorical group 1, categorical group 2].
        """
        return [block[idx] for block in (self.y, self.num_X, self.cat_X1, self.cat_X2)]

In [None]:
class FeedForwardNN(nn.Module):
    """Two-branch embedding network for tabular data.

    Each categorical group is embedded and passed through its own two-layer
    branch; both branch outputs are concatenated with the numeric features and
    fed through two further linear layers and a final output layer.
    """

    def __init__(self, emb_dims1, emb_dims2, no_of_num, lin_layer_sizes, output_size, emb_dropout, lin_layer_dropouts):
        """
        emb_dims1:          List of (num_embeddings, embedding_dim) tuples, one per
                            categorical column of the first branch;
        emb_dims2:          Same, for the second branch;
        no_of_num:          Integer, the number of continuous features in the data;
        lin_layer_sizes:    List of integers; indices 0-3 and -1 are read, so four
                            sizes are expected;
        output_size:        Integer, the size of the final output;
        emb_dropout:        Float, dropout applied after the first linear layer of
                            each branch;
        lin_layer_dropouts: List of floats; indices 0, 1 and 2 are used.
        """
        super().__init__()

        # Embedding layers, one per categorical column.
        self.emb_layers1 = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims1])
        self.emb_layers2 = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims2])

        # Total embedding width per branch; this is the input size of each
        # branch's first linear layer.
        self.no_of_embs1 = sum([y for x, y in emb_dims1])
        self.no_of_embs2 = sum([y for x, y in emb_dims2])
        self.no_of_num = no_of_num

        # Branch 1
        self.branch1 = nn.Linear(self.no_of_embs1, lin_layer_sizes[0])
        self.branch1_2 = nn.Linear(lin_layer_sizes[0], lin_layer_sizes[1])
        nn.init.kaiming_normal_(self.branch1.weight.data)
        nn.init.kaiming_normal_(self.branch1_2.weight.data)

        # Branch 2 (twice as wide as branch 1)
        self.branch2 = nn.Linear(self.no_of_embs2, lin_layer_sizes[0] * 2)
        self.branch2_2 = nn.Linear(lin_layer_sizes[0] * 2, lin_layer_sizes[1] * 2)
        nn.init.kaiming_normal_(self.branch2.weight.data)
        nn.init.kaiming_normal_(self.branch2_2.weight.data)

        # Main trunk: its input is branch 1 (lin_layer_sizes[1]) + branch 2
        # (2 * lin_layer_sizes[1]) + the numeric features.
        self.main_layer1 = nn.Linear(lin_layer_sizes[1] * 3 + self.no_of_num, lin_layer_sizes[2])
        self.main_layer2 = nn.Linear(lin_layer_sizes[2], lin_layer_sizes[3])

        # Batch normalisation
        self.branch_bn_layers1 = nn.BatchNorm1d(lin_layer_sizes[0])
        self.branch_bn_layers2 = nn.BatchNorm1d(lin_layer_sizes[0] * 2)
        self.main_bn_layer = nn.BatchNorm1d(lin_layer_sizes[2])

        # Dropout layers
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.dropout_layers = nn.ModuleList([nn.Dropout(size) for size in lin_layer_dropouts])

        # Output layer
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)
        nn.init.kaiming_normal_(self.output_layer.weight.data)

    def forward(self, num_data, cat_data1, cat_data2):
        """Run the network on one batch.

        num_data:  float tensor with one column per numeric feature;
        cat_data1: integer labels, one column per embedding in ``emb_layers1``;
        cat_data2: integer labels, one column per embedding in ``emb_layers2``.
        Returns the output of the final linear layer.
        """
        # torch.as_tensor returns tensors unchanged and converts array inputs.
        # The previous torch.tensor(existing_tensor) call copied the data on
        # every batch and emitted a "copy construct" UserWarning.
        cat_data1 = torch.as_tensor(cat_data1)
        cat_data2 = torch.as_tensor(cat_data2)

        # Branch 1: embed every categorical column and concatenate the results.
        x1 = [emb_layer(cat_data1[:, i]) for i, emb_layer in enumerate(self.emb_layers1)]
        x1 = torch.cat(x1, 1)

        x1 = self.emb_dropout_layer(F.relu(self.branch1(x1)))
        x1 = self.branch_bn_layers1(x1)
        x1 = self.dropout_layers[0](F.relu(self.branch1_2(x1)))

        # Branch 2: same structure; dropout_layers[0] is shared with branch 1.
        x2 = [emb_layer(cat_data2[:, i]) for i, emb_layer in enumerate(self.emb_layers2)]
        x2 = torch.cat(x2, 1)

        x2 = self.emb_dropout_layer(F.relu(self.branch2(x2)))
        x2 = self.branch_bn_layers2(x2)
        x2 = self.dropout_layers[0](F.relu(self.branch2_2(x2)))

        # Main trunk over both branch outputs plus the numeric features.
        main = torch.cat([x1, x2, num_data], 1)

        main = self.dropout_layers[1](F.relu(self.main_layer1(main)))
        main = self.main_bn_layer(main)
        main = self.dropout_layers[2](F.relu(self.main_layer2(main)))

        out = self.output_layer(main)
        return out

In [None]:
# Column groups shared by the training and validation datasets.
dataset_columns = dict(cat_cols1=cat_col_labels1, cat_cols2=cat_col_labels2, num_cols=num_cols)
train_dataset = TabularDataset(x_data=x_train, y_data=y_train, **dataset_columns)
val_dataset = TabularDataset(x_data=x_val, y_data=y_val, **dataset_columns)

In [None]:
batchsize = 64
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batchsize, shuffle=True, num_workers=0)
# Validation needs no shuffling: a fixed order keeps the reported validation
# metrics repeatable between evaluations. It reuses `batchsize` rather than a
# second hard-coded 64.
val_dataloder = DataLoader(val_dataset, batchsize, shuffle=False, num_workers=0)

In [None]:
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Network hyper-parameters, gathered in one place.
model_config = dict(
    emb_dims1=emb_dims1,
    emb_dims2=emb_dims2,
    no_of_num=len(num_cols),
    lin_layer_sizes=[128, 64, 32, 16],
    output_size=1,
    lin_layer_dropouts=[0.1, 0.1, 0.05],
    emb_dropout=0.05,
)
model = FeedForwardNN(**model_config)
model = model.to(device)

## Training

In [None]:
no_of_epochs = 5
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
total_data = len(train_dataset)

print_every = 500          # evaluate on the validation set every N training steps
steps = 0
running_loss = 0
best_val_loss = float("inf")
best_val_score = 0         # thresholded accuracy of the best checkpoint so far
best_model = None

for epoch in range(no_of_epochs):
    model.train()
    for index, datas in enumerate(train_dataloader):
        steps += 1
        y, num_x, cat_x1, cat_x2 = (tensor.to(device) for tensor in datas)

        # Forward / backward pass. Calling the module (not .forward) keeps
        # nn.Module hooks working.
        optimizer.zero_grad()
        preds = model(num_x, cat_x1, cat_x2)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        if steps % print_every == 0:
            val_loss = 0
            val_hits = 0
            val_seen = 0
            model.eval()
            with torch.no_grad():
                for val_datas in val_dataloder:
                    val_y, val_num_x, val_cat_x1, val_cat_x2 = (tensor.to(device) for tensor in val_datas)

                    out = model(val_num_x, val_cat_x1, val_cat_x2)
                    val_loss += criterion(out, val_y).item()
                    val_hits += ((out > 0.5) == val_y).sum().item()
                    val_seen += len(out)

            mean_val_loss = val_loss / len(val_dataloder)
            val_acc = val_hits / max(val_seen, 1)

            # Decide on the checkpoint once per evaluation, using the loss over
            # the whole validation set. The previous code compared the accuracy
            # of each single validation batch, so one lucky batch could fix the
            # "best" model for the rest of the run.
            if mean_val_loss < best_val_loss:
                best_val_loss = mean_val_loss
                best_val_score = val_acc
                torch.save(model, "checkpoint.pt")

            print(f"Epoch {epoch+1}/{no_of_epochs}.."
                  f"Train loss:{running_loss/print_every:.3f}.."
                  f"Validation loss:{mean_val_loss:.3f}..")
            running_loss = 0
            model.train()

# 5.2 LGBM model