In [None]:
# Import libraries 
import pandas as pd 
import numpy as np
import os
import time
from dask import dataframe as dd
import json
from pandas import json_normalize
from datetime import datetime
from datetime import timedelta
import csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score 
import lightgbm as lgb

# Raise pandas' display limits so wide frames and long listings are not truncated in cell output.
for option_name, limit in (('display.max_columns', 500), ('display.max_rows', 100)):
    pd.set_option(option_name, limit)

# Brief task description and solution approach
- The dataset is too big to be read in one piece with pandas, so Dask is used to read the initial data.
- The goal is to predict the total transaction revenue per user over a two-month period. The initial data contains one row per user visit, so we need to group it per user.
- In the train set we need to recreate the same conditions as in the test set: 168 days of available data, followed by a 46-day gap; the target is the sum of transaction revenue per user (for users present in the 168-day period) during the 62 days after the gap.
- To construct the training data, the following steps were performed:
    - concatenate training and test sets (since testing data contains all necessary columns, this way we can increase volume of training data);
    - find first date in the combined dataset and select 4 non-overlapping periods of 168 days each;
    - calculate the target for each of the four 168-day periods: find the unique Visitor IDs of the users in the period, select these users' data for the 62 days that follow the 46-day gap, and sum up the transaction revenue per user over those 62 days. For users who are not found in the 62-day period, set the target to 0. 
- The trick used in the winner's solution is model stacking: a classification model predicts the probability that a user returns during the 62-day period, and a regression model is trained only on data for returned users. The final prediction is the probability that the user returns multiplied by the revenue predicted by the regression model. 

# 1. Read the data

In the hidden cells we define a function to read the data and parse json columns. You can view it by pressing "Show hidden code".

In [None]:
def parse_json(file_path, file_name):
    """Read a raw CSV with Dask, flatten its JSON columns and cache the result as CSV.

    Args:
        file_path: Path of the raw CSV file to read.
        file_name: Prefix of the output file; the result is written to
            '<file_name>_before_aggregating.csv' in the working directory.

    Returns:
        The computed frame (result of ``.compute()``) in which the columns
        'device', 'geoNetwork', 'totals' and 'trafficSource' are replaced by
        '<column>.<subcolumn>' columns and a fixed list of columns is dropped.

    Side effects:
        Writes the CSV described above and prints progress messages and the
        elapsed time in minutes.
    """
    start = time.time()
    # The 'date' column is parsed from compact "YYYYMMDD" strings.
    custom_date_parser = lambda x: datetime.strptime(x, "%Y%m%d")

    # Read only the header row so the full column list is known before loading.
    with open(file_path, 'r') as f:
        header = next(csv.reader(f))
        
    # Load every column except 'hits'; visitor ids are kept as strings.
    df = dd.read_csv(file_path, 
                  parse_dates=['date'],
                  date_parser=custom_date_parser,
                  usecols = list(set(header) - {'hits'}),
                  dtype={'fullVisitorId':'str'})
    
    # 'Region' is the text after the last "'value': '" marker in customDimensions,
    # with the final three characters cut off.
    df['Region'] = df.customDimensions.map(lambda x: x.split("'value': '")[-1][0:-3])
    df = df.drop('customDimensions', axis=1)
    
    print('Starting parsing JSON columns')
    # Parse columns with JSON values and add them as separate column to the main table
    JSON_columns = ['device', 'geoNetwork', 'totals', 'trafficSource']

    for col in JSON_columns:
        df[col] = df[col].map(lambda x: json.loads(x), meta=('','object'))
        new_df = json_normalize(df[col])
        new_df.columns = [f"{col}.{subcolumn}" for subcolumn in new_df.columns]
        # NOTE(review): the expanded columns are joined back purely by index —
        # confirm that the index of the Dask frame lines up row-for-row with the
        # index produced by json_normalize.
        df = df.merge(new_df, right_index=True, left_index=True)
        df = df.drop(col, axis=1)
    
    print('Finished parsing JSON columns')
    # These columns have the same values, so it's decided to drop them
    df = df.drop(['device.browserVersion', 'device.browserSize', 'device.operatingSystemVersion', 
         'device.mobileDeviceBranding', 'device.mobileDeviceModel', 'device.mobileInputSelector',
        'device.mobileDeviceInfo', 'device.mobileDeviceMarketingName', 'device.flashVersion',
        'device.language', 'device.screenColors', 'device.screenResolution', 'geoNetwork.cityId', 
         'geoNetwork.networkLocation', 'trafficSource.adwordsClickInfo.criteriaParameters', 'geoNetwork.latitude', 
        'geoNetwork.longitude', 'socialEngagementType'], axis=1)
    
    # Materialise the lazy frame, then cache it so later runs can skip the parsing.
    df=df.compute()
    df.to_csv('{}_before_aggregating.csv'.format(file_name), index=False)
    end = time.time()
    print('Time spent on processing {} mins'.format((end - start)/60))
    return df

In [None]:
# These cells are commented since reading the files and parsing JSON columns takes a lot of time
# In the cell below we read the result of function run from .csv file

#test_raw = parse_json('/kaggle/input/ga-customer-revenue-prediction/test_v2.csv', file_name='test_raw')
#train_raw = parse_json('/kaggle/input/ga-customer-revenue-prediction/train_v2.csv', file_name='train_raw')

In [None]:
# Read the result of running the function "parse_json".
# The cached files store `date` as ISO "YYYY-MM-DD" strings, which `parse_dates` handles
# natively; the deprecated `date_parser` argument (a per-row strptime callback) is not needed.
PROCESSED_DIR = '/kaggle/input/ga-competition-processed-data'


def read_processed_csv(file_name):
    """Load one cached, JSON-flattened visits file from PROCESSED_DIR.

    Args:
        file_name: Name of the CSV file inside PROCESSED_DIR.

    Returns:
        DataFrame with `date` parsed as datetimes and `fullVisitorId` kept as
        strings.
    """
    return pd.read_csv('{}/{}'.format(PROCESSED_DIR, file_name),
                       parse_dates=['date'],
                       dtype={'fullVisitorId': 'str'})


test_raw = read_processed_csv('test_raw_before_aggregating.csv')
train_raw = read_processed_csv('train_raw_before_aggregating.csv')

# 2. Brief EDA

In [None]:
# Combine the datasets received on the previous step.
# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the row labels
# of train_raw and test_raw are carried over and the combined index contains duplicates.
df = pd.concat([train_raw, test_raw], ignore_index=True)

In [None]:
# Strings that should be treated as missing values; each one is replaced by NaN in `df`
Nulls = [
    '(not set)',
    'not available in demo dataset',
    '(not provided)',
    'unknown.unknown',
    '/',
    'Not Socially Engaged',
    'not set',
    '(none)',
]
for placeholder in Nulls:
    df.replace(to_replace=placeholder, value=np.nan, inplace=True)

In [None]:
# Percentage of missing values per column, rounded to whole percents.
# The rounding precision belongs to round(); previously the `0` sat outside the round() call,
# was passed to print() as a second argument, and showed up as a stray "0" after the table.
print('Check proportion of null values')
print(round(df.isna().sum() / len(df) * 100, 0))

In [None]:
# Some rows have a non-null transaction count but a null revenue.
# We assume this can happen in reality, for example because of promo campaigns.
revenue_missing = df['totals.transactionRevenue'].isna()
has_transactions = df['totals.transactions'].notnull()
df.loc[revenue_missing & has_transactions, ['totals.transactionRevenue', 'totals.transactions']].head(10)

In [None]:
# Show the value distribution of every column that has fewer than 10 distinct values
low_cardinality_cols = [name for name in df.columns if df[name].nunique() < 10]
for name in low_cardinality_cols:
    # The ANSI escape codes print the column name in bold
    print('\033[1m' + name + ':' + '\033[0m')
    print(df[name].value_counts())

In [None]:
# Cast the columns used in the per-visitor aggregation to numeric / boolean dtypes

def convert_to_num(df):
    """Cast the count, flag and 'totals.*' columns of `df` to numeric/bool dtypes.

    The frame is modified in place and the same object is returned.

    Args:
        df: Visit-level frame containing the 'totals.*' and 'trafficSource.*'
            columns referenced below.

    Returns:
        The same frame, with the columns converted.
    """
    # Counters: a missing value is stored as 0 before the integer cast
    for counter_col in ('totals.bounces', 'totals.newVisits'):
        df[counter_col] = df[counter_col].fillna(0).astype('int')

    # Flags: isTrueDirect treats a missing value as False
    direct_flag = df['trafficSource.isTrueDirect'].fillna(False)
    df['trafficSource.isTrueDirect'] = direct_flag.astype('bool')
    # NOTE(review): unlike isTrueDirect, isVideoAd is cast without filling missing
    # values first - confirm that the way NaN maps to bool here is intended.
    video_flag = df['trafficSource.adwordsClickInfo.isVideoAd']
    df['trafficSource.adwordsClickInfo.isVideoAd'] = video_flag.astype('bool')

    # Measures: plain float cast, missing values stay NaN
    float_cols = (
        'totals.timeOnSite',
        'totals.hits',
        'totals.pageviews',
        'totals.sessionQualityDim',
        'totals.transactions',
        'totals.transactionRevenue',
        'totals.totalTransactionRevenue',
        'totals.visits',
    )
    for measure_col in float_cols:
        df[measure_col] = df[measure_col].astype('float')
    return df

In [None]:
# Convert columns to numeric
# `df` is the combined train + test frame; `test_raw` is converted on its own as well.
# NOTE(review): the placeholder-to-NaN replacement earlier in the notebook is applied to `df`
# only, so `test_raw` may still contain placeholder strings such as '(not set)' - confirm
# that this is intended.
df = convert_to_num(df)
test_raw = convert_to_num(test_raw)

# 3. Construct the train set by analogy with the test set and group by VisitorID

In the hidden cells we create a dictionary of columns to group with aggregation functions and define a function to construct training set from grouped data for necessary periods.

In [None]:
# Aggregation spec used with `groupby('fullVisitorId').agg(group_dict)`.
# Each key is a visit-level column; each value is a list of (output_column_name, function)
# pairs, and every function receives that column's values for one visitor.
#
# The two 'date' entries that compute distances to the period start/end read the module-level
# names `df_mindate` and `df_maxdate` when they are called, so both names must be assigned
# before the aggregation runs (getTimeFramewithFeatures sets them via `global`).
#
# The output name 'unqiue_date_num' is misspelled but is kept as is, because it becomes a
# column name of the aggregated frame.
# NOTE(review): the *_std entries use np.std rather than Series.std - confirm that the
# resulting degrees-of-freedom convention is the intended one.
group_dict = {
            # Geography: one representative (max) non-null value per visitor
            'geoNetwork.networkDomain': [('networkDomain', lambda x: x.dropna().max())],
            'geoNetwork.city': [('city', lambda x: x.dropna().max())],
            'geoNetwork.metro': [('metro', lambda x: x.dropna().max())],
            'geoNetwork.region': [('geo_region', lambda x: x.dropna().max())],
            'geoNetwork.country': [('country', lambda x: x.dropna().max())],
            'geoNetwork.continent': [('continent', lambda x: x.dropna().max())],
            'geoNetwork.subContinent': [('subContinent', lambda x: x.dropna().max())],

            # Device: max for labels, mean for the boolean flag
            'device.operatingSystem': [('operatingSystem', lambda x: x.dropna().max())],
            'device.isMobile': [('isMobile', lambda x: x.dropna().mean())],
            'device.deviceCategory': [('deviceCategory', lambda x: x.dropna().max())],
            'device.browser': [('browser', lambda x: x.dropna().max())],
        
            # Traffic source: max for labels, mean for the boolean flags
            'trafficSource.source': [('source', lambda x: x.dropna().max())],
            'trafficSource.medium': [('medium', lambda x: x.dropna().max())],
            'trafficSource.isTrueDirect': [('isTrueDirect', lambda x: x.dropna().mean())],
            'trafficSource.adwordsClickInfo.isVideoAd': [('isVideoAd', lambda x: x.dropna().mean())],
    
            'channelGrouping': [('channelGrouping', lambda x: x.dropna().max())],
            # Date features: offsets relative to df_mindate / df_maxdate, span between the first
            # and last visit, and the number of distinct visit dates
            'date': [('first_ses_from_the_period_start', lambda x: x.dropna().min() - df_mindate),
                     ('last_ses_from_the_period_end', lambda x: df_maxdate - x.dropna().max()),
                     ('interval_dates', lambda x: x.dropna().max() - x.dropna().min()),
                     ('unqiue_date_num', lambda x: len(set(x.dropna())) )], 
            'visitNumber': [('visitNumber', lambda x: x.dropna().max())],
            # Number of non-null visit start times for the visitor
            'visitStartTime': [('visitStartTime', lambda x: x.dropna().count())],
            'Region': [('Region', lambda x: x.dropna().max())],
        
            # Session totals: summary statistics over the visitor's sessions
            'totals.bounces': [('bounces', lambda x: x.dropna().sum())],
            'totals.timeOnSite': [('timeOnSite_sum', lambda x: x.dropna().sum()),
                                  ('timeOnSite_min', lambda x: x.dropna().min()), 
                                  ('timeOnSite_max', lambda x: x.dropna().max()),
                                  ('timeOnSite_mean', lambda x: x.dropna().mean()),
                                 ('timeOnSite_median', lambda x: x.dropna().median()),
                                 ('timeOnSite_std', lambda x: np.std(x.dropna()))],
            'totals.pageviews': [('pageviews_sum', lambda x: x.dropna().sum()),
                                 ('pageviews_min', lambda x: x.dropna().min()), 
                                 ('pageviews_max', lambda x: x.dropna().max()),
                                 ('pageviews_mean', lambda x: x.dropna().mean()),
                                ('pageviews_median', lambda x: x.dropna().median()),
                                ('pageviews_std', lambda x: np.std(x.dropna()))],
            'totals.hits': [('hits_sum', lambda x: x.dropna().sum()), 
                            ('hits_min', lambda x: x.dropna().min()), 
                            ('hits_max', lambda x: x.dropna().max()), 
                            ('hits_mean', lambda x: x.dropna().mean()),
                           ('hits_median', lambda x: x.dropna().median()),
                           ('hits_std', lambda x: np.std(x.dropna()))],
            'totals.sessionQualityDim': [('sessionQualityDim_sum', lambda x: x.dropna().sum()),
                                        ('sessionQualityDim_mean', lambda x: x.dropna().mean()),
                                        ('sessionQualityDim_median', lambda x: x.dropna().median()),
                                        ('sessionQualityDim_min', lambda x: x.dropna().min()),
                                        ('sessionQualityDim_max', lambda x: x.dropna().max()),
                                        ('sessionQualityDim_std', lambda x: np.std(x.dropna()))],
            'totals.newVisits': [('newVisits', lambda x: x.dropna().max())],
            # Purchase history inside the feature period
            'totals.transactionRevenue':  [('transactionRevenue_sum', lambda x:x.dropna().sum()),
                                          ('transactionRevenue_mean', lambda x:x.dropna().mean()),
                                          ('transactionRevenue_median', lambda x:x.dropna().median()),
                                          ('transactionRevenue_min', lambda x:x.dropna().min()),
                                          ('transactionRevenue_max', lambda x:x.dropna().max()),
                                          ('transactionRevenue_std', lambda x: np.std(x.dropna()))],
            'totals.totalTransactionRevenue':  [('totalTransactionRevenue_max', lambda x:x.dropna().max())],
            'totals.transactions' : [('transactions_sum', lambda x:x.dropna().sum()),
                                    ('transactions_mean', lambda x:x.dropna().mean()),
                                    ('transactions_median', lambda x:x.dropna().median()),
                                    ('transactions_min', lambda x:x.dropna().min()),
                                    ('transactions_max', lambda x:x.dropna().max()),
                                    ('transactions_std', lambda x:np.std(x.dropna()))]    
}

In [None]:
def getTimeFramewithFeatures(tr, k=1, period_days=168, gap_days=46, target_days=62):
    """Build one training fold: per-visitor features for period `k` plus its target.

    The feature period is the k-th consecutive block of `period_days` days, counted
    from the earliest date in `tr`. The target window starts `gap_days` days after the
    end of the feature period and lasts `target_days` days.

    Args:
        tr: Visit-level frame with at least 'date', 'fullVisitorId',
            'totals.transactionRevenue' and every column named in `group_dict`.
        k: 1-based number of the feature period.
        period_days: Length of the feature period in days.
        gap_days: Length of the gap between feature period and target window in days.
        target_days: Length of the target window in days.

    Returns:
        Frame of the `group_dict` aggregates per visitor of the feature period,
        merged on 'fullVisitorId' with two target columns:
        'sum_revenue_target' (summed 'totals.transactionRevenue' in the target window,
        0 for visitors absent from it) and 'ret' (1 if the visitor appears in the
        target window, otherwise 0).

    Side effects:
        Sets the module-level names `df_mindate` and `df_maxdate` to the first and last
        date of the feature period's data (the 'date' aggregations in `group_dict` read
        them) and prints the elapsed time in minutes.
    """
    global df_maxdate, df_mindate

    start = time.time()

    # The window boundaries are computed once, with the vectorised Series.min(); the
    # earliest date used to be recomputed with the builtin min() for every comparison.
    first_date = tr['date'].min()
    period_start = first_date + timedelta(days=period_days * (k - 1))
    period_end = first_date + timedelta(days=period_days * k)
    target_start = period_end + timedelta(days=gap_days)
    target_end = target_start + timedelta(days=target_days)

    in_period = (tr['date'] >= period_start) & (tr['date'] < period_end)
    in_target = (tr['date'] >= target_start) & (tr['date'] < target_end)

    tf = tr.loc[in_period]
    period_ids = set(tf['fullVisitorId'])
    target_ids = set(tr.loc[in_target, 'fullVisitorId'])

    # Returned visitors: seen in the feature period and again in the target window.
    returned_ids = period_ids & target_ids
    tf_tst = tr[in_target & tr['fullVisitorId'].isin(returned_ids)]

    tf_target = tf_tst.groupby('fullVisitorId').agg({
                                    'totals.transactionRevenue': [('sum_revenue_target',  'sum')] })
    tf_target.columns = tf_target.columns.droplevel()
    tf_target.reset_index(inplace=True)
    tf_target['ret'] = 1

    # Visitors of the feature period that never show up in the target window get target 0.
    tf_nonret = pd.DataFrame()
    tf_nonret['fullVisitorId'] = list(period_ids - target_ids)
    tf_nonret['sum_revenue_target'] = 0
    tf_nonret['ret'] = 0

    tf_target = pd.concat([tf_target, tf_nonret], axis=0).reset_index(drop=True)

    # These must be assigned before the aggregation: the 'date' lambdas in group_dict
    # look them up when they are called.
    df_maxdate = tf['date'].max()
    df_mindate = tf['date'].min()

    tf = tf.groupby('fullVisitorId').agg(group_dict)
    # Keep only the output names chosen in group_dict as column labels.
    tf.columns = tf.columns.droplevel()

    tf = pd.merge(tf, tf_target, on='fullVisitorId')
    end = time.time()
    print('Time spent on processing {} mins'.format((end - start)/60))

    return tf

In [None]:
# In this cell we get train data by parts. Cell is commented since execution takes approximately 70 mins for one part. 
# We will read the results of this function run from .csv file later

#train_1 = getTimeFramewithFeatures(df, k=1)
#train_1.to_csv('train_pivot_1.csv')

#train_2 = getTimeFramewithFeatures(df, k=2)
#train_2.to_csv('train_pivot_2.csv')

#train_3 = getTimeFramewithFeatures(df, k=3)
#train_3.to_csv('train_pivot_3.csv')

#train_4 = getTimeFramewithFeatures(df, k=4)
#train_4.to_csv('train_pivot_4.csv')

# 4. Construct test set and group by VisitorID

In [None]:
# Cell is commented since execution takes approximately 70 mins
# We will read the results of this cell run from .csv file later

# df_maxdate = max(df['date'])
# df_mindate = min(df['date'])

# start = time.time()

# test = test_raw.groupby('fullVisitorId').agg(group_dict)
# test.columns = test.columns.droplevel()
# test['sum_revenue_target'] = np.nan
# test['ret'] = np.nan
# test = test.reset_index()

# end = time.time()
# print('Time spent on processing {} mins'.format((end - start)/60))
# test.to_csv('test_5.csv')

# 6. Combine all data

In [None]:
# Cell is commented since grouping of datasets for concatenation takes a lot of time.  
# We will read the results from previously prepared .csv file.

# train_all = pd.concat([train_1, train_2, train_3, train_4, test], axis=0, sort=False).reset_index(drop=True)

# train_all['interval_dates'] = train_all['interval_dates'].dt.days
# train_all['first_ses_from_the_period_start'] = train_all['first_ses_from_the_period_start'].dt.days
# train_all['last_ses_from_the_period_end'] = train_all['last_ses_from_the_period_end'].dt.days
# train_all['sum_revenue_target'] = np.log1p(train_all['sum_revenue_target'])

# train_all.to_csv('train_and_test_clean.csv')

In [None]:
# Load the previously generated per-visitor dataset (train folds + test rows)
grouped_data_path = '/kaggle/input/ga-competition-processed-and-grouped-data/train_and_test.csv'
# 'Unnamed: 0' is dropped right away - presumably the row index saved with the file
train_all = pd.read_csv(grouped_data_path, dtype={'fullVisitorId':'str'}).drop('Unnamed: 0', axis=1)

In [None]:
# Categorical columns that contain nulls get the marker value '999'
null_counts = train_all.isna().sum()
incomplete_cols = null_counts[null_counts > 0].index
cat_cols_to_fill = train_all[incomplete_cols].select_dtypes(include='object').columns
train_all[cat_cols_to_fill] = train_all[cat_cols_to_fill].fillna('999')
# '(not set)' was missed on the previous step; it is also considered a null value
train_all['networkDomain'] = train_all['networkDomain'].replace('(not set)', '999')

# Numerical columns that contain nulls get 0; the two target columns are left untouched
remaining_nulls = train_all.isna().sum()
numeric_incomplete = train_all.loc[:, remaining_nulls > 0].select_dtypes(include=['int', 'float']).columns
num_cols_to_fill = [name for name in numeric_incomplete if name not in ('sum_revenue_target', 'ret')]
train_all[num_cols_to_fill] = train_all[num_cols_to_fill].fillna(0)

In [None]:
# Replace values of the 'networkDomain' column that occur fewer than 30 times with 'other'.
# The counts are computed once and all rare values are rewritten in a single vectorised pass,
# instead of running one full-column `replace` per rare value.
MIN_DOMAIN_COUNT = 30

domain_counts = train_all['networkDomain'].value_counts()
values_to_replace = domain_counts[domain_counts < MIN_DOMAIN_COUNT].index

is_rare_domain = train_all['networkDomain'].isin(values_to_replace)
train_all.loc[is_rare_domain, 'networkDomain'] = 'other'

# 7. Model building

In [None]:
# The two targets and the visitor key are excluded from the feature frame
target_cols = ['sum_revenue_target', 'ret', 'fullVisitorId']
data = train_all.drop(target_cols, axis=1)

In [None]:
# Split the feature columns by dtype: text/categorical ones are ordinal-encoded,
# numeric and boolean ones are standardised
cat_cols = data.select_dtypes(include=["object", "category"]).columns
num_cols = data.select_dtypes(include=["number", "bool"]).columns

encoding_steps = [
    ("cat", OrdinalEncoder(), cat_cols),
    ('num', StandardScaler(), num_cols),
]
transformer = ColumnTransformer(transformers=encoding_steps)

# Fit on the full feature frame and transform it in one go
transformed = transformer.fit_transform(data)

In [None]:
# Combine transformed data with target variables.
# ColumnTransformer emits its output in transformer order - every column of `cat_cols` first,
# then every column of `num_cols` - not in the original order of `data.columns`, so the
# transformed matrix has to be labelled in that order for the feature names to be correct.
feature_names = list(cat_cols) + list(num_cols)
# Reusing train_all's index keeps the feature rows aligned with the target rows in the concat.
features_transformed = pd.DataFrame(transformed, columns=feature_names, index=train_all.index)
data_transformed = pd.concat((train_all[['sum_revenue_target', 'ret']], features_transformed), axis=1)

In [None]:
# Rows with a known target form the train set; rows without one form the test set
has_target = data_transformed['sum_revenue_target'].notnull()
train = data_transformed[has_target]
test = data_transformed[~has_target]

In [None]:
# Build the two LightGBM datasets from the same feature matrix
train_features = train.drop(['sum_revenue_target', 'ret'], axis=1)
returned_rows = train['ret']==1

# Classifier data: every training row, labelled with the "returned" flag
dtrain_all = lgb.Dataset(train_features, label=train['ret'], params={'verbose': -1})
# Regressor data: only the rows of returned users, labelled with the revenue target
dtrain_ret = lgb.Dataset(train_features[returned_rows],
                         label=train['sum_revenue_target'][returned_rows],
                         params={'verbose': -1})

In [None]:
# Settings shared by both models; each model gets its own copy, so seeds added to one
# dict later do not leak into the other
_shared_lgb_params = {
    "max_bin": 256,
    "min_child_samples": 1,
    "learning_rate": 0.005,
    "bagging_fraction": 0.9,
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "verbose": -1,
}

# Model 1: binary classifier
params_lgb1 = dict(_shared_lgb_params, objective="binary", metric="binary_logloss", num_leaves=15)

# Model 2: regressor
params_lgb2 = dict(_shared_lgb_params, objective="regression", metric="rmse", num_leaves=9)

In [None]:
# Train N_RUNS differently-seeded (classifier, regressor) pairs and average their combined
# predictions. The run count is defined once, so the loop length and the divisor of the
# final average cannot drift apart.
N_RUNS = 10

# The test feature matrix is identical for every run, so it is built once.
test_features = test.drop(['sum_revenue_target', 'ret'], axis=1)

# Initialize array for results storing
pred_lgb_sum  = np.zeros(test.shape[0],)

for i in range(N_RUNS):
    # Model 1: probability that the user returns in the target window
    params_lgb1['feature_fraction_seed'] = 0+i
    params_lgb1['bagging_seed'] = 1+i
    lgb_model1 = lgb.train(params_lgb1, dtrain_all, num_boost_round=1200)
    pr_lgb = lgb_model1.predict(test_features, num_iteration=lgb_model1.best_iteration)

    # Model 2: revenue target, trained on returned users only
    params_lgb2['feature_fraction_seed'] = 0+i
    params_lgb2['bagging_seed'] = 1+i
    lgb_model2 = lgb.train(params_lgb2, dtrain_ret, num_boost_round=368)
    pr_lgb_ret = lgb_model2.predict(test_features, num_iteration=lgb_model2.best_iteration)

    # Stacked prediction of this run: return probability times predicted revenue
    pred_lgb_sum = pred_lgb_sum + pr_lgb*pr_lgb_ret

    # Reported only after the run has finished, so the message is true when it appears.
    print('Interation number', i, 'completed.')

pred_final = pred_lgb_sum/N_RUNS

In [None]:
# Sanity check: the smallest averaged prediction should not be negative
lowest_prediction = pred_final.min()
print(lowest_prediction)

# 8. Prepare submission file 

In [None]:
# Test rows are the ones without a target; their visitor ids are paired with the
# averaged predictions in the same row order
test_rows = train_all['sum_revenue_target'].isnull()
submission = pd.DataFrame({'fullVisitorId': train_all.loc[test_rows, 'fullVisitorId']})
submission['PredictedLogRevenue'] = pred_final

In [None]:
# Write the submission without the frame's index column
submission_path = 'submission.csv'
submission.to_csv(submission_path, index=False)