In [43]:
# import packages

import gc
import numpy as np
import pandas as pd
pd.options.display.float_format = '{:20,.2f}'.format # disabling scientific notation in pandas
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb

gc.enable()

In [2]:
# Load the pre-filtered visit-level data. fullVisitorId is read as a string so
# that ids with leading zeros (e.g. 0370352037468604196) are kept intact.
data = pd.read_csv("data/train_filtered.csv",dtype={'fullVisitorId':str}) # no dedicated test set exists, so this file is all the data available

In [5]:
# Derive the two targets from the raw revenue column, then remove the raw column.
#   revenue_generated: 1 when the visit has positive revenue, else 0
#   revenue_amount:    raw revenue divided by 10**6
# NOTE(review): presumably the raw value is stored in micro-units (x 10^6) --
# confirm against the Google Analytics export schema.
data['revenue_generated'] = data['totals_transactionRevenue'].gt(0).astype(int)
data['revenue_amount'] = data['totals_transactionRevenue'].div(10**6)
del data['totals_transactionRevenue']

In [6]:
# TODO: move this into the preprocessing step. The feature has a constant value
# and therefore carries no information.
del data['trafficSource_adwordsClickInfo.isVideoAd']

Add time features

In [7]:
# Parse the YYYYMMDD visit date and the epoch-seconds visit start time.
data['date'] = pd.to_datetime(data['date'], format='%Y%m%d')
data['date_time'] = pd.to_datetime(data['visitStartTime'], unit='s')

# Calendar features are all derived from the visit start timestamp.
# NOTE(review): `date_time` can fall on a different calendar day than `date`
# (see the sorted preview below) -- presumably a timezone difference; confirm.
data['date_dow'] = data['date_time'].dt.weekday   # Monday=0 ... Sunday=6
data['date_hours'] = data['date_time'].dt.hour
data['date_dom'] = data['date_time'].dt.day
data['month'] = data['date_time'].dt.month

In [13]:
# Display-only preview ordered by date; the result is not assigned, so `data`
# itself keeps its original row order.
data.sort_values('date') # there is too little data for a date-wise split -- the hold-out would then cover only about a month

Unnamed: 0,channelGrouping,date,fullVisitorId,visitNumber,visitStartTime,totals_bounces,totals_hits,totals_newVisits,totals_pageviews,device_browser,...,trafficSource_medium,trafficSource_referralPath,trafficSource_source,revenue_generated,revenue_amount,date_time,date_dow,date_hours,date_dom,month
537962,Direct,2016-08-01,893148638848434176,1,1470118400,1.00,1,1.00,1.00,Chrome,...,@,@,(direct),0,0.00,2016-08-02 06:13:20,1,6,2,8
537786,Direct,2016-08-01,5751009969575920290,1,1470106674,1.00,1,1.00,1.00,Chrome,...,@,@,(direct),0,0.00,2016-08-02 02:57:54,1,2,2,8
537787,Direct,2016-08-01,9906757708231207408,1,1470103444,1.00,1,1.00,1.00,Safari,...,@,@,(direct),0,0.00,2016-08-02 02:04:04,1,2,2,8
537788,Direct,2016-08-01,7394165545362887055,3,1470044425,1.00,1,0.00,1.00,Chrome,...,@,@,(direct),0,0.00,2016-08-01 09:40:25,0,9,1,8
537789,Referral,2016-08-01,6107229716178617930,1,1470094529,1.00,1,1.00,1.00,Chrome,...,referral,@,mall.googleplex.com,0,0.00,2016-08-01 23:35:29,0,23,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64601,Direct,2017-08-01,0370352037468604196,2,1501581346,1.00,1,0.00,1.00,Chrome,...,@,@,(direct),0,0.00,2017-08-01 09:55:46,1,9,1,8
64600,Referral,2017-08-01,5321699176786856383,2,1501648138,1.00,1,0.00,1.00,Chrome,...,referral,/offer/2145,gdeals.googleplex.com,0,0.00,2017-08-02 04:28:58,2,4,2,8
64599,Affiliates,2017-08-01,6202772851799466410,1,1501602677,1.00,1,1.00,1.00,Chrome,...,affiliate,@,Partners,0,0.00,2017-08-01 15:51:17,1,15,1,8
64606,Referral,2017-08-01,7827431027330439153,1,1501624621,1.00,1,1.00,1.00,Chrome,...,referral,@,mall.googleplex.com,0,0.00,2017-08-01 21:57:01,1,21,1,8


In [21]:
# Random 70/30 split of the visits. A fixed random_state makes the split (and
# everything derived from it) reproducible across kernel restarts.
# NOTE(review): rows are split at random, so visits of the same fullVisitorId
# can land in both train and test -- confirm this is acceptable.
train, test = train_test_split(data, test_size=0.3, random_state=42)

In [22]:
train.shape

(631928, 37)

In [23]:
test.shape

(270827, 37)

In [14]:
def calc_days_since_visit(row):
    """Return the number of days between a visit and the preceding row's visit.

    Parameters
    ----------
    row : mapping
        Must provide 'fullVisitorId', 'last_visit_id', 'date' and
        'last_visit_date'.

    Returns
    -------
    int or float
        ``(date - last_visit_date).days`` when the preceding row belongs to the
        same visitor, otherwise ``np.nan``.
    """
    same_visitor = row['fullVisitorId'] == row['last_visit_id']
    return (row['date'] - row['last_visit_date']).days if same_visitor else np.nan

In [15]:
def get_delays(df):
    """Compute, for every visit, the days elapsed since the visitor's previous visit.

    Visits are ordered by ('fullVisitorId', 'date') and the date difference to
    the preceding visit of the same visitor is taken. This is done with a
    vectorised ``groupby().diff()`` instead of a row-wise ``apply``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'fullVisitorId' and a datetime-typed 'date' column.
        ``df`` itself is not modified.

    Returns
    -------
    pandas.Series
        Named 'days_since_visit', sorted by ``df``'s index. Holds the gap in
        whole days, or NaN for a visitor's first visit within ``df``.

    Notes
    -----
    Only visits present in ``df`` are considered, so calling this on a subset
    (e.g. one side of a train/test split) ignores visits held in the other
    subset.
    """
    ordered = df[['fullVisitorId', 'date']].sort_values(['fullVisitorId', 'date'])

    # diff() within each visitor yields NaT for the visitor's first visit, which
    # .dt.days turns into NaN.
    gaps = ordered.groupby('fullVisitorId', sort=False)['date'].diff()
    days_since_visit = gaps.dt.days.rename('days_since_visit')

    # Return the rows in index order rather than the sorted working order.
    return days_since_visit.sort_index()

In [17]:
# Attach the per-visitor recency feature to each split. Note that the loop in
# the following cell computes the same column again.
train['days_since_visit'], test['days_since_visit'] = get_delays(train), get_delays(test)

In [25]:
# Recency feature: days since the same visitor's previous visit, computed
# independently within each split.
for df in (train, test):
    df['days_since_visit'] = get_delays(df)

In [131]:
# train_df_reduced = train_df_filter[['channelGrouping',
#                  #'fullVisitorId', 
#                  'visitNumber', 
#                  'totals_newVisits', 
#                  'totals_pageviews', 
#                  'device_browser', 
#                  'geoNetwork_subContinent', 
#                  #'trafficSource_adContent', 
#                  #'trafficSource_adwordsClickInfo.adNetworkType',
#                  #'trafficSource_adwordsClickInfo.page',
#                  #'trafficSource_adwordsClickInfo.slot', 
#                  #'trafficSource_campaign', 
#                  'trafficSource_isTrueDirect', 
#                  #'trafficSource_keyword', 
#                  #'trafficSource_medium', # is perfectly correlated with channelgrouping
#                  #'trafficSource_referralPath', 
#                  'trafficSource_source', 
#                  'revenue_generated', 
#                  'revenue_amount', 
#                  'days_since_visit']]#.set_index('fullVisitorId') ## does that not duplicate indexes?

In [27]:
# https://www.kaggle.com/prashantkikani/teach-lightgbm-to-sum-predictions-fe
def browser_mapping(x):
    """Collapse a raw browser name into a small set of browser buckets.

    Matching is case-sensitive against lowercase keywords, so callers should
    pass an already lowercased string.

    Returns the name itself for the well-known browsers, 'mobile browser' when
    any of the listed keywords occurs in ``x``, ``x`` unchanged when it contains
    '(not set)', and 'others' for everything else.
    """
    known_browsers = (
        'chrome', 'safari', 'firefox', 'internet explorer', 'edge',
        'opera', 'coc coc', 'maxthon', 'iron',
    )
    if x in known_browsers:
        return x.lower()

    # Any of these substrings puts the value into the 'mobile browser' bucket.
    mobile_keywords = (
        'android', 'samsung', 'mini', 'iphone', 'in-app', 'playstation',
        'mozilla', 'chrome', 'blackberry', 'nokia', 'browser', 'amazon',
        'lunascape', 'netscape', 'konqueror', 'puffin',
    )
    if any(keyword in x for keyword in mobile_keywords):
        return 'mobile browser'

    if '(not set)' in x:
        return x
    return 'others'
    
    
def adcontents_mapping(x):
    """Collapse a raw ad-content string into a coarse bucket.

    Matching is case-sensitive against lowercase keywords, so callers should
    pass an already lowercased string. Rules are checked in order and the
    first match wins:

    * contains 'google'                       -> 'google'
    * contains 'placement' (or the misspelt
      'placememnt')                           -> 'placement'
    * contains '(not set)' or equals 'nan'    -> ``x`` unchanged
    * contains 'ad'                           -> 'ad'
    * anything else                           -> 'others'

    The missing-value sentinel 'nan' (what ``str(float('nan'))`` produces) is
    matched exactly. A substring test would let any text that merely contains
    the letters "nan" bypass the bucketing.
    """
    if 'google' in x:
        return 'google'
    if 'placement' in x or 'placememnt' in x:
        return 'placement'
    if '(not set)' in x or x == 'nan':
        return x
    if 'ad' in x:
        return 'ad'
    return 'others'
    
def source_mapping(x):
    """Collapse a raw traffic-source string into a coarse source bucket.

    Matching is case-sensitive against lowercase keywords, so callers should
    pass an already lowercased string. Rules are checked in order and the
    first match wins:

    1. contains 'google' / 'youtube'        -> that keyword
    2. contains '(not set)' or equals 'nan' -> ``x`` unchanged
    3. contains one of the ordered keywords -> that keyword
    4. anything else                        -> 'others'

    The missing-value sentinel 'nan' (what ``str(float('nan'))`` produces) is
    matched exactly. A substring test would return sources such as
    'finance.yahoo.com' unbucketed, merely because they contain "nan".
    """
    for keyword in ('google', 'youtube'):
        if keyword in x:
            return keyword

    if '(not set)' in x or x == 'nan':
        return x

    # Order matters: keywords are substring tests and some are contained in
    # others (e.g. 'lunametrics' before 'metrics', 'mysearch' before 'search'),
    # while short ones such as 'ask', 'ad' or 'edu' match broadly.
    ordered_keywords = (
        'yahoo', 'facebook', 'reddit', 'bing', 'quora', 'outlook', 'linkedin',
        'pinterest', 'ask', 'siliconvalley', 'lunametrics', 'amazon',
        'mysearch', 'qiita', 'messenger', 'twitter', 't.co', 'vk.com',
        'search', 'edu', 'mail', 'ad', 'golang', 'direct', 'dealspotr',
        'sashihara', 'phandroid', 'baidu', 'mdn', 'duckduckgo', 'seroundtable',
        'metrics', 'sogou', 'businessinsider', 'github', 'gophergala',
        'yandex', 'msn', 'dfa', 'feedly', 'arstechnica', 'squishable',
        'flipboard', 't-online.de', 'sm.cn', 'wow', 'partners',
    )
    for keyword in ordered_keywords:
        if keyword in x:
            return keyword

    return 'others'
    

In [29]:
def process_device(df):
    """Add four pairwise interaction columns to ``df`` in place and return it.

    Each new column is the string concatenation ``left + '_' + right`` of two
    existing columns:

    * source_country   = trafficSource_source   + geoNetwork_country
    * campaign_medium  = trafficSource_campaign + trafficSource_medium
    * browser_category = device_browser         + device_deviceCategory
    * browser_os       = device_browser         + device_operatingSystem
    """
    print("process device ...")

    # (new column, left operand, right operand), in creation order.
    interactions = (
        ('source_country', 'trafficSource_source', 'geoNetwork_country'),
        ('campaign_medium', 'trafficSource_campaign', 'trafficSource_medium'),
        ('browser_category', 'device_browser', 'device_deviceCategory'),
        ('browser_os', 'device_browser', 'device_operatingSystem'),
    )
    for new_col, left, right in interactions:
        df[new_col] = df[left] + '_' + df[right]
    return df


def custom(df):
    """Add further string interaction columns to ``df`` in place and return it.

    Creates, in this order:

    * three columns crossing device category / browser / OS with
      'channelGrouping';
    * one column per (geoNetwork_* column, device/source column) pair, named
      ``<geo>_<other>``;
    * 'content_source' and 'medium_source', which cross ad content and medium
      with 'source_country'.

    Requires a 'source_country' column to be present already.
    """
    print('custom ...')

    channel = df['channelGrouping']
    channel_crosses = (
        ('device_deviceCategory_channelGrouping', 'device_deviceCategory'),
        ('channelGrouping_browser', 'device_browser'),
        ('channelGrouping_OS', 'device_operatingSystem'),
    )
    for new_col, left in channel_crosses:
        df[new_col] = df[left] + "_" + channel

    geo_cols = (
        'geoNetwork_city', 'geoNetwork_continent', 'geoNetwork_country',
        'geoNetwork_metro', 'geoNetwork_networkDomain', 'geoNetwork_region',
        'geoNetwork_subContinent',
    )
    other_cols = (
        'device_browser', 'device_deviceCategory',
        'device_operatingSystem', 'trafficSource_source',
    )
    # Geo column varies slowest, matching the order in which columns are added.
    pairs = [(geo, other) for geo in geo_cols for other in other_cols]
    for geo, other in pairs:
        df[geo + "_" + other] = df[geo] + "_" + df[other]

    for new_col, left in (('content_source', 'trafficSource_adContent'),
                          ('medium_source', 'trafficSource_medium')):
        df[new_col] = df[left] + "_" + df['source_country']
    return df

In [30]:
# Bucket the raw text columns (lowercased first), then add the interaction
# features to each split.
text_mappers = (
    ('device_browser', browser_mapping),
    ('trafficSource_adContent', adcontents_mapping),
    ('trafficSource_source', source_mapping),
)

for df in [train, test]:
    for column, mapper in text_mappers:
        df[column] = df[column].map(lambda x: mapper(str(x).lower())).astype('str')

    # Both helpers modify `df` in place and return the same object.
    df = process_device(df)
    df = custom(df)

process device ...
custom ...
process device ...
custom ...


In [36]:
# Identifier / raw-time columns that must not be treated as categorical features.
excluded_features = ['date', 'fullVisitorId', 'visitStartTime']

# Every remaining object-dtype column of the training split, in column order.
categorical_features = [
    column for column, dtype in train.dtypes.items()
    if column not in excluded_features and dtype == 'object'
]

To do: integer-encode the categorical features with `pd.factorize()`.

In [37]:
categorical_features

['channelGrouping',
 'device_browser',
 'device_deviceCategory',
 'device_operatingSystem',
 'geoNetwork_city',
 'geoNetwork_continent',
 'geoNetwork_country',
 'geoNetwork_metro',
 'geoNetwork_networkDomain',
 'geoNetwork_region',
 'geoNetwork_subContinent',
 'trafficSource_adContent',
 'trafficSource_adwordsClickInfo.adNetworkType',
 'trafficSource_adwordsClickInfo.slot',
 'trafficSource_campaign',
 'trafficSource_keyword',
 'trafficSource_medium',
 'trafficSource_referralPath',
 'trafficSource_source',
 'source_country',
 'campaign_medium',
 'browser_category',
 'browser_os',
 'device_deviceCategory_channelGrouping',
 'channelGrouping_browser',
 'channelGrouping_OS',
 'geoNetwork_city_device_browser',
 'geoNetwork_city_device_deviceCategory',
 'geoNetwork_city_device_operatingSystem',
 'geoNetwork_city_trafficSource_source',
 'geoNetwork_continent_device_browser',
 'geoNetwork_continent_device_deviceCategory',
 'geoNetwork_continent_device_operatingSystem',
 'geoNetwork_continent_tr

In [38]:
# Integer-encode each categorical feature. The code book is learned on the
# training split only; get_indexer() returns -1 for test values that do not
# occur in it.
for f in categorical_features:
    codes, indexer = pd.factorize(train[f])
    train[f] = codes
    test[f] = indexer.get_indexer(test[f])

In [49]:
[c for c in train.columns if c not in categorical_features]

['date',
 'fullVisitorId',
 'visitNumber',
 'visitStartTime',
 'totals_bounces',
 'totals_hits',
 'totals_newVisits',
 'totals_pageviews',
 'device_isMobile',
 'trafficSource_adwordsClickInfo.page',
 'trafficSource_isTrueDirect',
 'revenue_generated',
 'revenue_amount',
 'date_time',
 'date_dow',
 'date_hours',
 'date_dom',
 'month',
 'days_since_visit']

In [54]:
# Columns dropped from the model inputs: the two targets plus the identifier
# and raw date/time columns. This rebinds the `excluded_features` name used
# earlier for the categorical-feature selection.
excluded_features = [
    'revenue_generated',
    'revenue_amount',
    'fullVisitorId',
    'visitStartTime',
    'date_time',
    'date',
]

In [50]:
# Binary target and feature matrix for the training split.
y_train = train['revenue_generated']
X_train = train.drop(columns=excluded_features)

In [52]:
# Binary target and feature matrix for the held-out split.
y_test = test['revenue_generated']
X_test = test.drop(columns=excluded_features)
#train.columns not in categorical_features