In [98]:
import pandas as pd
import os
import json
from pandas.io.json import json_normalize #package for flattening json in pandas df
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor




In [2]:
def load_df(csv_path='../data/train.csv', nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']

    df = pd.read_csv(csv_path, dtype={'fullVisitorId': 'str'}, nrows=nrows)

    for column in JSON_COLUMNS:
        df = df.join(pd.DataFrame(df.pop(column).apply(pd.io.json.loads).values.tolist(), index=df.index))
    return df

print(os.listdir("../data"))

['test.csv', 'train.csv', 'sample_submission.csv']


In [112]:
train_df = load_df()
test_df = load_df(csv_path = '../data/test.csv')

In [68]:
# list_of_totals = train_df_raw.totals.apply(json.loads)
# # get all the keys for devices column
# total_keys = list()
# for metrics in list_of_totals:
#     total_keys.extend(x for x in metrics.keys() if x not in total_keys)

In [113]:
drop_columns = list()
dict_columns = list()
for item in train_df.columns:
    if str(type(train_df[item][0])) =="<class 'dict'>":
        dict_columns.append(item)
        continue
    if len(train_df[item].unique()) == 1:
        drop_columns.append(item)
        continue

In [114]:
print(len(drop_columns), len(dict_columns), len(train_df.columns))

18 1 50


In [115]:
def add_date_features(df):
    df['date'] = df['date'].astype(str)
    df["date"] = df["date"].apply(lambda x : x[:4] + "-" + x[4:6] + "-" + x[6:])
    df["date"] = pd.to_datetime(df["date"])
    
    df["month"]   = df['date'].dt.month
    df["day"]     = df['date'].dt.day
    df["weekday"] = df['date'].dt.weekday
    return df 

In [116]:
train_df = add_date_features(train_df)
test_df = add_date_features(test_df)


In [117]:
def normalize_numerical_columns(df, isTrain = True):
    df["hits"] = df["hits"].astype(float)
    df["hits"] = (df["hits"] - min(df["hits"])) / (max(df["hits"]) - min(df["hits"]))

    df["pageviews"] = df["pageviews"].astype(float)
    df["pageviews"] = (df["pageviews"] - min(df["pageviews"])) / (max(df["pageviews"]) - min(df["pageviews"]))
    
    if isTrain:
        df["transactionRevenue"] = df["transactionRevenue"].fillna(0.0)
    return df 

In [118]:
train_df = normalize_numerical_columns(train_df)
test_df = normalize_numerical_columns(test_df, isTrain = False)

In [119]:
non_relevant = ["visitNumber", "date", "fullVisitorId", "sessionId", "visitId", "visitStartTime"]
train_df = train_df.drop(non_relevant,axis=1)
train_df = train_df.drop(drop_columns,axis=1)
train_df = train_df.drop(dict_columns,axis=1)
#test_df = test_df.drop(non_relevant,axis=1)
test_df = test_df.drop(drop_columns,axis=1)
test_df = test_df.drop(dict_columns,axis=1)

In [122]:
# https://www.kaggle.com/prashantkikani/teach-lightgbm-to-sum-predictions-fe
def browser_mapping(x):
    browsers = ['chrome','safari','firefox','internet explorer','edge','opera','coc coc','maxthon','iron']
    if x in browsers:
        return x.lower()
    elif  ('android' in x) or ('samsung' in x) or ('mini' in x) or ('iphone' in x) or ('in-app' in x) or ('playstation' in x):
        return 'mobile browser'
    elif  ('mozilla' in x) or ('chrome' in x) or ('blackberry' in x) or ('nokia' in x) or ('browser' in x) or ('amazon' in x):
        return 'mobile browser'
    elif  ('lunascape' in x) or ('netscape' in x) or ('blackberry' in x) or ('konqueror' in x) or ('puffin' in x) or ('amazon' in x):
        return 'mobile browser'
    elif '(not set)' in x:
        return x
    else:
        return 'others'
    
    
def adcontents_mapping(x):
    if  ('google' in x):
        return 'google'
    elif  ('placement' in x) | ('placememnt' in x):
        return 'placement'
    elif '(not set)' in x or 'nan' in x:
        return x
    elif 'ad' in x:
        return 'ad'
    else:
        return 'others'
    
def source_mapping(x):
    if  ('google' in x):
        return 'google'
    elif  ('youtube' in x):
        return 'youtube'
    elif '(not set)' in x or 'nan' in x:
        return x
    elif 'yahoo' in x:
        return 'yahoo'
    elif 'facebook' in x:
        return 'facebook'
    elif 'reddit' in x:
        return 'reddit'
    elif 'bing' in x:
        return 'bing'
    elif 'quora' in x:
        return 'quora'
    elif 'outlook' in x:
        return 'outlook'
    elif 'linkedin' in x:
        return 'linkedin'
    elif 'pinterest' in x:
        return 'pinterest'
    elif 'ask' in x:
        return 'ask'
    elif 'siliconvalley' in x:
        return 'siliconvalley'
    elif 'lunametrics' in x:
        return 'lunametrics'
    elif 'amazon' in x:
        return 'amazon'
    elif 'mysearch' in x:
        return 'mysearch'
    elif 'qiita' in x:
        return 'qiita'
    elif 'messenger' in x:
        return 'messenger'
    elif 'twitter' in x:
        return 'twitter'
    elif 't.co' in x:
        return 't.co'
    elif 'vk.com' in x:
        return 'vk.com'
    elif 'search' in x:
        return 'search'
    elif 'edu' in x:
        return 'edu'
    elif 'mail' in x:
        return 'mail'
    elif 'ad' in x:
        return 'ad'
    elif 'golang' in x:
        return 'golang'
    elif 'direct' in x:
        return 'direct'
    elif 'dealspotr' in x:
        return 'dealspotr'
    elif 'sashihara' in x:
        return 'sashihara'
    elif 'phandroid' in x:
        return 'phandroid'
    elif 'baidu' in x:
        return 'baidu'
    elif 'mdn' in x:
        return 'mdn'
    elif 'duckduckgo' in x:
        return 'duckduckgo'
    elif 'seroundtable' in x:
        return 'seroundtable'
    elif 'metrics' in x:
        return 'metrics'
    elif 'sogou' in x:
        return 'sogou'
    elif 'businessinsider' in x:
        return 'businessinsider'
    elif 'github' in x:
        return 'github'
    elif 'gophergala' in x:
        return 'gophergala'
    elif 'yandex' in x:
        return 'yandex'
    elif 'msn' in x:
        return 'msn'
    elif 'dfa' in x:
        return 'dfa'
    elif '(not set)' in x:
        return '(not set)'
    elif 'feedly' in x:
        return 'feedly'
    elif 'arstechnica' in x:
        return 'arstechnica'
    elif 'squishable' in x:
        return 'squishable'
    elif 'flipboard' in x:
        return 'flipboard'
    elif 't-online.de' in x:
        return 't-online.de'
    elif 'sm.cn' in x:
        return 'sm.cn'
    elif 'wow' in x:
        return 'wow'
    elif 'baidu' in x:
        return 'baidu'
    elif 'partners' in x:
        return 'partners'
    else:
        return 'others'

train_df['browser'] = train_df['browser'].map(lambda x:browser_mapping(str(x).lower())).astype('str')
train_df['adContent'] = train_df['adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')
train_df['source'] = train_df['source'].map(lambda x:source_mapping(str(x).lower())).astype('str')

test_df['browser'] = test_df['browser'].map(lambda x:browser_mapping(str(x).lower())).astype('str')
test_df['adContent'] = test_df['adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')
test_df['source'] = test_df['source'].map(lambda x:source_mapping(str(x).lower())).astype('str')

In [76]:
[item for item in train_df.columns if item not in test_df.columns]

['transactionRevenue', 'campaignCode']

In [123]:
train_df = train_df.drop(['campaignCode'], axis=1)

In [78]:
numerical_lst =['hits', 'pageviews', 'transactionRevenue']
categorical_columns = [c for c in train_df.columns if c not in numerical_lst]
for c in categorical_columns:
    le = LabelEncoder()
    train_vals = list(train_df[c].values.astype(str))
    test_vals = list(test_df[c].values.astype(str))
    
    le.fit(train_vals + test_vals)
    
    train_df[c] = le.transform(train_vals)
    test_df[c] = le.transform(test_vals)
# test_df.newVisits = test_df.newVisits.astype(float).fillna(-1)
# train_df.newVisits = train_df.newVisits.astype(float).fillna(-1)
# test_df.bounces = test_df.bounces.astype(float).fillna(-1)
# train_df.bounces = train_df.bounces.astype(float).fillna(-1)

In [102]:
train_df.isnull().sum()

channelGrouping         0
browser                 0
deviceCategory          0
isMobile                0
operatingSystem         0
city                    0
continent               0
country                 0
metro                   0
networkDomain           0
region                  0
subContinent            0
bounces                 0
hits                    0
newVisits               0
pageviews             100
transactionRevenue      0
adContent               0
campaign                0
isTrueDirect            0
keyword                 0
medium                  0
referralPath            0
source                  0
month                   0
day                     0
weekday                 0
dtype: int64

In [105]:
train_df["transactionRevenue"] = np.log1p(train_df["transactionRevenue"].astype(float))
train_x, valid_x, train_y, valid_y = train_test_split(train_df.drop(["transactionRevenue"], axis=1), train_df["transactionRevenue"], test_size=0.25, random_state=2018)

In [111]:
lgb_params = {"objective" : "regression", "metric" : "rmse",
              "num_leaves" : 80, "learning_rate" : 0.01, 
              "bagging_fraction" : 0.8, "feature_fraction" : 0.75, "bagging_frequency" : 10}
    
lgb_train = lgb.Dataset(train_x, label=train_y)
lgb_val = lgb.Dataset(valid_x, label=valid_y)
model = lgb.train(lgb_params, lgb_train, 1500, valid_sets=[lgb_val], early_stopping_rounds=250, verbose_eval=20)

Training until validation scores don't improve for 250 rounds.
[20]	valid_0's rmse: 0.314869
[40]	valid_0's rmse: 0.303543
[60]	valid_0's rmse: 0.294854
[80]	valid_0's rmse: 0.289309
[100]	valid_0's rmse: 0.284371
[120]	valid_0's rmse: 0.281038
[140]	valid_0's rmse: 0.278446
[160]	valid_0's rmse: 0.276388
[180]	valid_0's rmse: 0.27485
[200]	valid_0's rmse: 0.273719
[220]	valid_0's rmse: 0.272806
[240]	valid_0's rmse: 0.272179
[260]	valid_0's rmse: 0.271605
[280]	valid_0's rmse: 0.271201
[300]	valid_0's rmse: 0.270846
[320]	valid_0's rmse: 0.270552
[340]	valid_0's rmse: 0.270268
[360]	valid_0's rmse: 0.27005
[380]	valid_0's rmse: 0.269827
[400]	valid_0's rmse: 0.269646
[420]	valid_0's rmse: 0.269512
[440]	valid_0's rmse: 0.269401
[460]	valid_0's rmse: 0.269289
[480]	valid_0's rmse: 0.269184
[500]	valid_0's rmse: 0.26911
[520]	valid_0's rmse: 0.269038
[540]	valid_0's rmse: 0.268968
[560]	valid_0's rmse: 0.268906
[580]	valid_0's rmse: 0.268863
[600]	valid_0's rmse: 0.26879
[620]	valid_0's

In [104]:
# clf = RandomForestRegressor()
# clf.fit(train_x, train_y)
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [107]:
preds = model.predict(test_df[train_df.drop(["transactionRevenue"],axis =1).columns], num_iteration=model.best_iteration)
test_df["PredictedLogRevenue"] = np.expm1(preds)
submission = test_df.groupby("fullVisitorId").agg({"PredictedLogRevenue" : "sum"}).reset_index()
submission["PredictedLogRevenue"] = np.log1p(submission["PredictedLogRevenue"])
submission["PredictedLogRevenue"] =  submission["PredictedLogRevenue"].apply(lambda x : 0.0 if x <= 0 else x)
submission = submission.fillna(0)
submission.to_csv("baseline_3.csv", index=False)
submission.head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,259678714014,0.036775
1,49363351866189,0.0
2,53049821714864,0.0
3,59488412965267,0.0
4,85840370633780,0.005649


In [43]:
train_df.columns

Index(['channelGrouping', 'browser', 'deviceCategory', 'isMobile',
       'operatingSystem', 'city', 'continent', 'country', 'metro',
       'networkDomain', 'region', 'subContinent', 'bounces', 'hits',
       'newVisits', 'pageviews', 'transactionRevenue', 'adContent', 'campaign',
       'isTrueDirect', 'keyword', 'medium', 'referralPath', 'source', 'month',
       'day', 'weekday'],
      dtype='object')

In [88]:
submission.iloc[140007]

fullVisitorId          2269590319988525616
PredictedLogRevenue                    NaN
Name: 140007, dtype: object

In [89]:
test_df.iloc[14007]

channelGrouping                                    4
date                             2018-01-17 00:00:00
fullVisitorId                     119947106824475886
sessionId              119947106824475886_1516213080
visitId                                   1516213080
visitNumber                                        1
visitStartTime                            1516213080
browser                                           72
deviceCategory                                     1
isMobile                                           1
operatingSystem                                   23
city                                             955
continent                                          2
country                                          218
metro                                            122
networkDomain                                      0
region                                           482
subContinent                                      12
bounces                                       

In [90]:
np.log1p(-0.00410628)

-0.0041147339184265854

In [92]:
submission = submission.fillna(0)

In [93]:
submission.to_csv("baseline.csv", index=False)
submission.head()

Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,259678714014,0.163213
1,49363351866189,0.0
2,53049821714864,0.0
3,59488412965267,0.0
4,85840370633780,0.052515
