In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import lightgbm as lgb
from main import sample_customers,process_date_col

# Load the labelled (customerId, productId) training pairs.
df = pd.read_csv("data/labels_training.txt")
print("Loaded training pairs ", df.shape)

# Attach customer attributes to every pair. The left join keeps all training
# pairs; customer columns stay empty where no customer record matches.
customers = pd.read_csv("data/customers.txt")
df = df.merge(customers, on='customerId', how='left')

# Load product attributes.
products = pd.read_csv("data/products.txt")
# Swap the raw dateOnSite column for the columns that process_date_col
# derives from it (year, month, day, ...).
dateOnSite = process_date_col(products['dateOnSite'])
products = pd.concat([products.drop(columns=['dateOnSite']), dateOnSite], axis=1)
# Attach product attributes to every training pair (left join keeps all pairs).
df = df.merge(products, on='productId', how='left')

# Add per-(customerId, productId) view statistics to the training frame.
print("Loading views info ...")
views = pd.read_csv("data/views.txt")
views = views.drop(columns=['imageZoom']) # discard imageZoom since all 0 but 1 value
# Sum the remaining columns over all view records of each (customerId, productId) pair.
aggr_views = views.groupby(['customerId','productId']).sum()
# Left join: pd.merge defaults to an inner join, which would silently drop
# every labelled pair that has no view record. Keep those pairs instead.
df = pd.merge(df, aggr_views, right_index=True, left_on=['customerId', 'productId'], how='left')
# A pair without view records has zero views; restore the aggregated dtypes
# that the NaN placeholders would otherwise turn into float.
view_cols = list(aggr_views.columns)
df[view_cols] = df[view_cols].fillna(0).astype(aggr_views.dtypes.to_dict())
print("Merged with train df")
# add other features
# ...

# Load the pairs to predict and build the same feature columns as for training.
df_test = pd.read_csv("data/labels_predict.txt")
n_test_pairs = len(df_test)
df_test = df_test.merge(customers, on='customerId', how='left')
df_test = df_test.merge(products, on='productId', how='left')
# Left join: the default inner join would drop pairs without view records,
# and the predictions are later written back onto the re-read labels file
# row by row, so every input row must survive in its original order.
df_test = pd.merge(df_test, aggr_views, right_index=True, left_on=['customerId', 'productId'], how='left')
# A pair without view records has zero views; restore the aggregated dtypes.
test_view_cols = list(aggr_views.columns)
df_test[test_view_cols] = df_test[test_view_cols].fillna(0).astype(aggr_views.dtypes.to_dict())
# Fail early (e.g. on duplicate keys in customers/products) rather than at
# the point where predictions are attached.
assert len(df_test) == n_test_pairs, (
    f"feature merges changed the number of test rows: {n_test_pairs} -> {len(df_test)}"
)
print("Merged with test df")

Loaded training pairs  (13481429, 3)


In [3]:
# Encode country as integer category codes. The category labels are kept in
# `countries` so the test frame can reuse the same label -> code mapping.
# `.cat.codes` assigns -1 to missing values.
country_cat = df.country.astype('category')
countries = country_cat.cat.categories
# Plain column assignment replaces the column together with its dtype.
# `df.loc[:, ["country"]] = ...` writes into the existing column in place
# under pandas >= 2.0 and can leave it with object dtype, which LightGBM rejects.
df["country"] = country_cat.cat.codes.astype(int)

# Map the textual / numeric spellings of the boolean flags to 1/0.
# Missing values are left untouched (na_action='ignore').
bool_d = {"True":1, "False":0, '1':1, '0':0, 1:1, 0:0}
df["isFemale"] = df.isFemale.map(bool_d, na_action='ignore')
df["isPremier"] = df.isPremier.map(bool_d, na_action='ignore')

# Same integer encoding for the day-name column derived from dateOnSite.
dayname_cat = df.dateOnSite_dayname.astype('category')
days = dayname_cat.cat.categories
df["dateOnSite_dayname"] = dayname_cat.cat.codes.astype(int)

In [4]:
# Reuse the training-time label -> code mapping for country.
country2id = {name: code for code, name in enumerate(countries)}
# Missing (or unseen) countries get -1, the code that `.cat.codes` assigned to
# missing values in the training frame. Using len(countries) here would feed
# the model a value it never saw during training.
df_test["country"] = df_test.country.map(country2id).fillna(-1).astype(int)

# Same boolean-flag mapping as for the training frame.
df_test["isFemale"] = df_test.isFemale.map(bool_d, na_action='ignore')
df_test["isPremier"] = df_test.isPremier.map(bool_d, na_action='ignore')

# Reuse the training-time mapping for the day name, again with -1 for
# missing / unseen values to match the training encoding.
day2id = {name: code for code, name in enumerate(days)}
df_test["dateOnSite_dayname"] = df_test.dateOnSite_dayname.map(day2id).fillna(-1).astype(int)

In [5]:
# Count the missing values in each column of the encoded test frame.
df_test.isna().sum()

customerId                    0
productId                     0
purchase_probability    3345261
isFemale                   4022
country                       0
yearOfBirth                4022
isPremier                  4022
brand                         0
price                         0
productType                   0
onSale                        0
dateOnSite_year              81
dateOnSite_month             81
dateOnSite_day               81
dateOnSite_dayname            0
viewOnly                      0
changeThumbnail               0
viewCatwalk                   0
view360                       0
sizeGuide                     0
dtype: int64

In [6]:
# Reserve a fraction of the training data as a held-out validation set.
# sample_customers (from main) is given the frame and its customerId column;
# presumably it splits by customer so that a customer's rows land in only one
# of the two splits -- TODO confirm against main.sample_customers.
held_out_frac = 0.1
val_ind, train_ind = sample_customers(df, df.customerId, frac=held_out_frac)


In [7]:
# Keep the target and the customer id aside, then strip the identifier and
# target columns so both frames contain feature columns only.
label = df['purchased']
customerId = df['customerId']
df = df.drop(columns=['purchased', 'customerId', 'productId'])
df_test = df_test.drop(columns=['purchase_probability', 'customerId', 'productId'])

In [8]:
# Show the dtype of each column of the test frame.
df_test.dtypes

isFemale              float64
country                 int64
yearOfBirth           float64
isPremier             float64
brand                   int64
price                 float64
productType             int64
onSale                   bool
dateOnSite_year       float64
dateOnSite_month      float64
dateOnSite_day        float64
dateOnSite_dayname      int64
viewOnly                int64
changeThumbnail         int64
viewCatwalk             int64
view360                 int64
sizeGuide               int64
dtype: object

In [9]:
# LightGBM settings: binary objective, AUC as the evaluation metric.
param = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': 'auc',
}

# Wrap the train / held-out rows (selected by position via iloc) as LightGBM datasets.
train_dset = lgb.Dataset(data=df.iloc[train_ind], label=label.iloc[train_ind])
val_dset = lgb.Dataset(data=df.iloc[val_ind], label=label.iloc[val_ind])


In [10]:
# Train for a fixed number of boosting rounds, evaluating on the held-out set.
num_round = 100
bst = lgb.train(
    params=param,
    train_set=train_dset,
    num_boost_round=num_round,
    valid_sets=[val_dset],
)

[LightGBM] [Info] Number of positive: 245519, number of negative: 11887767
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 937
[LightGBM] [Info] Number of data points in the train set: 12133286, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.020235 -> initscore=-3.879891
[LightGBM] [Info] Start training from score -3.879891
[1]	valid_0's auc: 0.803118
[2]	valid_0's auc: 0.808845
[3]	valid_0's auc: 0.820178
[4]	valid_0's auc: 0.822914
[5]	valid_0's auc: 0.826834
[6]	valid_0's auc: 0.829243
[7]	valid_0's auc: 0.833165
[8]	valid_0's auc: 0.834289
[9]	valid_0's auc: 0.835711
[10]	valid_0's auc: 0.83686
[11]	valid_0's auc: 0.837524
[12]	valid_0's auc: 0.838487
[13]	valid_0's auc: 0.83927
[14]	valid_0's auc: 0.840294
[15]	valid_0's auc: 0.841176
[16]	valid_0's auc: 0.842106
[17]	valid_0's auc: 0.842854
[18]	valid_0's auc: 0.843216
[19]	valid_0's auc: 0.844137


In [11]:
# Score the test pairs. Columns are selected by the booster's own feature
# names so they are fed in training order; predict reads DataFrame columns by
# position, so a reordered or extra column would otherwise go unnoticed.
test_pred = bst.predict(df_test[bst.feature_name()])

In [12]:
# Largest value among the test predictions.
test_pred.max()

0.9999986202112507

In [16]:
# Re-read the prediction pairs and store the model output in the
# purchase_probability column (assigned row by row, in file order).
df_test = pd.read_csv("data/labels_predict.txt").assign(purchase_probability=test_pred)

In [17]:
# Display the prediction pairs together with their purchase_probability.
df_test

Unnamed: 0,customerId,productId,purchase_probability
0,2,4601984,0.103446
1,2,5015355,0.014592
2,2,5022042,0.156199
3,2,5048287,0.040018
4,2,6016479,0.003866
...,...,...,...
3345256,399476,7188787,0.006904
3345257,399476,7215288,0.005959
3345258,399476,7258955,0.006968
3345259,399476,7272924,0.005445
