In [1]:
from utils import load_data, check_gpu, check_dir

import pandas as pd
import numpy as np
from tqdm import tqdm
import gc
from utils import meta_encoding
from functools import partial
import matplotlib.pyplot as plt

import datetime
# Widen pandas' per-cell display limit to 1000 characters for the frames shown below.
pd.options.display.max_colwidth = 1000
%matplotlib inline
%load_ext autoreload
%autoreload 2
def fprint(df, name):
    """Print a one-line shape summary for ``df`` labelled with ``name``.

    The first dimension (``df.shape[0]``) is rendered with thousands
    separators; the second (``df.shape[1]``) is printed unformatted.

    Args:
        df: any object exposing an indexable ``shape`` with at least two entries.
        name: label placed in front of the shape.

    Returns:
        None. The summary is written to stdout.
    """
    n_rows, n_cols = df.shape[0], df.shape[1]
    print('{} shape: ({:,}, {})'.format(name, n_rows, n_cols))
import pprint
# Shared pretty-printer instance (1-space indent) for later use in the notebook.
pp = pprint.PrettyPrinter(indent=1)

### Check clean data

In [9]:
# Read the cached preprocessed train frame from the gbm_cache directory.
# NOTE(review): the path is relative, so this presumably expects the notebook's
# working directory to contain gbm_cache/ — confirm before re-running.
train_cleaned = pd.read_parquet('gbm_cache/preprocess_train_1000000_no_test_added.snappy')

In [3]:
# Load the raw 'train' data through the project helper, requesting 1,000,000 rows.
train_raw = load_data('train', nrows=1000000)

[06-07 16:41:15 - utils-111 - load_data - INFO] Loading train using 1,000,000 rows (999,995 trimmed) which is 6.28% out of total train data


In [4]:
# Distinct session ids present in the raw train frame; used below to sample a session.
train_ids = train_raw.session_id.unique()

In [19]:
# Draw one session id at random so the same session can be compared in the
# raw and cleaned frames.
# NOTE(review): no random seed is set in the cells shown here, so a re-run
# will most likely select a different session than the one recorded below.
rid = np.random.choice(train_ids, 1)[0]
rid

'b8ad97aa9a206'

In [20]:
# raw: first rows (up to 5) of the sampled session as they appear in train_raw
train_raw[train_raw.session_id==rid].head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
819801,699E5SZX34AJ,b8ad97aa9a206,1541200812,1,search for item,111925,AU,"Kuta, Indonesia",desktop,,,
819802,699E5SZX34AJ,b8ad97aa9a206,1541200835,2,clickout item,111925,AU,"Kuta, Indonesia",desktop,,111925|104163|99563|114598|823931|1289618|1353992|1369129|3150422|4569310|1016699|2634040|97907|104122|3866722|1357232|1016731|2318446|99560|96275|1081268|2420384|122633|344371|880943,36|66|114|80|196|157|178|417|181|324|121|152|173|267|57|112|12|104|102|46|134|236|78|128|330
819803,699E5SZX34AJ,b8ad97aa9a206,1541201688,3,interaction item deals,111925,AU,"Kuta, Indonesia",desktop,,,


In [21]:
# clean: first rows (up to 5) of the same session in train_cleaned, for a
# side-by-side comparison with the raw rows
train_cleaned[train_cleaned.session_id==rid].head()

Unnamed: 0,session_id,timestamp,step,action_type,current_filters,reference,impressions,prices,country,device,platform
233752,b8ad97aa9a206,2018-11-02 23:20:12,1.0,2,,111925,,,18,0,12
233753,b8ad97aa9a206,2018-11-02 23:20:35,2.0,0,,111925,111925|104163|99563|114598|823931|1289618|1353992|1369129|3150422|4569310|1016699|2634040|97907|104122|3866722|1357232|1016731|2318446|99560|96275|1081268|2420384|122633|344371|880943,36|66|114|80|196|157|178|417|181|324|121|152|173|267|57|112|12|104|102|46|134|236|78|128|330,18,0,12


In [8]:
# Peek at the first values of action_type in the cleaned frame.
# NOTE(review): this cell's execution count (8) is lower than that of the cell
# that reads train_cleaned (9), so the recorded output may come from an earlier
# version of the frame — re-run to confirm the dtype.
train_cleaned.action_type.head()

0    1.0
1    1.0
2    2.0
3    0.0
4    0.0
Name: action_type, dtype: float64

In [25]:
# check current filters: collect the distinct session ids that have at least
# one row with a non-null current_filters value
cfs_ids = train_cleaned[train_cleaned.current_filters.notna()].session_id.unique()

### Check feature generations

In [32]:
# Display every row of one randomly chosen session from cfs_ids (sessions with
# at least one non-null current_filters row).
# NOTE(review): the draw is unseeded here, so each run shows a different session.
train_cleaned[train_cleaned.session_id==np.random.choice(cfs_ids, 1)[0]]

Unnamed: 0,session_id,timestamp,step,action_type,current_filters,reference,impressions,prices,country,device,platform
300567,ef019b0fd74f0,2018-11-06 14:04:05,1.0,2,,97590,,,94,2,3
300568,ef019b0fd74f0,2018-11-06 14:04:07,2.0,2,,97590,,,94,2,3
300569,ef019b0fd74f0,2018-11-06 14:04:17,8.0,2,,97590,,,94,2,3
300570,ef019b0fd74f0,2018-11-06 14:04:27,17.0,2,,97590,,,94,2,3
300571,ef019b0fd74f0,2018-11-06 14:04:37,29.0,2,,97590,,,94,2,3
300572,ef019b0fd74f0,2018-11-06 14:04:47,31.0,2,,97590,,,94,2,3
300573,ef019b0fd74f0,2018-11-06 14:05:25,33.0,2,,3498974,,,94,2,3
300574,ef019b0fd74f0,2018-11-06 14:05:35,37.0,2,,3498974,,,94,2,3
300575,ef019b0fd74f0,2018-11-06 14:05:45,47.0,2,,3498974,,,94,2,3
300576,ef019b0fd74f0,2018-11-06 14:05:55,56.0,2,,3498974,,,94,2,3
