In [None]:
import pandas as pd
import numpy as np
import datetime

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
data_path = './data/'
!ls -lthr data/

In [None]:
%%time
nrows = 10000
# nrows = None
train = pd.read_csv(data_path+'train.csv', nrows=nrows)

In [None]:
test = pd.read_csv(data_path+'test.csv', nrows=1000)
test.shape

In [None]:
# test_click_session_ids = test[test.impressions.notnull()].session_id.unique()

In [None]:
# ts = np.random.choice(test_click_session_ids, 1)
# test[test.session_id==ts[0]]

In [None]:
# train_click_ids = train[train.impressions.notnull()].session_id.unique()

In [None]:
# ts = np.random.choice(train_click_ids, 1)
# train[train.session_id==ts[0]]

In [None]:
# train.groupby('session_id').apply(lambda x: x.iloc[-1]['reference']).isna().sum()

In [None]:
# train.groupby('session_id').apply(lambda x: x.iloc[-1]['impressions']).isna().sum()

### clip sessions off to last click

In [None]:
%%time
# Keep every row up to and including the last clickout action
# (some rows have a reference without being a clickout action).
def up_to_last_click(grp):
    """Truncate one session to the rows up to and including its last clickout.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of a single session; must have an ``action_type`` column.

    Returns
    -------
    pandas.DataFrame
        ``grp`` cut right after its last ``'clickout item'`` row, or ``grp``
        unchanged when it contains no clickout.
    """
    # Work on a plain boolean array: np.argwhere / np.nonzero applied directly
    # to a Series fails on some pandas/numpy combinations.
    is_clickout = (grp.action_type == 'clickout item').values
    click_positions = np.flatnonzero(is_clickout)
    if click_positions.size == 0:
        return grp
    return grp.iloc[:click_positions[-1] + 1]
        
train = train.groupby('session_id').apply(up_to_last_click).reset_index(drop=True)

In [None]:
%%time
# Convert the epoch-second timestamps to (naive) UTC datetimes in one vectorized
# call; datetime.utcfromtimestamp is deprecated since Python 3.12 and slow when
# applied row by row.
train['ts'] = pd.to_datetime(train['timestamp'], unit='s')

In [None]:
%%time
# Only sessions with clickouts are considered (for now):
# first flag, per session, whether it qualifies.
def check_clickout(grp):
    """Tell whether a session ends on a usable clickout.

    A session qualifies when it contains a 'clickout item' action, its last
    row is such an action, and that last row has non-null ``impressions``,
    ``reference`` and ``prices``.
    """
    final_row = grp.iloc[-1]
    # the session has at least one clickout
    contains_clickout = (grp['action_type'] == 'clickout item').any()
    # the last row is a clickout with all required fields filled in
    ends_on_clickout = final_row['action_type'] == 'clickout item'
    fields_present = final_row[['impressions', 'reference', 'prices']].notna().all()
    return contains_clickout & (ends_on_clickout & fields_present)
    
# evaluate every session, collect the ids that qualify, then drop all other rows
clicked = train.groupby('session_id').apply(check_clickout)
click_session_ids = clicked.loc[clicked].index
keep_rows = train['session_id'].isin(click_session_ids)
train = train.loc[keep_rows].reset_index(drop=True)

### meta info

In [None]:
%%time
# NOTE: `nrows` (set when loading train) also caps the metadata file, so while it
# is not None only the first `nrows` items carry metadata; every other item gets
# NaN meta columns from the left join inside compute_clicks.
meta = pd.read_csv(data_path+'item_metadata.csv', nrows=nrows)
# lower-case once so the property patterns below can be written in lower case
meta['properties'] = meta['properties'].str.lower()

In [None]:
%%time
# checkout properties of items
def get_property_count(row):
    """Count the '|'-separated entries in one properties string."""
    # n separators delimit n + 1 entries
    return row.count('|') + 1

item_p_ctn = meta['properties'].apply(get_property_count)
item_p_ctn.describe()

In [None]:
%%time
def get_property(row):
    """Return the lower-cased entries of one '|'-separated properties string."""
    lowered = row.lower()
    return lowered.split('|')
meta['ps'] = meta['properties'].apply(get_property)
# number of properties
meta['nprop'] = meta.ps.str.len()
# Star rating: a property starting with "<digit> star".
# - raw string: "\|" and "\d" are regex escapes, not valid Python string escapes
# - "(?:^|\|)" also matches when the star rating is the very first property
# - expand=False makes extract return a Series instead of a one-column DataFrame
meta['star'] = meta.properties.str.extract(r'(?:^|\|)(\d) star', expand=False)
meta['star'] = meta['star'].astype(float)

In [None]:
%%time
# create rating columns: number of matching properties per item
ratings = ['good rating', 'satisfactory rating', 'excellent rating']
for r in ratings:
    # Raw f-string: "\|" is a regex-escaped pipe, not a Python escape sequence.
    # Anchoring on start-or-pipe stops a longer property that merely ends in the
    # rating name from matching, while still catching a rating in first position.
    meta[r.replace(' ', '_')] = meta.properties.str.findall(rf'(?:^|\|){r}').str.len()

In [None]:
# total_ps = []
# for p in meta['ps'].values:
#     total_ps += p

In [None]:
# total_ps

In [None]:
# pd.value_counts(total_ps).iloc[:20].plot(kind='bar')

In [None]:
meta.head()

In [None]:
meta.index[0]

In [None]:
# meta.shape

In [None]:
# Keep only the numeric meta features, indexed by item_id (compute_clicks joins on it).
# NOTE: this overwrites `meta`; re-running the cell without reloading the metadata
# fails because 'item_id' is then the index and no longer a column.
meta = (meta[['item_id', 'nprop', 'star', 'good_rating', 'satisfactory_rating', 'excellent_rating']]
        .set_index('item_id'))

### Create session features
---

In [None]:
%%time
# Check whether any session_id is shared by more than one user_id:
# keep one row per (user_id, session_id) pair, then count repeated session ids.
s = train.drop_duplicates(subset=['user_id', 'session_id'])
s.session_id.duplicated().sum()
# seems like not

In [None]:
# train.current_filters[:1000].unique()

In [None]:
%%time
def get_filters(x):
    """Turn a raw filters value into a list of filter names.

    Strings are split on '|', lists are returned unchanged, and any other
    value (e.g. NaN) yields NaN.
    """
    if isinstance(x, str):
        return x.split('|')
    if isinstance(x, list):
        # already split; lists have no .split()
        return x
    return np.nan
def get_impressions(x):
    """Split a '|'-separated impressions string into a list; NaN for non-strings."""
    return x.split('|') if type(x) is str else np.nan
    
    
# split the '|'-separated filter and impression strings into lists and count their entries
train['filters'] = train['current_filters'].str.split('|')
train['nfilters'] = train['filters'].str.len()
train['imps_list'] = train['impressions'].str.split('|')
# rows without impressions stay NaN; convert the ids to int only where a list exists
nn_mask = train['imps_list'].notnull()
train.loc[nn_mask, 'imps_list'] = train.loc[nn_mask, 'imps_list'].apply(lambda ids: list(map(int, ids)))
train['nimps'] = train['imps_list'].str.len()

In [None]:
train.nimps.describe()

In [None]:
# no switch of devices during session
(train.groupby('session_id')['device'].nunique()!=1).sum()

In [None]:
# count the sessions whose number of distinct cities differs from one
(train.groupby('session_id')['city'].nunique()!=1).sum()

In [None]:
%%time
# custom functions used in the aggregation
def mean_dwell_time(x):
    """Mean gap between consecutive sorted values of ``x``; 0 for a single value."""
    if len(x) != 1:
        gaps = np.diff(np.sort(x))
        return np.mean(gaps)
    return 0
    
def var_dwell_time(x):
    """Variance of the gaps between consecutive sorted values of ``x``; 0 for a single value."""
    if len(x) == 1:
        return 0
    ordered = np.sort(x)
    return np.var(np.diff(ordered))
    
def get_first(x):
    """Return the first element of ``x`` by position."""
    first_value = x.iloc[0]
    return first_value

def get_last(x):
    """Return the last element of ``x`` by position."""
    last_value = x.iloc[-1]
    return last_value

def n_clickouts(x):
    """Count the entries of ``x`` equal to 'clickout item'."""
    is_clickout = x == 'clickout item'
    return is_clickout.sum()

def click_rel_pos_avg(x):
    """Average position of the clickout entries, relative to the length of ``x``.

    Positions are 0-based, so the result lies in [0, 1). NaN is returned when
    ``x`` contains no 'clickout item' entry.
    """
    # Compare on a plain array: np.argwhere applied directly to a Series fails
    # on some pandas/numpy combinations.
    click_positions = np.flatnonzero(np.asarray(x) == 'clickout item')
    if click_positions.size == 0:
        # explicit NaN instead of the empty-mean RuntimeWarning
        return np.nan
    return click_positions.mean() / len(x)

# Per-session aggregations, as {source column: [aggregations to apply]}.
# Entries are either pandas aggregation names or the custom functions defined above.
session_aggs = {'timestamp': [np.ptp, mean_dwell_time, var_dwell_time],
                'step': ['max'],
                'action_type': ['nunique', n_clickouts, click_rel_pos_avg],
                'city': ['nunique', get_first],
                'platform': [get_first],
                'device': [get_first],
                'nfilters': ['mean', 'max', 'min', get_last],
                'nimps': ['max']
               }

In [None]:
session_grp = train.groupby('session_id')

In [None]:
%%time
session_fts = session_grp.agg(session_aggs)
# flatten the (column, aggregation) pairs into "<column>_<aggregation>" names
session_fts.columns = ['_'.join(pair).strip() for pair in session_fts.columns]

In [None]:
# session_fts.columns.values
session_fts.head()

In [None]:
# train[train.session_id=='62991f7c78f27']

### Create clickout features
---

In [None]:
import time

class Timer:
    """Context manager that adds the process time spent in its body to ``profile[task]``.

    Parameters
    ----------
    task : hashable
        Key under which the elapsed time is accumulated.
    profile : dict
        Shared accumulator; a missing ``task`` entry is initialised to 0.

    After the ``with`` block, ``start``, ``end`` and ``interval`` hold the
    ``time.process_time()`` readings and their difference.
    """

    def __init__(self, task, profile):
        self.profile = profile
        self.task = task
        # make sure there is an accumulator to add to
        self.profile.setdefault(self.task, 0)

    def __enter__(self):
        self.start = time.process_time()
        return self

    def __exit__(self, *args):
        self.end = time.process_time()
        elapsed = self.end - self.start
        self.interval = elapsed
        # accumulate, rounded to 5 decimals
        self.profile[self.task] = self.profile[self.task] + np.round(elapsed, decimals=5)

In [None]:
session_grp = train.groupby('session_id')

In [None]:
# train[train.session_id=='62991f7c78f27']

In [None]:
def get_profile(profile):
    """Tabulate a ``{task: seconds}`` dict.

    Returns a DataFrame with columns ``task``, ``sec`` and ``per``, where
    ``per`` is each task's share of the total in percent, rounded to 2 decimals.
    """
    timings = pd.Series(profile).rename_axis('task')
    profile_df = timings.reset_index(name='sec')
    total_sec = profile_df['sec'].sum()
    share = profile_df['sec'] / total_sec * 100
    profile_df['per'] = share.round(2)
    return profile_df

In [None]:
%%time
# 1) has the items appeared in previous impressions, and clickout
# session_grp
profile = {}
def compute_clicks(grp):
    """Build one feature row per impression shown in a session's last row.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows of one session. The last row must carry ``imps_list`` (list of
        int item ids), ``prices`` ('|'-separated string) and ``reference``;
        earlier rows are scanned for previously shown impressions.

    Returns
    -------
    pandas.DataFrame
        One row per impression with ``item_id``, whether it appeared earlier
        in the session, its position, price features, the metadata joined
        from ``meta`` plus their list-wide means, and ``target`` (1 for the
        item equal to the last row's reference, else 0).

    Each step is timed into the module-level ``profile`` dict through ``Timer``.
    """
    with Timer('select_rows', profile):
        last_row = grp.iloc[-1]
        above = grp.iloc[:-1]
    with Timer('exclude_nans', profile):
        # earlier rows that carried an impression list
        prev = above[above['impressions'].notnull()]
    with Timer('get_imp_list', profile):
        # a set gives constant-time membership tests (the flat list made the
        # lookup below quadratic)
        seen_imps = {item for imps in prev['imps_list'] for item in imps}

    with Timer('get_price', profile):
        imp_l = last_row['imps_list']
        prices = [int(p) for p in last_row['prices'].split('|')]
        # whether the impression appeared before
        appeared = [int(i in seen_imps) for i in imp_l]
        # the location of the impression
        locs = list(range(len(imp_l)))

    with Timer('create_df', profile):
        # build the df
        result = pd.DataFrame({'appeared': appeared, 'location': locs, 'price': prices}, index=imp_l)
        result.index.name = 'item_id'
    with Timer('rel_price_rank', profile):
        # A single argsort answers "which item is the i-th cheapest"; applying
        # it twice yields each item's own 1-based price rank. The stable sort
        # breaks price ties by list position.
        order = np.argsort(result['price'].values, kind='stable')
        price_rank = np.argsort(order, kind='stable') + 1
        result['rel_price_rank'] = price_rank/len(imp_l)

    with Timer('compute_mean_median', profile):
        result['price_mean'] = np.mean(result['price'])
        result['price_median'] = np.median(result['price'])

    with Timer('compute_diff', profile):
        result_price = result['price'].values
        result_price_mean = result['price_mean'].values
        result_price_median = result['price_median'].values

        result['diff_mean'] = result_price - result_price_mean
        result['diff_median'] = result_price - result_price_median
        result['diff_mean_rel'] = (result_price - result_price_mean)/result_price
        result['diff_median_rel'] = (result_price - result_price_median)/result_price

    with Timer('join', profile):
        # fetch the meta data (left join: items missing from meta get NaN)
        result = result.join(meta, on='item_id')
    # NOTE(review): np.mean is NaN as soon as one impression has a NaN value;
    # confirm whether np.nanmean was intended for the four means below.
    with Timer('create_mean_meta1', profile):
        result['star_mean'] = np.mean(result['star'].values)
    with Timer('create_mean_meta2', profile):
        result['gr_mean'] = np.mean(result['good_rating'].values)
    with Timer('create_mean_meta3', profile):
        result['sr_mean'] = np.mean(result['satisfactory_rating'].values)
    with Timer('create_mean_meta4', profile):
        result['er_mean'] = np.mean(result['excellent_rating'].values)
    with Timer('create_mean_meta5', profile):
        # despite the timer label, this step only moves item_id back into a column
        result.reset_index(inplace=True)

    with Timer('create target', profile):
        # get target
        ref = int(last_row['reference'])
        result['target'] = (result['item_id'].values == ref).astype(int)
    return result

# profiling run: push at most the first n_runs sessions through compute_clicks
n_runs = 5000
n = 0
for n, (k, v) in enumerate(session_grp, start=1):
    a = compute_clicks(v)
    if n == n_runs:
        break

# 2) dwell time for each clickout
# 3) the rating and star and nproperty -> mean and median for 
#     the other impressions in list (or the quantile of these and prices)
# 4) location and relative location in the impressions list

In [None]:
get_profile(profile)

In [None]:
get_profile(profile)

In [None]:
def get_popularity(df):
    """Get number of clicks that each item received in the df.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have ``action_type`` and ``reference`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per clicked ``reference`` with its ``n_clicks``; both columns
        are cast to int.
    """

    mask = df["action_type"] == "clickout item"
    df_clicks = df[mask]
    # the closing parenthesis of this chain was missing, making the cell a SyntaxError
    df_item_clicks = (df_clicks
                      .groupby("reference")
                      .size()
                      .reset_index(name="n_clicks")
                      .astype(int))

    return df_item_clicks

In [None]:
df = train

In [None]:

# tally the clickouts per referenced item (same steps as get_popularity, minus the int cast)
mask = df["action_type"].eq("clickout item")
df_clicks = df.loc[mask]
clicks_per_item = df_clicks.groupby("reference").size()
df_item_clicks = clicks_per_item.reset_index(name="n_clicks")

In [None]:
df_item_clicks

In [None]:
# NOTE(review): `df` was rebound to `train` a few cells up and no visible cell adds a
# 'per' column to it (get_profile's output has one) — confirm which frame this should time.
%timeit df['per'].mean()

In [None]:
%timeit np.mean(df['per'].values)

In [None]:
%timeit df['per'].values

In [None]:
# pprint is not provided by the notebook's import cell; import it here so the
# cell also runs on a fresh kernel
from pprint import pprint

pprint(profile)

In [None]:
%%time
dd = session_grp.apply(compute_clicks)

In [None]:
%%time
# Prepare the join on session features: move index level 0 (presumably the session_id
# group key added by groupby.apply — confirm) into a column, then make session_id the
# only index. NOTE: overwrites `dd`, so the cell is not meant to be run twice.
dd = dd.reset_index(level=0).set_index('session_id')

In [None]:
%%time
final = dd.join(session_fts)

### Questions:

1) Timestamps are converted to UTC; should they be shifted to the user's local timezone instead? (This matters for features such as the hour of day.)

2) Does the platform change within a session?

to do:

1) Check the time range of train vs. test.