In [1]:
import pandas as pd
import numpy as np

from utils import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load the training split. load_data is not defined in this notebook (presumably
# provided by `from utils import *`); nrows=None presumably means "no row limit" — TODO confirm.
train = load_data('train', nrows=None)

In [12]:
# Rows that share (user_id, session_id, timestamp) with at least one other row.
# keep=False flags every member of a colliding group, not only the repeats.
dups = train[train.duplicated(subset=['user_id', 'session_id', 'timestamp'], keep=False)]

In [15]:
# Device breakdown restricted to the timestamp-colliding rows.
dups['device'].value_counts()

mobile     5881579
desktop    5657871
tablet     1002090
Name: device, dtype: int64

In [16]:
# Device breakdown over the whole training frame, for comparison with the colliding rows.
train.device.value_counts()

mobile     7643538
desktop    7003938
tablet     1285516
Name: device, dtype: int64

In [13]:
# Eyeball the first 30 colliding rows.
dups.head(30)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
9,00RL8Z82B2Z1,aff3928535f48,1541037542,10,interaction item image,109038,AU,"Sydney, Australia",mobile,,,
10,00RL8Z82B2Z1,aff3928535f48,1541037542,11,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [5]:
# Number of rows that repeat an earlier row on these six columns
# (duplicated() defaults to keep='first', so the first occurrence is not counted).
train[['user_id', 'session_id', 'timestamp', 'city', 'reference', 'device']].duplicated().sum()

9425134

In [9]:
# Load the test split with the same helper used for the training split
# (load_data is not defined in this notebook; presumably from utils).
test = load_data('test')

In [10]:
# Number of test rows identical to an earlier row on every column except 'step'.
test[[col for col in test.columns if col!='step']].duplicated().sum()

1986944

In [11]:
# (rows, columns) of the test frame, to put the duplicate count in proportion.
test.shape

(3782335, 12)

In [6]:
# Number of training rows identical to an earlier row on every column except 'step'.
train[[col for col in train.columns if col!='step']].duplicated().sum()

9249623

In [7]:
# (rows, columns) of the timestamp-colliding subset.
dups.shape

(9976106, 12)

In [8]:
# (rows, columns) of the full training frame.
train.shape

(15932992, 12)

In [None]:
# Share of training rows that collide with another row on (user_id, session_id, timestamp).
# Computed from the frames themselves instead of numbers copied out of earlier cell
# outputs, so the ratio stays correct if the data is reloaded with different rows.
len(dups) / len(train)

In [None]:
# Share of training rows flagged by the six-column duplicated() check:
# 9425134 is that check's printed count and 15932992 is the row count printed by train.shape.
# The numbers are hard-coded; re-derive them if the training data changes.
9425134/15932992

In [None]:
# Group the training rows by session so per-session statistics can be derived.
train_grp = train.groupby('session_id')


In [None]:
# Per-session duration: latest minus earliest timestamp, indexed by session_id.
# Uses the built-in groupby max/min reductions instead of a Python lambda through
# .apply, which would be invoked once per session and is far slower on large frames.
a = train_grp['timestamp'].max() - train_grp['timestamp'].min()

In [None]:
# Cache directory, relative to the notebook's working directory.
# check_dir is not defined in this notebook (presumably from utils); it is assumed to
# make sure the directory exists — TODO confirm its exact behaviour.
filepath = './cache/hotel_2vec'
check_dir(filepath)

In [None]:
# Summary statistics of the per-session timestamp span computed above.
a.describe()

In [None]:
def session_duration(ts):
    """
    Span between the earliest and the latest timestamp of one session.

    A session made of exactly one timestamp has no measurable span and
    yields NaN.
    """
    single_event = len(ts) == 1
    return np.nan if single_event else ts.max() - ts.min()

    
def mean_dwell_time(ts):
    """
    Mean gap between consecutive timestamps of one session.

    Timestamps are sorted first, so the input order does not matter.
    A session made of exactly one timestamp yields NaN.
    """
    if len(ts) != 1:
        gaps = np.diff(np.sort(ts))
        return np.mean(gaps)
    return np.nan

    
def median_dwell_time(ts):
    """
    Median gap between consecutive timestamps of one session.

    Timestamps are sorted first, so the input order does not matter.
    A session made of exactly one timestamp yields NaN.
    """
    if len(ts) != 1:
        gaps = np.diff(np.sort(ts))
        return np.median(gaps)
    return np.nan

    
def var_dwell_time(ts):
    """
    Variance of the gaps between consecutive timestamps of one session.

    Timestamps are sorted first, so the input order does not matter.
    A session made of exactly one timestamp yields NaN.
    """
    if len(ts) != 1:
        gaps = np.diff(np.sort(ts))
        return np.var(gaps)
    return np.nan
    
    
def dwell_time_prior_clickout(ts):
    """
    Gap between the two latest timestamps of one session.

    Timestamps are sorted first, so the input order does not matter.
    A session made of exactly one timestamp yields NaN.
    """
    if len(ts) != 1:
        ordered = np.sort(ts)
        latest, previous = ordered[-1], ordered[-2]
        return latest - previous
    return np.nan
    
    
def dwell_time_prior_clickout_per(ts):
    """
    Share of the session span taken by the gap between the two latest timestamps.

    Timestamps are sorted first, so the input order does not matter.

    Returns NaN when the ratio is undefined:
    - fewer than two timestamps, or
    - every timestamp is identical (zero span), which would otherwise be a
      0/0 division that emits a RuntimeWarning.
    """
    if len(ts) < 2:
        return np.nan
    ordered = np.sort(ts)
    span = ordered[-1] - ordered[0]
    if span == 0:
        # All events share one timestamp: no meaningful ratio, and dividing
        # would trigger numpy's "invalid value" warning.
        return np.nan
    return (ordered[-1] - ordered[-2]) / span

    
def second_last(x):
    """
    Return the second-to-last value of a pandas object (positional lookup).

    An input holding exactly one value yields NaN.
    """
    if len(x) != 1:
        return x.iloc[-2]
    return np.nan
    
    
def n_prev_clickouts(action_type):
    """
    Number of 'clickout item' actions in the session, minus one.

    The subtraction leaves out one clickout, so a session without any
    clickout yields -1.
    """
    is_clickout = action_type.values == 'clickout item'
    total_clickouts = is_clickout.sum()
    return total_clickouts - 1


def avg_clickout_loc(action_type):
    """
    Average relative position of the clickouts within the session.

    Positions are 1-based and normalised by the number of actions in the
    session, so the result lies in (0, 1]. Returns NaN when the session
    contains no 'clickout item' action.
    """
    is_clickout = action_type.values == 'clickout item'
    positions = np.flatnonzero(is_clickout) + 1  # 1-based positions of the clickouts
    if positions.size == 0:
        # No clickout: avoid the mean-of-empty-array warning and be explicit.
        return np.nan
    # Normalise by the length of the argument itself; the previous code referenced
    # an undefined name here and raised NameError.
    return np.mean(positions) / len(action_type)



def compute_session_fts(df):
    """
    Create session features using groupby with agg.

    Two helper columns ('n_imps', 'n_filters') are added to `df` for the
    duration of the call and removed again before returning, including when
    the aggregation raises, so the caller's frame is not left modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns referenced below: 'session_id', 'timestamp',
        'step', 'action_type', 'reference', 'city', 'platform', 'device',
        'impressions' and 'current_filters'.

    Returns
    -------
    pandas.DataFrame
        One row per session_id; columns are named '<source column>_<aggregation>',
        e.g. 'timestamp_session_duration' or 'step_max'.

    Notes
    -----
    `Fprint` is not defined in this notebook; it is presumably provided by
    `from utils import *`.
    """
    # Local import: `gc` is never imported explicitly in the notebook.
    import gc

    # define some aggs
    session_aggs = {'timestamp': [session_duration, mean_dwell_time, var_dwell_time, median_dwell_time,
                                  dwell_time_prior_clickout, dwell_time_prior_clickout_per],
                    'step': ['max'],
                    'action_type': ['nunique', n_prev_clickouts, avg_clickout_loc, second_last],
                    'reference': ['nunique', second_last],
                    'city': ['last'],
                    'platform': ['last'],
                    'device': ['last'],
                    # the entries below operate on the helper columns created in this function
                    'n_imps': ['last'],
                    'n_filters': ['last']}

    helper_cols = ('n_imps', 'n_filters')
    fprint = Fprint().fprint
    try:
        fprint("Generate length of 'impressions' and 'current_filters'")
        # Number of '|'-separated entries in each string column.
        df['n_imps'] = df.impressions.str.split('|').str.len()
        df['n_filters'] = df.current_filters.str.split('|').str.len()

        fprint("Creating session features using agg on groupby from 'session_id'")
        session_grp = df.groupby('session_id')
        session_fts = session_grp.agg(session_aggs)
        fprint('Done creating session fts, cleaning up column names')
        # Flatten the (column, aggregation) column labels into single strings.
        session_fts.columns = ['_'.join(col).strip() for col in session_fts.columns.values]
    finally:
        # Always drop the helper columns, even if a step above failed part-way.
        for col in helper_cols:
            if col in df.columns:
                del df[col]
        gc.collect()
    return session_fts
    
    
    

# def get_first(x):
#     return x.iloc[0]


# def get_last(x):
#     return x.iloc[-1]


# def n_clickouts(x):
#     # 'clickout item': 2
#     return (x.values == 2).sum()


# def click_rel_pos_avg(x):
#     # 'clickout item': 2
#     return np.mean(np.argwhere((x.values == 2)) + 1) / len(x)



