# Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

In [None]:
import random
import warnings
# Install a blanket "ignore" filter so no warning of any category is shown.
warnings.simplefilter("ignore")

# Load Data and Sub Sample

In [None]:
# Load the complete training CSV from the Kaggle input directory.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/trivagorecsyschallengedata2019/trivagorecsyschallengedata2019_v2/train.csv',
)

In [None]:
# Draw a reproducible sample of at most 5000 distinct users and keep all of
# their rows.
random.seed(50)
a = data.user_id.unique()
# Cap the sample size: random.sample raises ValueError when asked for more
# elements than the population holds.
b = random.sample(list(a), min(5000, len(a)))
# .copy() makes sub_sample an independent frame, so any later in-place
# modification of it is unambiguous and does not go through pandas'
# chained-assignment (SettingWithCopy) path.
sub_sample = data[data.user_id.isin(b)].copy()

In [None]:
# Keep the identifying columns of every sampled row that has an impressions value.
train_reference = sub_sample.loc[
    sub_sample['impressions'].notnull(),
    ['user_id', 'session_id', 'reference', 'impressions'],
]

# Feature Generation

In [None]:
# Show the last five rows of the subsample.
sub_sample.tail(n=5)

In [None]:
def phase_one_prep(data):
    """Collect per-clickout session features from an ordered interaction log.

    The rows of ``data`` are scanned once, in order. Consecutive rows that
    share a ``session_id`` are treated as one session. For every row whose
    ``action_type`` is ``'clickout item'`` three values are recorded:

    * ``n_action``: dict mapping each all-digit reference (as a string) to the
      number of rows in the session that carry it. All clickouts of a session
      share the same dict, and it covers the whole session.
      NOTE(review): the count therefore includes the clickout rows themselves
      and anything that happens after a clickout; confirm that this is the
      intended feature and not target leakage.
    * ``last_interact``: the item id (as ``int``) of the most recent item
      interaction / item search since the previous clickout of the session,
      or ``0`` when there was none or its reference is not all digits.
    * ``dif_last_interact_time``: ``timestamp`` of the clickout minus the
      ``timestamp`` of that interaction, or ``0`` when there was none.

    Args:
        data: DataFrame with at least the columns ``user_id``, ``session_id``,
            ``timestamp``, ``action_type``, ``reference``, ``impressions`` and
            ``prices``. It is not modified.

    Returns:
        A new DataFrame containing only the clickout rows of ``data`` (original
        index and row order kept) with the columns ``user_id``,
        ``session_id``, ``reference``, ``impressions``, ``prices``,
        ``n_action``, ``last_interact`` and ``dif_last_interact_time``.
        An empty input yields an empty result.
    """
    interaction_types = {
        'interaction item rating',
        'search for item',
        'interaction item deals',
        'interaction item info',
        'interaction item image',
    }
    base_columns = ['user_id', 'session_id', 'reference', 'impressions', 'prices']

    # One entry per clickout row, in row order. Collecting the values
    # positionally (instead of writing into the frame by index label) keeps
    # the input untouched and also works when the index is not unique.
    n_actions = []
    last_interacts = []
    time_difs = []

    current_session = object()  # sentinel that never equals a real session id
    session_counts = {}
    last_interact = 0
    last_interact_time = None

    for session_id, action_type, reference, timestamp in zip(
            data['session_id'], data['action_type'],
            data['reference'], data['timestamp']):
        if session_id != current_session:
            # A new session starts: begin a fresh counter dict. Clickouts of
            # the previous session keep their reference to the old dict.
            current_session = session_id
            session_counts = {}
            last_interact = 0
            last_interact_time = None

        # str() makes the check safe for NaN or numeric references.
        ref_key = str(reference)
        is_item = ref_key.isdigit()

        # Number of actions on each item in the current session.
        if is_item:
            session_counts[ref_key] = session_counts.get(ref_key, 0) + 1

        if action_type == 'clickout item':
            n_actions.append(session_counts)
            last_interacts.append(last_interact)
            if last_interact_time is None:
                time_difs.append(0)
            else:
                time_difs.append(timestamp - last_interact_time)
            # Only interactions after this clickout count for the next one.
            last_interact = 0
            last_interact_time = None
        elif action_type in interaction_types:
            # Store the id as an int explicitly instead of relying on pandas
            # to coerce a digit string written into an integer column.
            last_interact = int(ref_key) if is_item else 0
            last_interact_time = timestamp

    result = data.loc[data['action_type'] == 'clickout item', base_columns].copy()
    result['n_action'] = n_actions
    result['last_interact'] = last_interacts
    result['dif_last_interact_time'] = time_difs
    return result

# Run the first preparation phase on the subsample, then report the size of
# the result and preview its first five rows.
features = phase_one_prep(data=sub_sample)
print(features.shape)
features.head(n=5)

In [None]:
def _price_rank(price_values, position):
    """Return 1 plus the number of items listed above ``position`` that cost no more."""
    own_price = price_values[position]
    return 1 + sum(1 for earlier in price_values[:position] if earlier <= own_price)


def _same_item(impression_id, last_interact):
    """Tell whether an impression id string denotes the stored last-interacted item.

    ``last_interact`` may be stored as a number or as a string, so both a
    string match and a numeric match are accepted.
    """
    if impression_id == str(last_interact):
        return True
    return impression_id.isdigit() and int(impression_id) == last_interact


def phase_two_prep(features):
    """Expand every clickout row into one feature row per impressed item.

    For each row of ``features`` the ``|``-separated ``impressions`` and
    ``prices`` strings are split, and one output row is produced per
    impression, in display order.

    Output columns:

    * ``is_top``: 1 for the first impression, else 0.
    * ``position``: 1-based position in the impression list.
    * ``price_rank``: 1 plus the number of items displayed above this one
      whose price is not higher. Prices are compared numerically.
      NOTE(review): only items above the current one are considered; confirm
      whether a rank over the full list was intended.
    * ``n_action``: value stored for this item in the row's ``n_action`` dict,
      0 if absent.
    * ``dif_last_target_price``: ``price_rank`` of this item minus the
      ``price_rank`` of the last-interacted item, or 0 when that item is not
      among the impressions.
    * ``last_interact_position``: 1-based position of the last-interacted
      item, 0 when it is not among the impressions.
    * ``dif_last_target_position``: 0-based position of this item minus the
      0-based position of the last-interacted item (-1 when not found).
    * ``dif_last_interact_time``, ``user_id``, ``session_id``: copied from the
      input row.
    * ``price``: the price string of this item, unchanged.
    * ``item_id``: the impression id string.
    * ``label``: 1 when the row's ``reference`` equals the impression id.

    Progress (percentage of input rows processed) is printed every 100 rows.

    Args:
        features: DataFrame with the columns ``user_id``, ``session_id``,
            ``reference``, ``impressions``, ``prices``, ``n_action``,
            ``last_interact`` and ``dif_last_interact_time``.

    Returns:
        A new DataFrame with the columns listed above; empty (but with those
        columns) when ``features`` has no rows.
    """
    columns = ['is_top', 'position', 'price_rank', 'n_action', 'dif_last_target_price', 'last_interact_position', 'dif_last_target_position', 'dif_last_interact_time', 'price', 'user_id', 'session_id', 'item_id', 'label']

    # Rows are gathered in a list and turned into a frame once at the end;
    # appending to a DataFrame per row is quadratic and DataFrame.append no
    # longer exists in pandas 2.x.
    rows = []
    for timer, (_, row) in enumerate(features.iterrows()):
        if timer % 100 == 0:
            print(timer/features.shape[0]*100)

        pl = row.prices.split('|')
        iml = row.impressions.split('|')
        # Numeric copies for comparisons; comparing the raw strings would
        # order prices lexicographically ('99' > '100').
        price_values = [float(p) for p in pl]

        # Locate the last-interacted item among the impressions, if any.
        last_interact = row['last_interact']
        last_interact_index = -1
        if last_interact != 0:
            last_interact_index = next(
                (i for i, im in enumerate(iml) if _same_item(im, last_interact)),
                -1,
            )
        # -1 marks "not found"; real ranks start at 1, so they never collide.
        last_rank = -1
        if last_interact_index != -1:
            last_rank = _price_rank(price_values, last_interact_index)

        # A missing or non-dict n_action simply means "no counts".
        action_counts = row['n_action'] if isinstance(row['n_action'], dict) else {}

        for i, im in enumerate(iml):
            rank = _price_rank(price_values, i)
            rows.append({
                'is_top': 1 if i == 0 else 0,
                'position': i + 1,
                'price_rank': rank,
                # number of actions on the target item
                'n_action': action_counts.get(im, 0),
                # difference in price rank between target and last interacted item
                'dif_last_target_price': rank - last_rank if last_rank != -1 else 0,
                # position of the last interacted item
                'last_interact_position': last_interact_index + 1,
                # difference in position between target and last interacted item
                'dif_last_target_position': i - last_interact_index,
                # time between the last interaction and the clickout
                'dif_last_interact_time': row['dif_last_interact_time'],
                'price': pl[i],
                'user_id': row['user_id'],
                'session_id': row['session_id'],
                'item_id': im,
                'label': 1 if row.reference == im else 0,
            })

    return pd.DataFrame(rows, columns=columns)

# Run the second preparation phase on the clickout features and display the result.
new_features = phase_two_prep(features=features)
new_features

In [None]:
# Write both frames to CSV files in the working directory, without the index column.
new_features.to_csv(path_or_buf='new_features.csv', index=False)
train_reference.to_csv(path_or_buf='train_reference.csv', index=False)