In [14]:
#!pip install tqdm

In [15]:
# Run configuration.
# just_checking_integrity: when True, the loading cells below pass `nrows` to
#   pd.read_csv so only the first rows of each CSV are read; when False the
#   full files are read.
# rows: value passed as `nrows` for the training sessions CSV in that mode.
# test_rows: value passed as `nrows` for the two test sessions CSVs in that mode.
just_checking_integrity=False
rows=3000
test_rows=1000

In [16]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import datetime
from itertools import compress
from math import sin, cos

# Utilities

## Preprocessing

In [17]:
# Module-level array loaded from compressed_features.npy.
# compute_similarity indexes its rows with the item ids of a session.
embedding_weights = np.load(
    '../dataset/processed_data/compressed_features.npy'
    )

import os
import scipy.sparse as sps
def get_ICM(files_directory="../dataset/processed_data"):
    """Build a sparse item/feature indicator matrix from the features CSV.

    Reads 'simplified_features_and_categories_30.csv' from `files_directory`
    and places a 1 at every (item_id, feature_idx) pair listed in the file.
    The matrix shape is inferred by scipy from the largest indices present.

    NOTE(review): scipy sums repeated (row, col) entries, so a pair listed
    more than once would produce a value above 1 -- confirm the CSV holds
    unique pairs.

    Args:
        files_directory: Directory that contains the CSV file.

    Returns:
        A scipy.sparse CSR matrix with items on rows and features on columns.
    """
    csv_path = os.path.join(files_directory, 'simplified_features_and_categories_30.csv')
    pairs = pd.read_csv(csv_path, sep=',', header=0)

    row_indices = pairs['item_id'].to_numpy()
    col_indices = pairs['feature_idx'].to_numpy()
    # One unit entry per listed pair, with the same dtype as the index column.
    unit_entries = np.ones_like(col_indices)

    return sps.csr_matrix((unit_entries, (row_indices, col_indices)))

# Module-level item/feature matrix; compute_similarity reads it as a global.
ICM=get_ICM()

In [18]:
from tqdm.auto import tqdm
# Enable the `progress_apply` method that the feature functions below call on DataFrames.
tqdm.pandas()
def macro_features_generation(input_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Turn a frame of interactions into one row of features per session.

    Steps, in order:
      1. drop reloaded items (`remove_reloaded_items`);
      2. sort by 'date', group by 'session_id' and collect every other column
         into a list per session, then order the sessions by 'session_id';
      3. add date, session-length, special-date and similarity features.

    A progress message is printed after each step.

    Args:
        input_dataframe: frame with 'session_id', 'item_id' and 'date' columns.

    Returns:
        The per-session frame with all feature columns added.
    """
    deduplicated = remove_reloaded_items(input_dataframe)
    print('completed removal')

    # Sorting by date first makes every per-session list chronological.
    sessions = (
        deduplicated
        .sort_values(by='date')
        .groupby(['session_id'])
        .agg(list)
        .reset_index()
        .sort_values(by="session_id")
    )

    feature_stages = (
        (get_date_features, 'completed date features'),
        (get_session_length_features, 'completed session length'),
        (get_special_date_features, 'completed special features'),
        (get_session_similarity, 'completed similarities'),
    )
    for add_features, done_message in feature_stages:
        sessions = add_features(sessions)
        print(done_message)

    return sessions


# Drop an interaction when the next interaction in the same session is the same item and follows less than 30 seconds later (treated as a page reload)
def remove_reloaded_items(input_dataframe: pd.DataFrame, max_gap_seconds: float = 30) -> pd.DataFrame:
    """Drop interactions that are immediately repeated within a short time.

    Within each session (ordered by 'date'), a row is removed when the next
    row of the same session has the same 'item_id' and follows it by less than
    `max_gap_seconds`. The last row of every session is always kept because it
    has no successor.

    Args:
        input_dataframe: frame with 'session_id', 'item_id' and a
            datetime-typed 'date' column. Its index is assumed to be unique.
        max_gap_seconds: time threshold in seconds; defaults to 30.

    Returns:
        The surviving rows of `input_dataframe`, in their original order.
    """
    # Sort once and reuse the result for every derived series.
    ordered = input_dataframe.sort_values(['session_id', 'date'])
    per_session = ordered.groupby(['session_id'])

    next_item = per_session['item_id'].shift(-1)
    next_date = per_session['date'].shift(-1)

    # For the last row of a session the successor is NaN/NaT, so both
    # comparisons evaluate to False and the row is kept.
    same_item_follows = ordered['item_id'].eq(next_item)
    follows_quickly = (next_date - ordered['date']).dt.total_seconds() < max_gap_seconds
    keep_row = ~(same_item_follows & follows_quickly)

    # Align the mask with the caller's row order explicitly instead of relying
    # on pandas' implicit (and warning-emitting) reindex of boolean keys.
    return input_dataframe[keep_row.reindex(input_dataframe.index)]


def remove_items(x):
    """Filter one session's parallel item/date lists for quick repeats.

    An entry is dropped when the following entry has the same item id and
    comes less than 30 seconds later. The final entry is always kept.

    Args:
        x: mapping (or row) with 'item_id' and 'date' sequences of equal
            length; the dates must support subtraction yielding an object
            with `total_seconds()`.

    Returns:
        A `(filtered_timestamps, filtered_items)` tuple of lists.
    """
    items = x['item_id']
    stamps = x['date']

    keep_flags = []
    for position in range(len(items) - 1):
        repeated_quickly = (
            items[position] == items[position + 1]
            and (stamps[position + 1] - stamps[position]).total_seconds() < 30
        )
        keep_flags.append(not repeated_quickly)
    # The last entry has no successor to compare against.
    keep_flags.append(True)

    kept_stamps = list(compress(stamps, keep_flags))
    kept_items = list(compress(items, keep_flags))
    return kept_stamps, kept_items


def get_date_features(input_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add calendar/clock encodings of each session's first and last timestamp.

    The frame is modified in place and also returned. Columns added:
      * 'date_session_starting' / 'date_session_ending': first and last
        element of the per-session 'date' list, as datetimes;
      * sine/cosine encodings of hour (period 24, minutes included),
        day of month (period 30) and month (period 12) for the start
        (no suffix) and for the end ('_ending' suffix);
      * 'date_year_2020': 1 when the session starts in 2020, else 0;
      * 'timedelta' and 'date_normalized': the two lists returned by
        `process_timestamps` for the session.

    Args:
        input_dataframe: per-session frame whose 'date' column holds lists
            of timestamps.

    Returns:
        The same frame with the new columns.
    """
    input_dataframe["date_session_starting"] = pd.to_datetime(input_dataframe['date'].str[0])
    input_dataframe["date_session_ending"] = pd.to_datetime(input_dataframe['date'].str[-1])

    # Start columns carry no suffix, end columns carry '_ending'.
    endpoints = (
        ("date_session_starting", ""),
        ("date_session_ending", "_ending"),
    )
    for source_column, suffix in endpoints:
        moment = input_dataframe[source_column].dt

        hour_angle = (moment.hour + moment.minute / 60) * np.pi / 12
        day_angle = moment.day * np.pi / 15
        month_angle = moment.month * np.pi / 6

        input_dataframe["date_hour_sin" + suffix] = np.sin(hour_angle)
        input_dataframe["date_hour_cos" + suffix] = np.cos(hour_angle)
        input_dataframe["date_day_sin" + suffix] = np.sin(day_angle)
        input_dataframe["date_day_cos" + suffix] = np.cos(day_angle)
        input_dataframe["date_month_sin" + suffix] = np.sin(month_angle)
        input_dataframe["date_month_cos" + suffix] = np.cos(month_angle)

    starts_in_2020 = input_dataframe["date_session_starting"].dt.year == 2020
    input_dataframe["date_year_2020"] = starts_in_2020.astype(int)

    # Row-wise pass: each row's 'date' list becomes two list-valued columns.
    per_session_series = input_dataframe[['date']].progress_apply(
        process_timestamps,
        axis=1,
        result_type="expand"
    )
    input_dataframe[['timedelta', 'date_normalized']] = per_session_series
    return input_dataframe


def process_timestamps(x):
    """Compute per-click time offsets and log-scaled gaps for one session.

    Offsets are whole seconds elapsed since the first timestamp. They are
    derived from the calendar day plus the time of day, so sessions that cross
    midnight or span several days yield increasing, non-negative offsets.
    Sub-second parts are ignored.

    Args:
        x: a row whose first element is the session's chronological list of
            timestamps (e.g. the row Series produced by
            ``df[['date']].apply(..., axis=1)``), or any sequence whose
            element 0 is that list.

    Returns:
        A `(timedelta, times)` tuple:
          * timedelta: one value per consecutive pair of timestamps, followed
            by one extra value derived from the mean gap (or from -1 when
            there is a single timestamp). Each value `t` is mapped to
            `log10(t + 1)` when `t > 0` and to -1 otherwise.
          * times: seconds elapsed since the first timestamp, one per
            timestamp.
    """
    # TODO: insert this in time series features
    # Positional access: `.iloc` for pandas rows, plain indexing otherwise.
    stamps = x.iloc[0] if hasattr(x, "iloc") else x[0]

    # Day ordinal * 86400 keeps the arithmetic in whole seconds while making
    # the value monotonic across day boundaries.
    absolute_seconds = [
        stamp.toordinal() * 86400 + stamp.hour * 3600 + stamp.minute * 60 + stamp.second
        for stamp in stamps
    ]
    times = [seconds - absolute_seconds[0] for seconds in absolute_seconds]

    timedelta = [later - earlier for earlier, later in zip(times, times[1:])]
    timedelta.append(np.mean(timedelta) if len(timedelta) > 0 else -1)
    timedelta = [np.log10(t + 1) if t > 0 else -1 for t in timedelta]
    return (
        timedelta,
        times,
    )


def get_session_length_features(input_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add the per-session duration and item-count statistics.

    Applies `compute_lengths` to the 'date' and 'item_id' lists of every row
    and stores the eight returned values in new columns. The frame is modified
    in place and also returned.

    Args:
        input_dataframe: per-session frame with list-valued 'date' and
            'item_id' columns.

    Returns:
        The same frame with the eight session-length columns added.
    """
    # Column order must match the tuple returned by compute_lengths.
    session_length_feature_names = [
        'length_of_session_seconds',
        'avg_time_spent_per_item_seconds',
        'variance_time_spent_per_item_seconds',
        'longest_seen_item',
        'shortest_seen_item',
        'n_seen_items',
        'n_unique_items',
        'user_went_afk',
    ]

    per_session_stats = input_dataframe[['date', 'item_id']].progress_apply(
        compute_lengths,
        axis=1,
        result_type="expand"
    )
    input_dataframe[session_length_feature_names] = per_session_stats
    return input_dataframe


def compute_lengths(x):
    """Summarise the duration and item counts of one session.

    Args:
        x: mapping (or row) with chronological 'date' and 'item_id' sequences.

    Returns:
        A tuple, in this order:
          * seconds between the first and the last timestamp;
          * that duration divided by the number of gaps (0 for one item);
          * variance of the gaps between consecutive timestamps;
          * item at the position of the largest gap;
          * item at the position of the smallest gap;
          * number of items;
          * number of distinct items;
          * 1 if any gap exceeds 30 minutes, else 0.
        For a single-item session the gap array is `[0]` and both
        longest/shortest items are that item.
    """
    dates = x['date']
    items = x['item_id']

    n_seen_items = len(items)
    n_unique = len(set(items))
    session_length_seconds = (dates[-1] - dates[0]).total_seconds()

    if n_seen_items > 1:
        # gaps[i] is the time between item i and item i + 1.
        gaps = np.array([
            (later - earlier).total_seconds()
            for earlier, later in zip(dates[:-1], dates[1:])
        ])
        avg_time_spent_on_item_seconds = session_length_seconds / (len(dates) - 1)
        longest_seen_item = items[np.argmax(gaps)]
        shortest_seen_item = items[np.argmin(gaps)]
    else:
        gaps = np.array([0])
        avg_time_spent_on_item_seconds = 0
        longest_seen_item = items[0]
        shortest_seen_item = longest_seen_item

    variance_time_spent_on_item_seconds = np.var(gaps)
    # A gap longer than 30 minutes marks the session as "away from keyboard".
    user_went_afk = int((gaps / 60 > 30).any())

    return (
        session_length_seconds,
        avg_time_spent_on_item_seconds,
        variance_time_spent_on_item_seconds,
        longest_seen_item,
        shortest_seen_item,
        n_seen_items,
        n_unique,
        user_went_afk,
    )


# TODO: Implement the special time features
def get_special_date_features(input_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 flags describing when each session started.

    The frame is modified in place and also returned. 'date_session_starting'
    is (re)computed from the first element of the 'date' list, then these
    integer flags are derived from it:
      * 'is_weekend': start falls on day-of-week 5 or 6;
      * 'is_hot_hour': start time strictly between 18:00 and 21:00;
      * 'is_night': start time after 23:00 or before 05:00;
      * 'is_christmas_time': start month is December;
      * 'is_black_friday': start date is between 27 and 30 November.

    Args:
        input_dataframe: per-session frame whose 'date' column holds lists
            of timestamps.

    Returns:
        The same frame with the new columns.
    """
    input_dataframe["date_session_starting"] = pd.to_datetime(input_dataframe['date'].str[0])
    session_start = input_dataframe["date_session_starting"]

    weekday = session_start.dt.day_of_week
    clock = session_start.dt.time
    month = session_start.dt.month
    day_of_month = session_start.dt.day

    input_dataframe["is_weekend"] = weekday.isin([5, 6]).astype(int)

    after_six_pm = clock > datetime.time(hour=18)
    before_nine_pm = clock < datetime.time(hour=21)
    input_dataframe["is_hot_hour"] = (before_nine_pm & after_six_pm).astype(int)

    after_eleven_pm = clock > datetime.time(hour=23)
    before_five_am = clock < datetime.time(hour=5)
    input_dataframe["is_night"] = (after_eleven_pm | before_five_am).astype(int)

    input_dataframe["is_christmas_time"] = month.eq(12).astype(int)

    in_late_november = month.eq(11) & day_of_month.between(27, 30)
    input_dataframe["is_black_friday"] = in_late_november.astype(int)

    return input_dataframe

def get_session_similarity(input_dataframe: pd.DataFrame) -> pd.DataFrame:
    """Add the four similarity scores computed by `compute_similarity`.

    Applies `compute_similarity` to every row and stores the four returned
    values in new columns. The frame is modified in place and also returned.

    Args:
        input_dataframe: per-session frame with list-valued 'date' and
            'item_id' columns.

    Returns:
        The same frame with the four similarity columns added.
    """
    # Column order must match the tuple returned by compute_similarity.
    session_similarity = [
        'session_similarity',
        'session_similarity_uniques',
        'session_similarity_feats',
        'session_similarity_feats_uniques',
    ]

    per_session_scores = input_dataframe[['date', 'item_id']].progress_apply(
        compute_similarity,
        axis=1,
        result_type="expand"
    )
    input_dataframe[session_similarity] = per_session_scores
    return input_dataframe


def compute_similarity(x):
    """Score how alike the items of one session are.

    Each score is obtained by taking one row per item, scaling every row to
    unit L2 norm, and dividing the largest singular value of the resulting
    matrix by its number of rows. It is computed four times: from the
    module-level `embedding_weights` and from the module-level `ICM`, each
    once for all items of the session and once for the distinct items only.

    NOTE(review): a row with zero norm would cause a division by zero here --
    confirm every item has a non-zero embedding and at least one ICM feature.

    Args:
        x: mapping (or row) whose 'item_id' entry is the list of item ids of
            the session.

    Returns:
        `(s, s_uniques, s_feats, s_feats_uniques)`.
    """
    def leading_singular_share(rows):
        # Unit-normalise each row, then take the top singular value per row count.
        unit_rows = rows / np.linalg.norm(rows, axis=1, keepdims=True)
        return np.linalg.svd(unit_rows, compute_uv=False)[0] / len(rows)

    item_ids = x['item_id']
    distinct_ids = np.unique(item_ids)

    # Scores from the dense embedding rows.
    s = leading_singular_share(embedding_weights[item_ids])
    s_uniques = leading_singular_share(embedding_weights[distinct_ids])

    # Scores from the sparse item/feature rows, densified per session.
    s_feats = leading_singular_share(ICM[item_ids].toarray())
    s_feats_uniques = leading_singular_share(ICM[distinct_ids].toarray())

    return (
        s,
        s_uniques,
        s_feats,
        s_feats_uniques,
    )

# Prepare dataset

In [19]:
# Load the mapped candidate items and preview the first rows.
candidate_items = pd.read_csv('../dataset/processed_data/candidate_items_mapped.csv')
candidate_items.head()

Unnamed: 0,item_id
0,1
1,2
2,3
3,4
4,5


In [20]:
# NOTE(review): this repeats the load done earlier in the notebook (same file,
# same variable); it is redundant unless the file changed in between.
# The cell's displayed value is the array's shape.
embedding_weights = np.load(
    '../dataset/processed_data/compressed_features.npy'
    )
embedding_weights.shape

(23692, 64)

In [21]:
# Read the training sessions, parsing 'date' as datetimes. In integrity-check
# mode only the first `rows` rows are read; otherwise nrows=None reads the
# whole file.
train_sessions = pd.read_csv(
    '../dataset/processed_data/train_sessions_mapped.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
    nrows=rows if just_checking_integrity else None,
)

In [22]:
# Build the per-session features for the training sessions and write them to CSV.
result = macro_features_generation(train_sessions)

result.to_csv("../dataset/processed_data/macro_feats_NN.csv",index=False)



completed removal


100%|██████████| 662/662 [00:00<00:00, 17364.89it/s]


completed date features


100%|██████████| 662/662 [00:00<00:00, 5041.24it/s]


completed session length
completed special features


100%|██████████| 662/662 [00:00<00:00, 1313.72it/s]


completed similarities


In [23]:
# Read the leaderboard test sessions, parsing 'date' as datetimes. In
# integrity-check mode only the first `test_rows` rows are read; otherwise
# nrows=None reads the whole file.
test_leaderboard_sessions = pd.read_csv(
    '../dataset/processed_data/test_leaderboard_sessions_mapped.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
    nrows=test_rows if just_checking_integrity else None,
)

# Build the per-session features and write them to CSV.
test_leaderboard_sessions = macro_features_generation(test_leaderboard_sessions)
test_leaderboard_sessions.to_csv("../dataset/processed_data/macro_feats_NN_leaderboard.csv", index=False)

completed removal


100%|██████████| 187/187 [00:00<00:00, 13356.75it/s]


completed date features


100%|██████████| 187/187 [00:00<00:00, 4555.53it/s]


completed session length
completed special features


100%|██████████| 187/187 [00:00<00:00, 1131.64it/s]

completed similarities





In [24]:
# Preview the first rows of the leaderboard feature table.
test_leaderboard_sessions.head()

Unnamed: 0,session_id,item_id,date,date_session_starting,date_session_ending,date_hour_sin,date_hour_cos,date_day_sin,date_day_cos,date_month_sin,...,user_went_afk,is_weekend,is_hot_hour,is_night,is_christmas_time,is_black_friday,session_similarity,session_similarity_uniques,session_similarity_feats,session_similarity_feats_uniques
0,26,[3404],[2021-06-16 09:53:54.158000],2021-06-16 09:53:54.158,2021-06-16 09:53:54.158,0.526214,-0.850352,-0.207912,-0.978148,1.224647e-16,...,0.0,0,0,0,0,0,1.0,1.0,1.0,1.0
1,200,"[3037, 3037, 1468, 887]","[2021-06-25 12:23:40.811000, 2021-06-25 12:24:...",2021-06-25 12:23:40.811,2021-06-25 12:24:50.692,-0.100188,-0.994969,-0.866025,0.5,1.224647e-16,...,0.0,0,0,0,0,0,0.472795,0.546823,0.482228,0.555301
2,205,[1484],[2021-06-11 00:28:07.058000],2021-06-11 00:28:07.058,2021-06-11 00:28:07.058,0.121869,0.992546,0.743145,-0.669131,1.224647e-16,...,0.0,0,0,1,0,0,1.0,1.0,1.0,1.0
3,495,[1250],[2021-06-14 22:13:06.741000],2021-06-14 22:13:06.741,2021-06-14 22:13:06.741,-0.450098,0.892979,0.207912,-0.978148,1.224647e-16,...,0.0,0,0,0,0,0,1.0,1.0,1.0,1.0
4,521,[4673],[2021-06-19 13:50:03.090000],2021-06-19 13:50:03.090,2021-06-19 13:50:03.090,-0.461749,-0.887011,-0.743145,-0.669131,1.224647e-16,...,0.0,1,0,0,0,0,1.0,1.0,1.0,1.0


In [25]:
# Read the final test sessions, parsing 'date' as datetimes. In
# integrity-check mode only the first `test_rows` rows are read; otherwise
# nrows=None reads the whole file.
test_final_sessions = pd.read_csv(
    '../dataset/processed_data/test_final_sessions_mapped.csv',
    parse_dates=['date'],
    infer_datetime_format=True,
    nrows=test_rows if just_checking_integrity else None,
)

# Build the per-session features and write them to CSV.
test_final_sessions = macro_features_generation(test_final_sessions)
test_final_sessions.to_csv("../dataset/processed_data/macro_feats_NN_final.csv", index=False)

completed removal


100%|██████████| 250/250 [00:00<00:00, 14706.33it/s]


completed date features


100%|██████████| 250/250 [00:00<00:00, 5625.53it/s]


completed session length
completed special features


100%|██████████| 250/250 [00:00<00:00, 1269.28it/s]

completed similarities





In [26]:
# Preview the first rows of the final-test feature table.
test_final_sessions.head()

Unnamed: 0,session_id,item_id,date,date_session_starting,date_session_ending,date_hour_sin,date_hour_cos,date_day_sin,date_day_cos,date_month_sin,...,user_went_afk,is_weekend,is_hot_hour,is_night,is_christmas_time,is_black_friday,session_similarity,session_similarity_uniques,session_similarity_feats,session_similarity_feats_uniques
0,61,[4785],[2021-06-01 08:12:39.664000],2021-06-01 08:12:39.664,2021-06-01 08:12:39.664,0.838671,-0.544639,0.207912,0.978148,1.224647e-16,...,0.0,0,0,0,0,0,1.0,1.0,1.0,1.0
1,96,"[2107, 3251, 884, 89, 1252]","[2021-06-19 17:48:05.227000, 2021-06-19 17:49:...",2021-06-19 17:48:05.227,2021-06-19 17:56:21.317,-0.99863,-0.052336,-0.743145,-0.669131,1.224647e-16,...,0.0,1,0,0,0,0,0.341118,0.341118,0.373524,0.373524
2,185,"[3133, 3767, 3133, 929]","[2021-06-07 15:53:21.640000, 2021-06-07 15:53:...",2021-06-07 15:53:21.640,2021-06-07 15:55:18.518,-0.850352,-0.526214,0.994522,0.104528,1.224647e-16,...,0.0,0,0,0,0,0,0.489476,0.565026,0.483208,0.555911
3,224,"[4363, 2147]","[2021-06-14 10:31:39.990000, 2021-06-14 16:03:...",2021-06-14 10:31:39.990,2021-06-14 16:03:12.793,0.378649,-0.925541,0.207912,-0.978148,1.224647e-16,...,1.0,0,0,0,0,0,0.587334,0.587334,0.597972,0.597972
4,285,[2662],[2021-06-29 15:33:39.601000],2021-06-29 15:33:39.601,2021-06-29 15:33:39.601,-0.801254,-0.598325,-0.207912,0.978148,1.224647e-16,...,0.0,0,0,0,0,0,1.0,1.0,1.0,1.0
