In [1]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
import gc
tqdm.pandas()

In [2]:
candidate_items = pd.read_csv("../../Dataset/candidate_items.csv")
item_features = pd.read_csv("../../Dataset/item_features.csv")
train_purchases = pd.read_csv("../../Dataset/train_purchases.csv")
train_sessions = pd.read_csv("../../Dataset/train_sessions.csv")
test_final_sessions = pd.read_csv("../../Dataset/test_final_sessions.csv")
test_leaderboard_sessions = pd.read_csv("../../Dataset/test_leaderboard_sessions.csv")

In [3]:
def compute_mean(x, y):
    """Compute the Mean Value of an attribute where x is the sum of values and y is the number of elements.
    Used also to compute the composition of the seasonality attribute
    Outputs -1 if no stats are present"""
    if y != 0:
        return x / y
    else:
        return -1

def compute_quartet_entropy(wi, sp, su, au):
    """Compute the Information Entropy of the Seasonality Attribute"""
    total = wi + sp + su + au
    if total == 0:
        return -1
    if wi == 0:
        wi_coeff = 0
    else:
        wi_coeff = - (wi / total) * math.log2(wi / total)
    if sp == 0:
        sp_coeff = 0
    else:
        sp_coeff = - (sp / total) * math.log2(sp / total)
    if su == 0:
        su_coeff = 0
    else:
        su_coeff = - (su / total) * math.log2(su / total)
    if au == 0:
        au_coeff = 0
    else:
        au_coeff = - (au / total) * math.log2(au / total)
    return 1 - ((wi_coeff + sp_coeff + su_coeff + au_coeff) / 2)

def compute_season(x, record):
    """Lambda function to compute the purchases of an item or of a user during the 4 seasons"""
    month = x.date.month
    elems = record[x.item_id]
    if month in [12, 1, 2]:
        elems[0] += 1
    elif month in [3, 4, 5]:
        elems[1] += 1
    elif month in [6, 7, 8]:
        elems[2] += 1
    else:
        elems[3] += 1


def extract_season(sessions):
    """Routine to extract the Seasonality information of the items or users"""
    sessions.date = pd.to_datetime(sessions.date)
    record = {item: [0, 0, 0, 0] for item in item_features.item_id.unique()}

    print('Building Statistics...')
    sessions.progress_apply(lambda x: compute_season(x, record), axis=1)

    return pd.DataFrame.from_dict(record, orient='index', columns=['winter', 'spring', 'summer', 'autumn'])


def compute_seasonality_tendency(df):
    """Compute Seasonality Tendency for items or users"""
    print('Computing Seasonality Tendency...')
    return df.progress_apply(lambda x:
                             compute_quartet_entropy(x['winter'], x['spring'],
                                                     x['summer'], x['autumn']),
                             axis=1)


def compute_winter_tendency(df):
    """Compute Winter Tendency for items or users"""
    print('Computing Winter Tendency...')
    return df.progress_apply(lambda x:
                             compute_mean(x['winter'],
                                          x['winter'] + x['spring'] +
                                          x['summer'] + x['autumn']),
                             axis=1)


def compute_spring_tendency(df):
    """Compute Spring Tendency for items or users"""
    print('Computing Spring Tendency...')
    return df.progress_apply(lambda x:
                             compute_mean(x['spring'],
                                          x['winter'] + x['spring'] +
                                          x['summer'] + x['autumn']),
                             axis=1)


def compute_summer_tendency(df):
    """Compute Summer Tendency for items or user"""
    print('Computing Summer Tendency...')
    return df.progress_apply(lambda x:
                             compute_mean(x['summer'],
                                          x['winter'] + x['spring'] +
                                          x['summer'] + x['autumn']),
                             axis=1)


def compute_autumn_tendency(df):
    """Compute Summer Tendency for items or user"""
    print('Computing Autumn Tendency...')
    return df.progress_apply(lambda x:
                             compute_mean(x['autumn'],
                                          x['winter'] + x['spring'] +
                                          x['summer'] + x['autumn']),
                             axis=1)

In [4]:
season_sessions_df = extract_season(train_sessions)

Building Statistics...


100%|██████████| 4743820/4743820 [01:44<00:00, 45442.98it/s]


In [5]:
season_sessions_df['seasonality_tendency'] = compute_seasonality_tendency(season_sessions_df)
season_sessions_df['winter_tendency'] = compute_winter_tendency(season_sessions_df)
season_sessions_df['spring_tendency'] = compute_spring_tendency(season_sessions_df)
season_sessions_df['summer_tendency'] = compute_summer_tendency(season_sessions_df)
season_sessions_df['autumn_tendency'] = compute_autumn_tendency(season_sessions_df)
season_sessions_df

Computing Seasonality Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 42669.84it/s]


Computing Winter Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 43769.28it/s]


Computing Spring Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 45386.40it/s]


Computing Summer Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 41686.70it/s]


Computing Autumn Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 44085.34it/s]


Unnamed: 0,winter,spring,summer,autumn,seasonality_tendency,winter_tendency,spring_tendency,summer_tendency,autumn_tendency
2,0,1,0,0,1.000000,0.000000,1.000000,0.000000,0.000000
3,101,0,0,56,0.530050,0.643312,0.000000,0.000000,0.356688
4,0,375,0,0,1.000000,0.000000,1.000000,0.000000,0.000000
7,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
8,44,83,0,0,0.534568,0.346457,0.653543,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
28139,0,15,299,2,0.834787,0.000000,0.047468,0.946203,0.006329
28140,160,1,0,288,0.519488,0.356347,0.002227,0.000000,0.641425
28141,53,14,0,85,0.342092,0.348684,0.092105,0.000000,0.559211
28142,2,0,0,0,1.000000,1.000000,0.000000,0.000000,0.000000


In [6]:
season_purchases_df = extract_season(train_purchases)

Building Statistics...


100%|██████████| 1000000/1000000 [00:23<00:00, 42058.49it/s]


In [7]:
season_purchases_df['seasonality_tendency'] = compute_seasonality_tendency(season_purchases_df)
season_purchases_df['winter_tendency'] = compute_winter_tendency(season_purchases_df)
season_purchases_df['spring_tendency'] = compute_spring_tendency(season_purchases_df)
season_purchases_df['summer_tendency'] = compute_summer_tendency(season_purchases_df)
season_purchases_df['autumn_tendency'] = compute_autumn_tendency(season_purchases_df)
season_purchases_df

Computing Seasonality Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 36354.29it/s]


Computing Winter Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 36944.94it/s]


Computing Spring Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 39444.57it/s]


Computing Summer Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 42842.92it/s]


Computing Autumn Tendency...


100%|██████████| 23691/23691 [00:00<00:00, 41336.54it/s]


Unnamed: 0,winter,spring,summer,autumn,seasonality_tendency,winter_tendency,spring_tendency,summer_tendency,autumn_tendency
2,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
3,14,0,0,5,0.584263,0.736842,0.000000,0.000000,0.263158
4,0,80,0,0,1.000000,0.000000,1.000000,0.000000,0.000000
7,1,0,0,0,1.000000,1.000000,0.000000,0.000000,0.000000
8,14,10,0,0,0.510066,0.583333,0.416667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
28139,0,3,50,1,0.779479,0.000000,0.055556,0.925926,0.018519
28140,8,0,0,67,0.755110,0.106667,0.000000,0.000000,0.893333
28141,17,3,0,19,0.343868,0.435897,0.076923,0.000000,0.487179
28142,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000


In [8]:
season_sessions_df.rename(columns={'winter': 'winter_view',
                           'spring': 'spring_view',
                           'summer': 'summer_view',
                           'autumn': 'autumn_view',
                           'seasonality_tendency': 'seasonality_tendency_view',
                           'winter_tendency': 'winter_tendency_view',
                           'spring_tendency': 'spring_tendency_view',
                           'summer_tendency': 'summer_tendency_view',
                           'autumn_tendency': 'autumn_tendency_view'},
                          inplace=True)

season_purchases_df.rename(columns={'winter': 'winter_purchases',
                           'spring': 'spring_purchases',
                           'summer': 'summer_purchases',
                           'autumn': 'autumn_purchases',
                           'seasonality_tendency': 'seasonality_tendency_purchases',
                           'winter_tendency': 'winter_tendency_purchases',
                           'spring_tendency': 'spring_tendency_purchases',
                           'summer_tendency': 'summer_tendency_purchases',
                           'autumn_tendency': 'autumn_tendency_purchases'},
                          inplace=True)

In [9]:
item_attributes = season_sessions_df.merge(right=season_purchases_df, left_index=True, right_index=True)
item_attributes

Unnamed: 0,winter_view,spring_view,summer_view,autumn_view,seasonality_tendency_view,winter_tendency_view,spring_tendency_view,summer_tendency_view,autumn_tendency_view,winter_purchases,spring_purchases,summer_purchases,autumn_purchases,seasonality_tendency_purchases,winter_tendency_purchases,spring_tendency_purchases,summer_tendency_purchases,autumn_tendency_purchases
2,0,1,0,0,1.000000,0.000000,1.000000,0.000000,0.000000,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
3,101,0,0,56,0.530050,0.643312,0.000000,0.000000,0.356688,14,0,0,5,0.584263,0.736842,0.000000,0.000000,0.263158
4,0,375,0,0,1.000000,0.000000,1.000000,0.000000,0.000000,0,80,0,0,1.000000,0.000000,1.000000,0.000000,0.000000
7,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,1,0,0,0,1.000000,1.000000,0.000000,0.000000,0.000000
8,44,83,0,0,0.534568,0.346457,0.653543,0.000000,0.000000,14,10,0,0,0.510066,0.583333,0.416667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28139,0,15,299,2,0.834787,0.000000,0.047468,0.946203,0.006329,0,3,50,1,0.779479,0.000000,0.055556,0.925926,0.018519
28140,160,1,0,288,0.519488,0.356347,0.002227,0.000000,0.641425,8,0,0,67,0.755110,0.106667,0.000000,0.000000,0.893333
28141,53,14,0,85,0.342092,0.348684,0.092105,0.000000,0.559211,17,3,0,19,0.343868,0.435897,0.076923,0.000000,0.487179
28142,2,0,0,0,1.000000,1.000000,0.000000,0.000000,0.000000,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000


In [10]:
item_features['category_value'] = item_features['feature_category_id'].astype(str) + '-' + item_features[
        'feature_value_id'].astype(str)
item_features

Unnamed: 0,item_id,feature_category_id,feature_value_id,category_value
0,2,56,365,56-365
1,2,62,801,62-801
2,2,68,351,68-351
3,2,33,802,33-802
4,2,72,75,72-75
...,...,...,...,...
471746,28143,68,351,68-351
471747,28143,55,390,55-390
471748,28143,11,109,11-109
471749,28143,73,91,73-91


In [11]:
category_value_keys = list(item_features['category_value'].unique())  # list of pairs category-value
category_value_values = [v for v in range(len(category_value_keys))]  # new index -- perché non fare arange?
category_value_dict = dict(zip(category_value_keys, category_value_values))
item_features['category_value'] = item_features['category_value'].map(category_value_dict)
item_features.drop(['feature_value_id', 'feature_category_id'], axis=1, inplace=True)
item_features

Unnamed: 0,item_id,category_value
0,2,0
1,2,1
2,2,2
3,2,3
4,2,4
...,...,...
471746,28143,2
471747,28143,545
471748,28143,62
471749,28143,168


In [12]:
item_features['value'] = 1
item_features_unstack = item_features.pivot(index='item_id', columns='category_value', values='value')
item_features_unstack.fillna(value=0, inplace=True)
item_features_unstack

category_value,0,1,2,3,4,5,6,7,8,9,...,894,895,896,897,898,899,900,901,902,903
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28139,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28140,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28141,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28142,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
item_features = item_features.merge(right=item_features_unstack, left_on='item_id', right_on='item_id')
item_features.drop_duplicates(['item_id'], inplace=True)
item_features.drop(columns=['category_value', 'value'], inplace=True)
item_features = item_features.astype('int64')
item_features

Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,...,894,895,896,897,898,899,900,901,902,903
0,2,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
13,3,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37,4,1,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
61,7,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76,8,1,0,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471652,28139,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
471676,28140,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
471694,28141,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
471718,28142,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
item_attributes = item_features.merge(right=item_attributes, left_on='item_id', right_index=True)
item_attributes.reset_index(inplace=True)
item_attributes

Unnamed: 0,index,item_id,0,1,2,3,4,5,6,7,...,autumn_tendency_view,winter_purchases,spring_purchases,summer_purchases,autumn_purchases,seasonality_tendency_purchases,winter_tendency_purchases,spring_tendency_purchases,summer_tendency_purchases,autumn_tendency_purchases
0,0,2,1,1,1,1,1,1,1,1,...,0.000000,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
1,13,3,1,0,0,0,1,0,0,0,...,0.356688,14,0,0,5,0.584263,0.736842,0.000000,0.000000,0.263158
2,37,4,1,0,0,0,1,0,0,0,...,0.000000,0,80,0,0,1.000000,0.000000,1.000000,0.000000,0.000000
3,61,7,0,0,0,0,1,0,0,0,...,-1.000000,1,0,0,0,1.000000,1.000000,0.000000,0.000000,0.000000
4,76,8,1,0,1,0,1,0,0,0,...,0.000000,14,10,0,0,0.510066,0.583333,0.416667,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,471652,28139,1,0,0,0,1,0,0,0,...,0.006329,0,3,50,1,0.779479,0.000000,0.055556,0.925926,0.018519
23687,471676,28140,1,0,1,0,0,0,0,0,...,0.641425,8,0,0,67,0.755110,0.106667,0.000000,0.000000,0.893333
23688,471694,28141,1,0,0,0,1,0,0,0,...,0.559211,17,3,0,19,0.343868,0.435897,0.076923,0.000000,0.487179
23689,471718,28142,0,0,0,0,1,1,0,0,...,0.000000,0,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000


In [15]:
item_attributes.drop(columns=['index'], inplace=True)

In [16]:
item_attributes['total_view'] = item_attributes.apply(lambda x: x.winter_view +
                                                         x.spring_view + x.summer_view+
                                                         x.autumn_view, axis=1)
item_attributes['total_purchases'] = item_attributes.apply(lambda x: x.winter_purchases +
                                                         x.spring_purchases + x.summer_purchases+
                                                         x.autumn_purchases, axis=1)
item_attributes

Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,...,spring_purchases,summer_purchases,autumn_purchases,seasonality_tendency_purchases,winter_tendency_purchases,spring_tendency_purchases,summer_tendency_purchases,autumn_tendency_purchases,total_view,total_purchases
0,2,1,1,1,1,1,1,1,1,1,...,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,1.0,0.0
1,3,1,0,0,0,1,0,0,0,0,...,0,0,5,0.584263,0.736842,0.000000,0.000000,0.263158,157.0,19.0
2,4,1,0,0,0,1,0,0,0,1,...,80,0,0,1.000000,0.000000,1.000000,0.000000,0.000000,375.0,80.0
3,7,0,0,0,0,1,0,0,0,0,...,0,0,0,1.000000,1.000000,0.000000,0.000000,0.000000,0.0,1.0
4,8,1,0,1,0,1,0,0,0,1,...,10,0,0,0.510066,0.583333,0.416667,0.000000,0.000000,127.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,1,0,0,0,1,0,0,0,0,...,3,50,1,0.779479,0.000000,0.055556,0.925926,0.018519,316.0,54.0
23687,28140,1,0,1,0,0,0,0,0,0,...,0,0,67,0.755110,0.106667,0.000000,0.000000,0.893333,449.0,75.0
23688,28141,1,0,0,0,1,0,0,0,0,...,3,0,19,0.343868,0.435897,0.076923,0.000000,0.487179,152.0,39.0
23689,28142,0,0,0,0,1,1,0,0,0,...,0,0,0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,2.0,0.0


In [17]:
item_attributes.to_csv('../../Dataset/item_attributes.csv', index=False)