# IEEE Challenge 2021 Data Preprocessing

In [None]:
# Download the challenge data into the working directory as snappy-compressed
# parquet files: the training set, the item metadata and the two test sets.
# `-q --show-progress` silences wget's log output but keeps the progress bar.
!wget -q --show-progress https://github.com/sparsh-ai/ieee21cup-recsys/raw/main/data/bronze/train.parquet.snappy
!wget -q --show-progress https://github.com/sparsh-ai/ieee21cup-recsys/raw/main/data/bronze/item_info.parquet.snappy
!wget -q --show-progress https://github.com/sparsh-ai/ieee21cup-recsys/raw/main/data/bronze/track1_testset.parquet.snappy
!wget -q --show-progress https://github.com/sparsh-ai/ieee21cup-recsys/raw/main/data/bronze/track2_testset.parquet.snappy



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.utils import shuffle

In [None]:
# Never elide columns with "..." when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Load the training split; the trailing expression previews its first five rows.
df_train = pd.read_parquet(path='train.parquet.snappy')
df_train.head(n=5)

Unnamed: 0,user_id,user_click_history,user_protrait,exposed_items,labels,time
0,1,"30:1580603130,34:1581178937,15:1581178939,28:1...","64054,21804,80911,36504,8867,7615,54240,37294,...",120289986119213237164,111111111,1582992009
1,2,"20:1580644762,13:1580644765,1:1580644770,127:1...","64054,26565,93755,88510,6344,7615,54240,21927,...",142611286117191234172,111111110,1582992010
2,3,"39:1580772975,6:1580772981,1:1581178309,2:1581...","64054,64086,63021,88510,93500,7615,54240,21927...",224284810542193236159,111010000,1582992014
3,4,"6:1580657608,9:1580657612,15:1580657615,127:15...","64054,21531,6599,16721,37078,7615,54240,65505,...",516174133122235218172,111111110,1582992014
4,5,"37:1580743545,36:1580743554,25:1580743556,37:1...","64054,66036,6599,88510,76066,20543,83978,37294...",61168573112239172205,111111111,1582992017


In [None]:
# Load the item metadata table and preview its first five rows.
df_item_info = pd.read_parquet(path='item_info.parquet.snappy')
df_item_info.head(n=5)

Unnamed: 0,item_id,item_vec,price,location
0,1,"2,2,1,0.015764,0.833",2954,1
1,2,"2,0,1,0.012539,0.776",179,1
2,3,"2,8,1,0.007518,0.721",757,1
3,4,"2,0,1,0.007163,0.806",1486,1
4,5,"2,0,1,0.015435,0.710",593,1


In [None]:
# Load the track-1 test split and preview its first five rows.
df_test1 = pd.read_parquet(path='track1_testset.parquet.snappy')
df_test1.head(n=5)

Unnamed: 0,user_id,user_click_history,user_protrait,exposed_items,labels,time
0,1,"24:1580573710,7:1580573724,1:1580573730,127:15...","64054,50887,66367,44932,59460,20543,83978,5013...",3529725352164211172,0,1582992008
1,2,"35:1580629376,9:1580629379,2:1580629381,73:158...","64054,35323,6599,36504,24180,37350,83978,21927...",22292513273108188213164,0,1582992009
2,3,"35:1580804903,32:1580804915,39:1581268224,33:1...","64054,67884,6599,16721,17121,38762,83978,37294...",2935146151106150148234,0,1582992019
3,4,"15:1580688381,5:1580688385,39:1580688388,39:15...","64054,23083,93755,36504,35011,37350,83978,3729...",2927359979107213200152,0,1582992024
4,5,"29:1580906648,18:1580906650,10:1580906652,139:...","64054,32125,93755,36504,37741,7615,54240,21927...",48281026186151235199,0,1582992027


In [None]:
# Load the track-2 test split and preview its first five rows.
df_test2 = pd.read_parquet(path='track2_testset.parquet.snappy')
df_test2.head(n=5)

Unnamed: 0,user_id,user_click_history,user_protrait
0,1,"19:1580575389,29:1580575391,31:1580575394,86:1...","92265,80116,6599,44932,44093,93364,83978,6560,..."
1,2,"31:1580572948,35:1580572950,21:1580572954,86:1...","64054,33571,6599,16721,31249,20543,83978,65505..."
2,3,"19:1580575389,29:1580575391,31:1580575394,86:1...","92265,80116,6599,44932,44093,93364,83978,6560,..."
3,4,"28:1581785027,28:1581785038,39:1581823664,14:1...","92265,44022,6599,36504,35011,63681,83978,83395..."
4,5,"29:1580607094,4:1580607111,3:1580607113,75:158...","64054,29763,80911,73143,37078,7615,54240,21927..."


In [None]:
# Build one vocabulary per field of the comma-separated `user_protrait` column
# (the column name is misspelled in the data itself): each dict maps a raw
# integer code to a dense 0-based index assigned in first-seen order.
NUM_PORTRAIT_FIELDS = 10

portraitidx_to_idx_dict_list = [dict() for _ in range(NUM_PORTRAIT_FIELDS)]

# Train is scanned first, then the two test sets, so codes that only occur in
# a test set still receive an index (and train codes keep the lowest indices).
for df_split in (df_train, df_test1, df_test2):
    for raw_portrait in tqdm(df_split['user_protrait']):
        for field, code in enumerate(int(s) for s in raw_portrait.split(',')):
            field_vocab = portraitidx_to_idx_dict_list[field]
            if code not in field_vocab:
                # The next free index is simply the current vocabulary size.
                field_vocab[code] = len(field_vocab)

# Vocabulary size per field. The (misspelled) name is kept because later
# cells read it.
acculumated_idx = [len(field_vocab) for field_vocab in portraitidx_to_idx_dict_list]

100%|██████████| 260087/260087 [00:03<00:00, 74153.00it/s]
100%|██████████| 206254/206254 [00:02<00:00, 75088.70it/s]
100%|██████████| 206096/206096 [00:02<00:00, 74733.08it/s]


In [None]:
# Number of distinct raw codes seen for each of the 10 portrait fields.
acculumated_idx

[3, 1430, 20, 10, 198, 52, 3, 13, 2, 2347]

In [None]:
# Raw code -> dense index mapping for portrait field 0.
portraitidx_to_idx_dict_list[0]

{64054: 0, 90378: 2, 92265: 1}

In [None]:
# Preview the first ten (raw code -> dense index) entries of portrait field 1.
{code: dense for code, dense in list(portraitidx_to_idx_dict_list[1].items())[:10]}

{21531: 3,
 21804: 0,
 26565: 1,
 26897: 7,
 27984: 5,
 55456: 8,
 64086: 2,
 66036: 4,
 91745: 6,
 92420: 9}

In [None]:
# Item feature lookup: item_id -> {'cont': float64 array of 3, 'discrete': int64 array of 4}.
item_info_dict = {}
for i in tqdm(range(df_item_info.shape[0])):
    vec_fields = df_item_info.at[i, 'item_vec'].split(',')

    # Continuous features: the last two item_vec entries, then price divided by 3000.
    cont_values = [float(v) for v in vec_fields[-2:]]
    cont_values.append(float(df_item_info.at[i, 'price'] / 3000))

    # Discrete features: the first three item_vec entries, then location.
    # Entry 0 (1~4 -> 0~3), entry 2 (1~2 -> 0~1) and location (-> 0~2) are
    # shifted down by one; entry 1 is kept as-is.
    disc_values = [int(v) for v in vec_fields[:3]]
    disc_values.append(int(df_item_info.at[i, 'location'] - 1))
    disc_values[0] -= 1
    disc_values[2] -= 1

    item_info_dict[int(df_item_info.at[i, 'item_id'])] = {
        'cont': np.array(cont_values, dtype=np.float64),
        'discrete': np.array(disc_values, dtype=np.int64),
    }

100%|██████████| 381/381 [00:00<00:00, 19797.20it/s]


In [None]:
# Raw item rows again, for comparison with the encoded item_info_dict entries.
df_item_info.head()

Unnamed: 0,item_id,item_vec,price,location
0,1,"2,2,1,0.015764,0.833",2954,1
1,2,"2,0,1,0.012539,0.776",179,1
2,3,"2,8,1,0.007518,0.721",757,1
3,4,"2,0,1,0.007163,0.806",1486,1
4,5,"2,0,1,0.015435,0.710",593,1


In [None]:
# Preview the encoded features of the first ten items.
{item: feats for item, feats in list(item_info_dict.items())[:10]}

{1: {'cont': array([0.015764  , 0.833     , 0.98466667]),
  'discrete': array([1, 2, 0, 0])},
 2: {'cont': array([0.012539  , 0.776     , 0.05966667]),
  'discrete': array([1, 0, 0, 0])},
 3: {'cont': array([0.007518  , 0.721     , 0.25233333]),
  'discrete': array([1, 8, 0, 0])},
 4: {'cont': array([0.007163  , 0.806     , 0.49533333]),
  'discrete': array([1, 0, 0, 0])},
 5: {'cont': array([0.015435  , 0.71      , 0.19766667]),
  'discrete': array([1, 0, 0, 0])},
 6: {'cont': array([0.00742, 0.734  , 0.493  ]),
  'discrete': array([1, 7, 0, 0])},
 7: {'cont': array([0.006577  , 0.674     , 0.98466667]),
  'discrete': array([1, 7, 0, 0])},
 8: {'cont': array([0.005922  , 0.66      , 0.06633333]),
  'discrete': array([1, 0, 0, 0])},
 9: {'cont': array([0.006482  , 0.69      , 0.06633333]),
  'discrete': array([1, 0, 0, 0])},
 10: {'cont': array([0.008988  , 0.749     , 0.19866667]),
  'discrete': array([1, 3, 0, 0])}}

In [None]:
# trainset
# Expand every training row into one sample per exposed item.
MAX_CLICK_HISTORY = 400   # length of the zero-padded click-history vector
NUM_PORTRAIT_FIELDS = 10  # portraitidx_to_idx_dict_list holds one dict per field
NUM_EXPOSED_ITEMS = 9     # exposed items (and labels) read per row

train_samples = []
val_samples = []

# shuffle
# df_train = shuffle(df_train, random_state=2333).reset_index() # not shuffling - for this tutorial
# Without shuffling the split is by row order: the first 95% of rows become
# training samples and the remainder validation samples.
total_num = int(df_train.shape[0])
num_train = int(total_num * 0.95)
num_val = total_num - num_train

for i in tqdm(range(total_num)):
    raw_history = df_train.at[i, 'user_click_history']
    if raw_history == '0:0':
        # Presumably the placeholder for "no history"; it is kept as a
        # history consisting of the single id 0 (length 1).
        clicked_ids = [0]
    else:
        # Entries look like "<item_id>:<timestamp>"; only the id is kept.
        clicked_ids = [int(click.split(':')[0]) for click in raw_history.split(',')]

    # Keep at most MAX_CLICK_HISTORY entries so the padded assignment below
    # cannot fail on a longer history. The tail is kept on the assumption that
    # entries are in chronological order (as in the previewed rows), i.e. the
    # most recent clicks survive.
    clicked_ids = clicked_ids[-MAX_CLICK_HISTORY:]
    num_user_click_history = len(clicked_ids)
    user_click_list = np.zeros(MAX_CLICK_HISTORY, dtype=np.int64)
    user_click_list[:num_user_click_history] = clicked_ids

    exposed_items = [int(s) for s in df_train.at[i, 'exposed_items'].split(',')]
    labels = [int(s) for s in df_train.at[i, 'labels'].split(',')]

    # portraitidx_to_idx_dict_list: list of 10 dict, int:int
    raw_portrait = [int(s) for s in df_train.at[i, 'user_protrait'].split(',')]
    # Encoded once per row; the same array is shared by all samples of the row
    # (just like user_click_list) instead of being rebuilt for each of them.
    user_portrait = np.array(
        [portraitidx_to_idx_dict_list[j][raw_portrait[j]] for j in range(NUM_PORTRAIT_FIELDS)],
        dtype=np.int64,
    )

    target_samples = train_samples if i < num_train else val_samples
    for k in range(NUM_EXPOSED_ITEMS):
        target_samples.append({
            'user_click_list': user_click_list,
            'num_user_click_history': num_user_click_history,
            'user_portrait': user_portrait,
            'item_id': exposed_items[k],
            'label': labels[k]
        })

100%|██████████| 260087/260087 [00:33<00:00, 7868.91it/s]


In [None]:
# Raw training rows again, for comparison with the expanded samples below.
df_train.head()

Unnamed: 0,user_id,user_click_history,user_protrait,exposed_items,labels,time
0,1,"30:1580603130,34:1581178937,15:1581178939,28:1...","64054,21804,80911,36504,8867,7615,54240,37294,...",120289986119213237164,111111111,1582992009
1,2,"20:1580644762,13:1580644765,1:1580644770,127:1...","64054,26565,93755,88510,6344,7615,54240,21927,...",142611286117191234172,111111110,1582992010
2,3,"39:1580772975,6:1580772981,1:1581178309,2:1581...","64054,64086,63021,88510,93500,7615,54240,21927...",224284810542193236159,111010000,1582992014
3,4,"6:1580657608,9:1580657612,15:1580657615,127:15...","64054,21531,6599,16721,37078,7615,54240,65505,...",516174133122235218172,111111110,1582992014
4,5,"37:1580743545,36:1580743554,25:1580743556,37:1...","64054,66036,6599,88510,76066,20543,83978,37294...",61168573112239172205,111111111,1582992017


In [None]:
# First expanded training sample (first exposed item of row 0).
train_samples[0]

{'item_id': 1,
 'label': 1,
 'num_user_click_history': 74,
 'user_click_list': array([ 30,  34,  15,  28,  86, 132, 128, 172, 239,  35,  34,  14,  56,
        106,  15,  32,  27,  15,  56, 106,  44, 235, 149,  35,  16,   1,
         72, 132, 100, 172, 239,  14,  18,   1, 133,  79,  79, 101, 101,
        159, 212,  10,  34,  18,  18,  56,  56, 133, 132, 219, 172, 219,
        219, 219, 212,  32,  27,   9, 135, 105,  59, 172, 237,  14,  37,
          5, 113, 103,  46,  46, 192,  10,   6,  21,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0

In [None]:
class BigDataCupDataset(torch.utils.data.Dataset):
    """Labelled (user, exposed item) samples served as tensors.

    Args:
        item_info_dict: maps an item id to a dict holding that item's
            'discrete' and 'cont' feature arrays.
        database: indexable collection of sample dicts with the keys
            'user_click_list', 'num_user_click_history', 'user_portrait',
            'item_id' and 'label'.
    """

    def __init__(self, item_info_dict, database):
        super().__init__()
        self.item_info_dict = item_info_dict
        self.database = database

    def __len__(self):
        """Return the number of samples in the database."""
        return len(self.database)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a 7-tuple of tensors.

        Order: click history, history length, user portrait, item id,
        item discrete features, item continuous features, label.
        All are int32 except the continuous features, which are float32;
        scalars are wrapped into 1-element tensors.
        """
        record = self.database[idx]
        raw_item_id = record['item_id']
        item_features = self.item_info_dict[raw_item_id]

        def as_int(values):
            return torch.tensor(values, dtype=torch.int32)

        return (
            as_int(record['user_click_list']),
            as_int([record['num_user_click_history']]),
            as_int(record['user_portrait']),
            as_int([raw_item_id]),
            as_int(item_features['discrete']),
            torch.tensor(item_features['cont'], dtype=torch.float32),
            as_int([record['label']]),
        )

In [None]:
# Wrap the expanded training samples together with the item feature lookup.
train_ds = BigDataCupDataset(item_info_dict=item_info_dict, database=train_samples)

In [None]:
# Sanity check: print the first two samples produced by the dataset.
for i in range(min(2, len(train_ds))):
    sample = train_ds[i]
    print(sample)

(tensor([ 30,  34,  15,  28,  86, 132, 128, 172, 239,  35,  34,  14,  56, 106,
         15,  32,  27,  15,  56, 106,  44, 235, 149,  35,  16,   1,  72, 132,
        100, 172, 239,  14,  18,   1, 133,  79,  79, 101, 101, 159, 212,  10,
         34,  18,  18,  56,  56, 133, 132, 219, 172, 219, 219, 219, 212,  32,
         27,   9, 135, 105,  59, 172, 237,  14,  37,   5, 113, 103,  46,  46,
        192,  10,   6,  21,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 

In [None]:
# Mini-batches of 32 training samples, reshuffled every epoch.
train_dl = torch.utils.data.DataLoader(train_ds, shuffle=True, batch_size=32)
train_dl

<torch.utils.data.dataloader.DataLoader at 0x7fecb286a650>

In [None]:
# Raw track-1 test rows again, for comparison with the expanded samples below.
df_test1.head()

Unnamed: 0,user_id,user_click_history,user_protrait,exposed_items,labels,time
0,1,"24:1580573710,7:1580573724,1:1580573730,127:15...","64054,50887,66367,44932,59460,20543,83978,5013...",3529725352164211172,0,1582992008
1,2,"35:1580629376,9:1580629379,2:1580629381,73:158...","64054,35323,6599,36504,24180,37350,83978,21927...",22292513273108188213164,0,1582992009
2,3,"35:1580804903,32:1580804915,39:1581268224,33:1...","64054,67884,6599,16721,17121,38762,83978,37294...",2935146151106150148234,0,1582992019
3,4,"15:1580688381,5:1580688385,39:1580688388,39:15...","64054,23083,93755,36504,35011,37350,83978,3729...",2927359979107213200152,0,1582992024
4,5,"29:1580906648,18:1580906650,10:1580906652,139:...","64054,32125,93755,36504,37741,7615,54240,21927...",48281026186151235199,0,1582992027


In [None]:
# testset
# Expand every track-1 test row into one unlabelled sample per exposed item.
MAX_CLICK_HISTORY = 400   # length of the zero-padded click-history vector
NUM_PORTRAIT_FIELDS = 10  # portraitidx_to_idx_dict_list holds one dict per field
NUM_EXPOSED_ITEMS = 9     # exposed items read per row

test_samples = []

# Rows are processed in file order (nothing is shuffled here), so the samples
# of one row stay contiguous in test_samples.
total_num = int(df_test1.shape[0])

for i in tqdm(range(total_num)):
    raw_history = df_test1.at[i, 'user_click_history']
    if raw_history == '0:0':
        # Presumably the placeholder for "no history"; it is kept as a
        # history consisting of the single id 0 (length 1).
        clicked_ids = [0]
    else:
        # Entries look like "<item_id>:<timestamp>"; only the id is kept.
        clicked_ids = [int(click.split(':')[0]) for click in raw_history.split(',')]

    # Keep at most MAX_CLICK_HISTORY entries so the padded assignment below
    # cannot fail on a longer history. The tail is kept on the assumption that
    # entries are in chronological order (as in the previewed rows), i.e. the
    # most recent clicks survive.
    clicked_ids = clicked_ids[-MAX_CLICK_HISTORY:]
    num_user_click_history = len(clicked_ids)
    user_click_list = np.zeros(MAX_CLICK_HISTORY, dtype=np.int64)
    user_click_list[:num_user_click_history] = clicked_ids

    # The `labels` column is not parsed: test samples carry no label.
    exposed_items = [int(s) for s in df_test1.at[i, 'exposed_items'].split(',')]

    # portraitidx_to_idx_dict_list: list of 10 dict, int:int
    raw_portrait = [int(s) for s in df_test1.at[i, 'user_protrait'].split(',')]
    # Encoded once per row; the same array is shared by all samples of the row
    # (just like user_click_list) instead of being rebuilt for each of them.
    user_portrait = np.array(
        [portraitidx_to_idx_dict_list[j][raw_portrait[j]] for j in range(NUM_PORTRAIT_FIELDS)],
        dtype=np.int64,
    )

    for k in range(NUM_EXPOSED_ITEMS):
        test_samples.append({
            'user_click_list': user_click_list,
            'num_user_click_history': num_user_click_history,
            'user_portrait': user_portrait,
            'item_id': exposed_items[k],
        })

100%|██████████| 206254/206254 [00:24<00:00, 8287.95it/s]


In [None]:
# First expanded test sample (first exposed item of row 0); note there is no 'label' key.
test_samples[0]

{'item_id': 3,
 'num_user_click_history': 111,
 'user_click_list': array([ 24,   7,   1, 127,  74,  47, 212, 199,   6,  15,  10, 127, 126,
         76, 220, 196, 172, 196,  15,  39,  31, 132,  80,  61, 200, 219,
          1,   5,  14, 101,  40,  52, 235, 238, 164,   1,  14,  20,  77,
         80,  40, 239, 233, 164, 164,  33,  31,  14, 139,  83,  83, 125,
        184, 240, 160,   8,   1,  25,  51,  76,  43, 235, 211, 164,  14,
          4,   9, 126, 116,  43, 164, 213, 183,   6,  14,   4,  43, 126,
         57, 183, 188, 164,  10,  39,  25, 102, 109, 111, 160, 160, 157,
        183,  10,   5,  32,  43, 103,  50, 242, 172, 171, 171,  35,   9,
          5,  86,  48,  88, 218, 236, 215,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 

In [None]:
class BigDataCupTestDataset(torch.utils.data.Dataset):
    """Unlabelled (user, exposed item) samples served as tensors.

    Args:
        item_info_dict: maps an item id to a dict holding that item's
            'discrete' and 'cont' feature arrays.
        database: indexable collection of sample dicts with the keys
            'user_click_list', 'num_user_click_history', 'user_portrait'
            and 'item_id'.
    """

    def __init__(self, item_info_dict, database):
        super().__init__()
        self.item_info_dict = item_info_dict
        self.database = database

    def __len__(self):
        """Return the number of samples in the database."""
        return len(self.database)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a 6-tuple of tensors.

        Order: click history, history length, user portrait, item id,
        item discrete features, item continuous features.
        All are int32 except the continuous features, which are float32;
        scalars are wrapped into 1-element tensors.
        """
        record = self.database[idx]
        raw_item_id = record['item_id']
        item_features = self.item_info_dict[raw_item_id]

        def as_int(values):
            return torch.tensor(values, dtype=torch.int32)

        return (
            as_int(record['user_click_list']),
            as_int([record['num_user_click_history']]),
            as_int(record['user_portrait']),
            as_int([raw_item_id]),
            as_int(item_features['discrete']),
            torch.tensor(item_features['cont'], dtype=torch.float32),
        )

In [None]:
# NOTE: despite the `val_` prefix, this dataset wraps `test_samples` (the
# track-1 test set), not the `val_samples` held out from the training data.
val_ds = BigDataCupTestDataset(item_info_dict=item_info_dict, database=test_samples)

# Sanity check: print the first two samples produced by the dataset.
for i in range(min(2, len(val_ds))):
    sample = val_ds[i]
    print(sample)

(tensor([ 24,   7,   1, 127,  74,  47, 212, 199,   6,  15,  10, 127, 126,  76,
        220, 196, 172, 196,  15,  39,  31, 132,  80,  61, 200, 219,   1,   5,
         14, 101,  40,  52, 235, 238, 164,   1,  14,  20,  77,  80,  40, 239,
        233, 164, 164,  33,  31,  14, 139,  83,  83, 125, 184, 240, 160,   8,
          1,  25,  51,  76,  43, 235, 211, 164,  14,   4,   9, 126, 116,  43,
        164, 213, 183,   6,  14,   4,  43, 126,  57, 183, 188, 164,  10,  39,
         25, 102, 109, 111, 160, 160, 157, 183,  10,   5,  32,  43, 103,  50,
        242, 172, 171, 171,  35,   9,   5,  86,  48,  88, 218, 236, 215,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 

In [None]:
# Unshuffled batches of 9: test_samples holds 9 consecutive samples per test
# row, so each batch lines up with the exposed items of exactly one row.
val_dl = torch.utils.data.DataLoader(val_ds, shuffle=False, batch_size=9)
val_dl

<torch.utils.data.dataloader.DataLoader at 0x7fec46253510>