In [1]:
import sys
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
import gc
tqdm.pandas()

In [2]:
item_features = pd.read_csv("../../Dataset/item_features.csv")
item_features

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [24]:
unusable_category = set()
for item in tqdm(item_features.item_id.unique()):
    item_df = item_features.loc[item_features.item_id == item]
    duplicates = item_df.loc[item_df.feature_category_id.duplicated(),:].feature_category_id.values
    unusable_category.update(duplicates)
unusable_category

100%|██████████| 23691/23691 [00:19<00:00, 1211.96it/s]


{1, 4, 28, 30, 46, 53}

In [31]:
item_features_filtered = item_features[~(item_features.feature_category_id.isin(unusable_category))]
item_features_filtered

Unnamed: 0,item_id,feature_category_id,feature_value_id
0,2,56,365
1,2,62,801
2,2,68,351
3,2,33,802
4,2,72,75
...,...,...,...
471746,28143,68,351
471747,28143,55,390
471748,28143,11,109
471749,28143,73,91


In [32]:
mapping_dict = {}
for cat_id in tqdm(item_features_filtered.feature_category_id.unique()):
    values = sorted(item_features_filtered.loc[item_features_filtered.feature_category_id == cat_id].feature_value_id.unique())
    mapping_dict[cat_id] = {value: values.index(value) for value in values}

100%|██████████| 67/67 [00:00<00:00, 857.80it/s]


In [41]:
item_features_filtered['mapped_feature_value_id'] = item_features_filtered.progress_apply(lambda x:
                                                                                 mapping_dict[x.feature_category_id][x.feature_value_id], axis=1)
item_features_filtered

100%|██████████| 424547/424547 [00:06<00:00, 61545.70it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_features_filtered['mapped_feature_value_id'] = item_features_filtered.progress_apply(lambda x:


Unnamed: 0,item_id,feature_category_id,feature_value_id,mapped_feature_value_id
0,2,56,365,30
1,2,62,801,1
2,2,68,351,14
3,2,33,802,8
4,2,72,75,1
...,...,...,...,...
471746,28143,68,351,14
471747,28143,55,390,20
471748,28143,11,109,0
471749,28143,73,91,0


In [57]:
item_features_unstack = item_features_filtered.drop(columns=['feature_value_id'])
item_features_unstack = item_features_unstack.pivot(index='item_id', columns='feature_category_id', values='mapped_feature_value_id')
item_features_unstack.fillna(value=-1, inplace=True)
item_features_unstack

feature_category_id,2,3,5,6,7,8,9,10,11,12,...,64,65,66,67,68,69,70,71,72,73
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-1.0,-1.0,-1.0,-1.0,15.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,14.0,28.0,-1.0,-1.0,1.0,-1.0
3,-1.0,4.0,9.0,-1.0,18.0,-1.0,-1.0,-1.0,16.0,-1.0,...,-1.0,5.0,-1.0,-1.0,1.0,18.0,-1.0,-1.0,1.0,1.0
4,-1.0,3.0,9.0,-1.0,33.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,5.0,-1.0,-1.0,15.0,16.0,-1.0,-1.0,1.0,1.0
7,-1.0,-1.0,-1.0,-1.0,23.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,38.0,18.0,-1.0,-1.0,1.0,-1.0
8,-1.0,3.0,9.0,-1.0,31.0,-1.0,-1.0,-1.0,12.0,-1.0,...,-1.0,5.0,-1.0,-1.0,14.0,18.0,-1.0,-1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28139,-1.0,3.0,9.0,-1.0,31.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,5.0,-1.0,-1.0,6.0,25.0,-1.0,-1.0,1.0,1.0
28140,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,6.0,14.0,-1.0,-1.0,3.0,-1.0,1.0
28141,-1.0,4.0,9.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,16.0,14.0,-1.0,-1.0,1.0,1.0
28142,-1.0,-1.0,-1.0,-1.0,24.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,9.0,-1.0,-1.0,46.0,23.0,-1.0,-1.0,1.0,0.0


In [58]:
item_features_unstack = item_features_unstack.reset_index()
item_features_unstack

feature_category_id,item_id,2,3,5,6,7,8,9,10,11,...,64,65,66,67,68,69,70,71,72,73
0,2,-1.0,-1.0,-1.0,-1.0,15.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,14.0,28.0,-1.0,-1.0,1.0,-1.0
1,3,-1.0,4.0,9.0,-1.0,18.0,-1.0,-1.0,-1.0,16.0,...,-1.0,5.0,-1.0,-1.0,1.0,18.0,-1.0,-1.0,1.0,1.0
2,4,-1.0,3.0,9.0,-1.0,33.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,5.0,-1.0,-1.0,15.0,16.0,-1.0,-1.0,1.0,1.0
3,7,-1.0,-1.0,-1.0,-1.0,23.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,38.0,18.0,-1.0,-1.0,1.0,-1.0
4,8,-1.0,3.0,9.0,-1.0,31.0,-1.0,-1.0,-1.0,12.0,...,-1.0,5.0,-1.0,-1.0,14.0,18.0,-1.0,-1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,28139,-1.0,3.0,9.0,-1.0,31.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,5.0,-1.0,-1.0,6.0,25.0,-1.0,-1.0,1.0,1.0
23687,28140,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,1.0,-1.0,-1.0,6.0,14.0,-1.0,-1.0,3.0,-1.0,1.0
23688,28141,-1.0,4.0,9.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,16.0,14.0,-1.0,-1.0,1.0,1.0
23689,28142,-1.0,-1.0,-1.0,-1.0,24.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,9.0,-1.0,-1.0,46.0,23.0,-1.0,-1.0,1.0,0.0


In [65]:
item_features_unstack.to_csv('../../Dataset/item_features_filtered.csv', index=False)