In [123]:
import pandas as pd
import os
import numpy as np

In [124]:
base_path = '../dataset'

original_data = os.path.join(base_path, 'original_data')
processed_data = os.path.join(base_path, 'processed_data')

In [125]:
features=pd.read_csv(os.path.join(processed_data, "item_features_mapped.csv"))
features=features[features["feature_category_id"]!=27].copy()#useless category with only one item -> no information to gain

In [126]:
features["value"]=1

In [127]:
len(features)

471750

In [128]:
features.head()

Unnamed: 0,item_id,feature_category_id,feature_value_id,value
0,19021,56,365,1
1,19021,62,801,1
2,19021,68,351,1
3,19021,33,802,1
4,19021,72,75,1


In [129]:
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471750 entries, 0 to 471750
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   item_id              471750 non-null  int64
 1   feature_category_id  471750 non-null  int64
 2   feature_value_id     471750 non-null  int64
 3   value                471750 non-null  int64
dtypes: int64(4)
memory usage: 18.0 MB


In [130]:
feat_categories=features["feature_category_id"].unique()
print("min:",min(feat_categories),"  max:",max(feat_categories))
print("length:", len(feat_categories))

min: 1   max: 73
length: 72


In [131]:
feat_values=features["feature_value_id"].unique()
print("min:",min(feat_values),"  max:",max(feat_values))
print("length:", len(feat_values))

min: 1   max: 905
length: 889


In [132]:
feat_category_count=features.groupby("feature_category_id")['value'].count()
feat_category_count # In how many items the feature is present

feature_category_id
1       941
2      2253
3      9311
4     14954
5      9488
      ...  
69    20450
70      807
71      671
72    20499
73    13339
Name: value, Length: 72, dtype: int64

In [133]:
print("min:",min(feat_category_count),"  max:",max(feat_category_count))
print("length:", len(feat_category_count))

min: 137   max: 23691
length: 72


In [134]:
feat_category_count[feat_category_count<500] #features that appears in less than 500 items

feature_category_id
9     137
13    137
54    341
Name: value, dtype: int64

In [135]:
feat_category_count[feat_category_count>20000] # features that appears in more than 20000 items

feature_category_id
7     20416
47    23691
50    23077
56    23691
61    22512
68    23038
69    20450
72    20499
Name: value, dtype: int64

In [136]:
features_category=features[["item_id","feature_category_id"]].copy()

In [137]:
feat_value_count=features.groupby("feature_value_id")['value'].count()
feat_value_count # In how many items the feature is present

feature_value_id
1         6
2      1700
3       164
4        91
5        49
       ... 
901    5494
902    6875
903      37
904      14
905       2
Name: value, Length: 889, dtype: int64

In [138]:
print("min:",min(feat_value_count),"  max:",max(feat_value_count))
print("length:", len(feat_value_count))

min: 1   max: 19472
length: 889


In [139]:
feat_value_count[feat_value_count<20]

feature_value_id
1       6
8       5
16      7
19      1
21      2
       ..
888     1
891     2
894    18
904    14
905     2
Name: value, Length: 281, dtype: int64

In [140]:
feat_value_count[feat_value_count>20000] # features that appears in more than 20000 items

Series([], Name: value, dtype: int64)

In [141]:
feats=features.groupby(["feature_category_id","feature_value_id"])["value"].count().reset_index()
feats # For the given feature category, how many items have the value specified in feature_value_id?

Unnamed: 0,feature_category_id,feature_value_id,value
0,1,60,1
1,1,143,10
2,1,358,2
3,1,461,924
4,1,517,2
...,...,...,...
898,72,751,55
899,72,829,405
900,72,883,65
901,73,91,2381


In [142]:
feats=feats.sort_values(by="value",ascending=False)
feats
feats.head()

Unnamed: 0,feature_category_id,feature_value_id,value
636,56,365,19472
877,72,75,12582
715,61,706,12353
190,17,378,11939
742,63,861,11651


In [143]:
len(feats)

903

In [144]:
feats.tail()

Unnamed: 0,feature_category_id,feature_value_id,value
175,14,888,1
406,39,137,1
418,40,697,1
56,5,257,1
0,1,60,1


In [145]:
len(feats[feats["value"]<5])

131

In [146]:
feats=feats[feats["value"]>=5]

In [147]:
feats=feats.reset_index()
feats["feature_idx"]=feats.index

In [148]:
feats

Unnamed: 0,index,feature_category_id,feature_value_id,value,feature_idx
0,636,56,365,19472,0
1,877,72,75,12582,1
2,715,61,706,12353,2
3,190,17,378,11939,3
4,742,63,861,11651,4
...,...,...,...,...,...
767,661,56,752,5,767
768,349,30,820,5,768
769,508,50,8,5,769
770,419,40,714,5,770


In [149]:
feats[["feature_category_id","feature_value_id","feature_idx"]].to_csv(os.path.join(processed_data,"feature_mapped.csv"), index=False)

In [150]:
feats

Unnamed: 0,index,feature_category_id,feature_value_id,value,feature_idx
0,636,56,365,19472,0
1,877,72,75,12582,1
2,715,61,706,12353,2
3,190,17,378,11939,3
4,742,63,861,11651,4
...,...,...,...,...,...
767,661,56,752,5,767
768,349,30,820,5,768
769,508,50,8,5,769
770,419,40,714,5,770


In [151]:
feats_map = feats[["feature_category_id","feature_value_id","feature_idx"]]

In [152]:
feats_map

Unnamed: 0,feature_category_id,feature_value_id,feature_idx
0,56,365,0
1,72,75,1
2,61,706,2
3,17,378,3
4,63,861,4
...,...,...,...
767,56,752,767
768,30,820,768
769,50,8,769
770,40,714,770


In [153]:
features_mapped=features.merge(feats_map,how="left",on=["feature_category_id","feature_value_id"])

In [154]:
features_category=features_mapped[["item_id","feature_category_id"]].copy()
features_category=features_category.drop_duplicates()

In [155]:
features_category=features_category.rename(columns={"feature_category_id":"feature_idx"})
features_category["feature_idx"]+=np.max(features_mapped["feature_idx"])
features_category.head()


Unnamed: 0,item_id,feature_idx
0,19021,827.0
1,19021,833.0
2,19021,839.0
3,19021,804.0
4,19021,843.0


In [156]:
features_category["feature_idx"]=features_category["feature_idx"].astype("int32")

In [157]:
features_mapped=features_mapped[["item_id","feature_idx"]]

In [158]:
features_mapped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471750 entries, 0 to 471749
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   item_id      471750 non-null  int64  
 1   feature_idx  471479 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 10.8 MB


In [159]:
features_category

Unnamed: 0,item_id,feature_idx
0,19021,827
1,19021,833
2,19021,839
3,19021,804
4,19021,843
...,...,...
471745,19020,839
471746,19020,826
471747,19020,782
471748,19020,844


In [160]:
features_mapped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471750 entries, 0 to 471749
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   item_id      471750 non-null  int64  
 1   feature_idx  471479 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 10.8 MB


In [161]:
features_mapped.dropna(inplace=True)

In [162]:
features_mapped=features_mapped[["item_id","feature_idx"]]
features_mapped["feature_idx"]=features_mapped["feature_idx"].astype("int32")

In [163]:
features_mapped.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 471479 entries, 0 to 471749
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   item_id      471479 non-null  int64
 1   feature_idx  471479 non-null  int32
dtypes: int32(1), int64(1)
memory usage: 9.0 MB


In [164]:
features_mapped.to_csv(os.path.join(processed_data,"simplified_features.csv"),index=False)

In [165]:
pd.concat([features_mapped,features_category]).to_csv(os.path.join(processed_data,"simplified_features_and_categories.csv"),index=False)