In [1]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
pip install lightgbm

In [None]:
pip install catboost

In [2]:
import pandas as pd 
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report

import xgboost as xgb

import catboost as cat



In [3]:
df_order_jan = pd.read_csv("orders_before_jan_labeled.csv")
df_order_jan['label'].value_counts()

0    882569
4      8883
3      6846
2      4522
1      3061
Name: label, dtype: int64

In [6]:
df_order_dec = pd.read_csv("orders_before_dec_labeled.csv")
df_order_dec['label'].value_counts()

0    748068
4      7304
3      5644
2      3654
1      2461
Name: label, dtype: int64

In [4]:
# unbalanced data remove extra 0s 
df_10000_0 = df_order_jan.loc[df_order_jan['label']==0].sample(n=10000, axis=0, random_state=1)
df_balanced_jan = df_10000_0.append(df_order_jan.loc[df_order_jan['label']!=0])
df_balanced_jan['label'].value_counts()

0    10000
4     8883
3     6846
2     4522
1     3061
Name: label, dtype: int64

In [7]:
# unbalanced data remove extra 0s 
df_10000_0 = df_order_dec.loc[df_order_jan['label']==0].sample(n=10000, axis=0, random_state=1)
df_balanced_dec = df_10000_0.append(df_order_dec.loc[df_order_dec['label']!=0])
df_balanced_dec['label'].value_counts()

0    10000
4     7304
3     5644
2     3654
1     2461
Name: label, dtype: int64

In [5]:
# load other dataset
df_cat_hierarchy = pd.read_csv('category_hierarchy.csv', delimiter='|',error_bad_lines=False)
df_items = pd.read_csv('items.csv',sep='|',error_bad_lines=False)
# fill NA values in category column -- 4300 is the category with null values  不太好，需探索missing的原因，不存在itemid相同但categories不同的
df_items["categories"] = df_items["categories"].fillna('[4300]')
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]"
1,28640,1366,10,1,537,0,101,[4300]
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3..."
3,21399,1090,10,1,511,0,0,[3270]
4,8504,768,4,1,484,0,66,[2470]
...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,..."
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]"


In [9]:
df_items.loc[df_items['categories'].str.contains('4300')]

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
1,28640,1366,10,1,537,0,101,[4300]
5,32122,5,4,1,491,0,66,[4300]
16,366,1366,10,1,537,0,101,[4300]
18,4119,167,4,1,466,0,-1,[4300]
20,14612,1277,4,1,491,0,66,[4300]
...,...,...,...,...,...,...,...,...
32744,14185,1246,6,2,536,0,46,[4300]
32747,19077,613,10,2,507,0,17,[4300]
32748,29565,1284,4,1,474,3,-1,[4300]
32764,21234,1366,10,1,537,0,101,[4300]


In [17]:
df_items.loc[df_items['itemID'].isin([28640, 32122, 366, 4119, 14612])]

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
1,28640,1366,10,1,537,0,101,[4300]
5,32122,5,4,1,491,0,66,[4300]
16,366,1366,10,1,537,0,101,[4300]
18,4119,167,4,1,466,0,-1,[4300]
20,14612,1277,4,1,491,0,66,[4300]


In [18]:
df_items.loc[df_items['brand'].isin([1366, 5, 167, 1277])]

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories
1,28640,1366,10,1,537,0,101,[4300]
5,32122,5,4,1,491,0,66,[4300]
16,366,1366,10,1,537,0,101,[4300]
17,17432,1277,4,1,491,0,144,[1807]
18,4119,167,4,1,466,0,-1,[4300]
...,...,...,...,...,...,...,...,...
32448,31081,5,4,2,491,0,66,[4300]
32489,26292,5,4,2,491,0,66,[4300]
32543,4760,5,4,1,491,0,66,[4300]
32764,21234,1366,10,1,537,0,101,[4300]


In [19]:
# transform categories from str to int list
df_items['categories'] = df_items['categories'].map(lambda x: list(map(int,x.split("[")[1].split("]")[0].split(","))))

In [20]:
# from categories list generate parent_categories
df_indexed = df_cat_hierarchy.set_index(['category'])
df_indexed

Unnamed: 0_level_0,parent_category
category,Unnamed: 1_level_1
0,75
1,1499
2,1082
3,3498
4,1623
...,...
4295,3898
4296,3898
4297,3898
4298,3898


In [74]:
flat_hierachy = df_indexed.copy()
flat_hierachy['layer3'] = df_indexed['parent_category'].map(lambda x: df_indexed.loc[x]['parent_category'])
flat_hierachy['layer4'] = flat_hierachy['layer3'].map(lambda x: df_indexed.loc[x]['parent_category'])


flat_hierachy['layer4'].value_counts()

TypeError: unhashable type: 'Series'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'Series'


3898            3097
2793             109
252               93
2346              81
3624              71
                ... 
3174               1
[3898, 3898]       1
[875, 3898]        1
[3898, 1062]       1
4300               1
Name: layer4, Length: 112, dtype: int64

In [21]:
df_indexed.loc[[2890, 855, 3908, 3909]]['parent_category']

category
2890    2832
855     1178
3908    3898
3909    3898
Name: parent_category, dtype: int64

In [22]:
# set parent of null as null
df_indexed.loc[4300] = [4300]
df_indexed

Unnamed: 0_level_0,parent_category
category,Unnamed: 1_level_1
0,75
1,1499
2,1082
3,3498
4,1623
...,...
4296,3898
4297,3898
4298,3898
4299,3898


In [23]:
# parent_categories generate
df_items['parent_categories'] = df_items['categories'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())

In [24]:
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]"
1,28640,1366,10,1,537,0,101,[4300],[4300]
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ..."
3,21399,1090,10,1,511,0,0,[3270],[1420]
4,8504,768,4,1,484,0,66,[2470],[2566]
...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,..."
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]"


In [25]:
# grandparent_categories generate
df_items['grandparent_categories'] = df_items['parent_categories'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())

In [50]:
# grand_grand_par_categories generate
df_items['grand_grand_par_categories'] = df_items['grandparent_categories'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())

In [33]:
# get unique grandparent
df_items['grandparent_unique'] = df_items['grandparent_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,grandparent_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]","{3898, 2012, 2838}"
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...","{600, 3898, 2364}"
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],{2364}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],{1072}
...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...","{600, 3898, 2364}"
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...","{2928, 3898, 1072, 2566}"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]","{3898, 1735}"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]","{1072, 3898, 2928}"


In [51]:
# get unique grandgrandparent
df_items['grand_grand_par_unique'] = df_items['grand_grand_par_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,grandparent_unique,grand_grand_par_categories,grand_grand_par_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]","{3898, 2012, 2838}","[3898, 3898, 3898, 3898]",{3898}
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],{4300},[4300],{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898}
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],{2364},[3898],{3898}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],{1072},[3898],{3898}
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898}
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...","{2928, 3898, 1072, 2566}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 107...","{1072, 3898}"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]","{3898, 1735}","[3898, 3898, 3898, 3898]",{3898}
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]","{1072, 3898, 2928}","[3898, 3898, 3898, 3898, 3898, 3898, 3898]",{3898}


In [60]:
# grand_3_par_categories generate
df_items['grand_3_par_categories'] = df_items['grand_grand_par_unique'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())
df_items['grand_3_par_unique'] = df_items['grand_3_par_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,grandparent_unique,grand_grand_par_categories,grand_grand_par_unique,grand_3_par_categories,grand_3_par_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]","{3898, 2012, 2838}","[3898, 3898, 3898, 3898]",{3898},[3898],{3898}
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],{4300},[4300],{4300},[4300],{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898}
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],{2364},[3898],{3898},[3898],{3898}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],{1072},[3898],{3898},[3898],{3898}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898}
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...","{2928, 3898, 1072, 2566}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 107...","{1072, 3898}","[3898, 3898]",{3898}
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]","{3898, 1735}","[3898, 3898, 3898, 3898]",{3898},[3898],{3898}
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]","{1072, 3898, 2928}","[3898, 3898, 3898, 3898, 3898, 3898, 3898]",{3898},[3898],{3898}


In [75]:
# grand_4_par_categories generate
df_items['grand_4_par_categories'] = df_items['grand_3_par_unique'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())
df_items['grand_4_par_unique'] = df_items['grand_4_par_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,grandparent_unique,grand_grand_par_categories,grand_grand_par_unique,grand_3_par_categories,grand_3_par_unique,grand_4_par_categories,grand_4_par_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]","{3898, 2012, 2838}","[3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898}
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898},[3898],{3898}
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],{2364},[3898],{3898},[3898],{3898},[3898],{3898}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],{1072},[3898],{3898},[3898],{3898},[3898],{3898}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898},[3898],{3898}
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...","{2928, 3898, 1072, 2566}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 107...","{1072, 3898}","[3898, 3898]",{3898},[3898],{3898}
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]","{3898, 1735}","[3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898}
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]","{1072, 3898, 2928}","[3898, 3898, 3898, 3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898}


In [80]:
# grand_5_par_categories generate
df_items['grand_5_par_categories'] = df_items['grand_4_par_unique'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())
df_items['grand_5_par_unique'] = df_items['grand_5_par_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,grandparent_unique,grand_grand_par_categories,grand_grand_par_unique,grand_3_par_categories,grand_3_par_unique,grand_4_par_categories,grand_4_par_unique,grand_5_par_categories,grand_5_par_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]","{3898, 2012, 2838}","[3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898}
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898},[3898],{3898},[3898],{3898}
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],{2364},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],{1072},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...","{600, 3898, 2364}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...","{2928, 3898, 1072, 2566}","[3898, 3898, 3898, 3898, 3898, 3898, 3898, 107...","{1072, 3898}","[3898, 3898]",{3898},[3898],{3898},[3898],{3898}
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]","{3898, 1735}","[3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]","{1072, 3898, 2928}","[3898, 3898, 3898, 3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898}


In [83]:
# grand_6_par_categories generate
df_items['grand_6_par_categories'] = df_items['grand_5_par_unique'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())
df_items['grand_6_par_unique'] = df_items['grand_6_par_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,...,grand_grand_par_categories,grand_grand_par_unique,grand_3_par_categories,grand_3_par_unique,grand_4_par_categories,grand_4_par_unique,grand_5_par_categories,grand_5_par_unique,grand_6_par_categories,grand_6_par_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]",...,"[3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],...,[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...",...,"[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...",...,"[3898, 3898, 3898, 3898, 3898, 3898, 3898, 389...",{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...",...,"[3898, 3898, 3898, 3898, 3898, 3898, 3898, 107...","{1072, 3898}","[3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]",...,"[3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]",...,"[3898, 3898, 3898, 3898, 3898, 3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}


In [86]:
# grand_7_par_categories generate
df_items['grand_7_par_categories'] = df_items['grand_6_par_unique'].map(lambda x : df_indexed.loc[x]['parent_category'].tolist())
df_items['grand_7_par_unique'] = df_items['grand_7_par_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,...,grand_3_par_categories,grand_3_par_unique,grand_4_par_categories,grand_4_par_unique,grand_5_par_categories,grand_5_par_unique,grand_6_par_categories,grand_6_par_unique,grand_7_par_categories,grand_7_par_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]",...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],...,[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...",...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...",...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...",...,"[3898, 3898]",{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]",...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]",...,[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898}


In [87]:
len_grand_7parent_set = df_items['grand_7_par_unique'].map(lambda x: len(x))
max(len_grand_7parent_set)

1

In [88]:
df_items['grand_7_par_unique'].value_counts()

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{3898}    25988
{4300}     6788
Name: grand_7_par_unique, dtype: int64

In [84]:
len_grand_6parent_set = df_items['grand_6_par_unique'].map(lambda x: len(x))
max(len_grand_6parent_set)

2

In [85]:
df_items['grand_6_par_unique'].value_counts()

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{3898}          25859
{4300}           6788
{3624, 3898}      129
Name: grand_6_par_unique, dtype: int64

In [81]:
len_grand_5parent_set = df_items['grand_5_par_unique'].map(lambda x: len(x))
max(len_grand_5parent_set)

2

In [82]:
df_items['grand_5_par_unique'].value_counts()

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{3898}          25859
{4300}           6788
{841, 3898}        60
{3898, 894}        27
{3898, 2087}       23
{3898, 834}        19
Name: grand_5_par_unique, dtype: int64

In [78]:
len_grand_4parent_set = df_items['grand_4_par_unique'].map(lambda x: len(x))
max(len_grand_4parent_set)

3

In [79]:
df_items['grand_4_par_unique'].value_counts()

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{3898}                25406
{4300}                 6788
{3624, 3898}            411
{3898, 2549}             59
{2346, 3898}             36
{3898, 181}              23
{3898, 2043}             19
{1562, 3898}             17
{3624, 1562, 3898}       10
{3624}                    5
{3624, 3898, 2549}        1
{2346}                    1
Name: grand_4_par_unique, dtype: int64

In [61]:
len_grand_3parent_set = df_items['grand_3_par_unique'].map(lambda x: len(x))
max(len_grand_3parent_set)

5

In [62]:
df_items['grand_3_par_unique'].value_counts()

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{3898}                     24020
{4300}                      6788
{3898, 2346}                 715
{3898, 252}                  204
{2793, 3898}                 170
                           ...  
{1216, 3898, 671}              1
{2793, 2346, 3898}             1
{3898, 3549, 1559}             1
{2346, 2395, 3898}             1
{3624, 3898, 894, 1216}        1
Name: grand_3_par_unique, Length: 65, dtype: int64

In [52]:
len_grand_grandparent_set = df_items['grand_grand_par_unique'].map(lambda x: len(x))
max(len_grand_grandparent_set)

6

In [58]:
df_items['grand_grand_par_unique'].map(lambda x: len(x)).value_counts()

1    24882
2     6737
3     1027
4      108
5       18
6        4
Name: grand_grand_par_unique, dtype: int64

In [63]:
df_items['grand_grand_par_unique'].value_counts()

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{3898}                                16394
{4300}                                 6788
{34}                                    939
{3898, 252}                             693
{3898, 1677}                            630
                                      ...  
{2369, 3898, 875}                         1
{225, 3624, 2266, 1559, 3898, 223}        1
{3898, 1559, 223}                         1
{2497, 3624, 2793, 2346, 3898}            1
{2564, 3624, 2701, 1562, 3898}            1
Name: grand_grand_par_unique, Length: 324, dtype: int64

In [39]:
len_grandparent_set = df_items['grandparent_unique'].map(lambda x: len(x))
max(len_grandparent_set)

15

In [59]:
df_items['grandparent_unique'].map(lambda x: len(x)).value_counts()

1     16432
2      9081
3      4613
4      1790
5       559
6       167
7        88
8        26
9        12
10        4
12        2
15        1
11        1
Name: grandparent_unique, dtype: int64

In [64]:
df_items['grandparent_unique'].value_counts()

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{4300}                                        6788
{3898}                                        5040
{3898, 2364}                                   980
{34}                                           903
{3210}                                         888
                                              ... 
{1094, 1321, 2793, 3691, 1557, 2231, 3898}       1
{2793, 3898, 1557, 3735}                         1
{2793, 3898, 1557, 1094}                         1
{1480, 2364}                                     1
{2928, 3898, 2723, 686}                          1
Name: grandparent_unique, Length: 2385, dtype: int64

In [42]:
len_parent_set = df_items['parent_categories'].map(lambda x: set(x)).map(lambda x: len(x))
max(len_parent_set)

22

# all categories list can be converged into one. parent of all is 3898. 

In [90]:
df_items['parent_unique'] = df_items['parent_categories'].map(lambda x: set(x))
df_items

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,categories,parent_categories,grandparent_categories,...,grand_3_par_unique,grand_4_par_categories,grand_4_par_unique,grand_5_par_categories,grand_5_par_unique,grand_6_par_categories,grand_6_par_unique,grand_7_par_categories,grand_7_par_unique,parent_unique
0,22665,861,4,0,490,2,66,"[2890, 855, 3908, 3909]","[2832, 1178, 3898, 3898]","[2838, 2012, 3898, 3898]",...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},"{2832, 1178, 3898}"
1,28640,1366,10,1,537,0,101,[4300],[4300],[4300],...,{4300},[4300],{4300},[4300],{4300},[4300],{4300},[4300],{4300},{4300}
2,13526,1090,10,0,511,0,0,"[3270, 163, 284, 1694, 12, 3837, 2422, 3595, 3...","[1420, 3860, 600, 600, 3241, 3241, 3241, 600, ...","[2364, 3898, 3898, 3898, 600, 600, 600, 3898, ...",...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},"{3241, 1420, 3860, 600, 3898}"
3,21399,1090,10,1,511,0,0,[3270],[1420],[2364],...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},{1420}
4,8504,768,4,1,484,0,66,[2470],[2566],[1072],...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},{2566}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"[3502, 1088, 2004, 691, 3422, 2308, 308, 3688,...","[3860, 458, 458, 458, 458, 322, 600, 600, 600,...","[3898, 2364, 2364, 2364, 2364, 3898, 3898, 389...",...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},"{322, 3241, 458, 3860, 600}"
32772,22709,567,4,2,491,3,66,"[3900, 3905, 3903, 3910, 1807, 274, 1467, 914]","[3898, 3898, 3898, 3898, 1072, 2920, 2475, 3565]","[3898, 3898, 3898, 3898, 3898, 3898, 2928, 256...",...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},"{2920, 2475, 3565, 1072, 3898}"
32773,32322,1385,10,1,500,0,37,"[2686, 2609, 725]","[2917, 1175, 725, 1735]","[3898, 1735, 1735, 3898]",...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},"{725, 1735, 2917, 1175}"
32774,19118,1113,6,2,491,0,117,"[3900, 3910, 3903, 2473, 914, 274]","[3898, 3898, 3898, 2566, 3565, 2920]","[3898, 3898, 3898, 1072, 1072, 3898, 2928]",...,{3898},[3898],{3898},[3898],{3898},[3898],{3898},[3898],{3898},"{2920, 3898, 3565, 2566}"


In [116]:
# converge untill it firstly reach to one
def f1(x):
    if len(x)==1:
        return x[0]
def f2(x):
    if len(x)==1:
        return list(x)[0]


df_items_category_one = df_items.copy()
df_items_category_one = df_items_category_one.drop(['parent_categories', 'grandparent_categories', 'grand_3_par_categories', 'grand_4_par_categories', 'grand_5_par_categories', 'grand_6_par_categories', 'grand_7_par_categories'], axis=1)
df_items_category_one['category'] = df_items_category_one['categories'].map(lambda x : f1(x))
print(df_items_category_one['category'].isnull().sum())
print("=============")
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['parent_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['grandparent_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['grand_grand_par_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['grand_3_par_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['grand_4_par_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['grand_5_par_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['grand_6_par_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())
df_items_category_one['category'] = df_items_category_one['category'].fillna(df_items_category_one['grand_7_par_unique'].map(lambda x : f2(x)))
print("=============")
print(df_items_category_one['category'].isnull().sum())



19424
19025
16255
7887
1896
576
129
129
0


In [127]:
df_items_category_integrated = df_items_category_one.drop(['categories', 'grandparent_unique', 'grand_grand_par_categories', 'grand_grand_par_unique', 'grand_3_par_unique', 'grand_4_par_unique', 'grand_5_par_unique', 'grand_6_par_unique', 'grand_7_par_unique', 'parent_unique'], axis=1)
df_items_category_integrated['category'] = df_items_category_integrated['category'].astype(int)
df_items_category_integrated['category'].value_counts()

3898    18975
4300     6788
1807      346
2136      311
300       265
        ...  
3585        1
2997        1
779         1
489         1
440         1
Name: category, Length: 700, dtype: int64

In [128]:
df_items_category_integrated

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,category
0,22665,861,4,0,490,2,66,3898
1,28640,1366,10,1,537,0,101,4300
2,13526,1090,10,0,511,0,0,3898
3,21399,1090,10,1,511,0,0,3270
4,8504,768,4,1,484,0,66,2470
...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,3898
32772,22709,567,4,2,491,3,66,3898
32773,32322,1385,10,1,500,0,37,3898
32774,19118,1113,6,2,491,0,117,3898


In [130]:
# Joining df_orders_dec dataframe and df_items_category_integrated dataframe
merge_dec = pd.merge(df_items_category_integrated, df_balanced_dec, how='inner', on='itemID')
#change the date coulumn's datayte to datetime datatype
merge_dec['date'] = pd.to_datetime(merge_dec['date'])
merge_dec = merge_dec.set_index('date')
merge_dec

Unnamed: 0_level_0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,category,userID,order,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-11-26,13526,1090,10,0,511,0,0,3898,33539,1,0
2020-11-25,13526,1090,10,0,511,0,0,3898,32226,1,0
2020-11-17,13526,1090,10,0,511,0,0,3898,18945,1,0
2020-08-31,13526,1090,10,0,511,0,0,3898,3559,1,0
2020-08-15,13526,1090,10,0,511,0,0,3898,10210,1,0
...,...,...,...,...,...,...,...,...,...,...,...
2020-09-30,1128,827,6,0,364,0,132,440,9240,1,1
2020-10-25,1128,827,6,0,364,0,132,440,21622,1,4
2020-11-25,13822,993,4,0,491,0,144,248,11606,1,4
2020-11-18,4183,96,4,1,474,0,-1,3898,21102,1,1


In [145]:
merge_dec.to_csv("filtered_dataset/dec_category_labeled.csv")

In [132]:
# Joining df_orders_dec dataframe and df_items_category_integrated dataframe
merge_jan = pd.merge(df_items_category_integrated, df_balanced_jan, how='inner', on='itemID')
#change the date coulumn's datayte to datetime datatype
merge_jan['date'] = pd.to_datetime(merge_jan['date'])
merge_jan = merge_jan.set_index('date')
merge_jan

Unnamed: 0_level_0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,category,userID,order,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-12-08,13526,1090,10,0,511,0,0,3898,36624,1,0
2020-12-06,13526,1090,10,0,511,0,0,3898,27527,1,0
2020-10-18,13526,1090,10,0,511,0,0,3898,7271,1,1
2020-11-12,13526,1090,10,0,511,0,0,3898,33269,1,2
2020-12-19,13526,1090,10,0,511,0,0,3898,39232,2,3
...,...,...,...,...,...,...,...,...,...,...,...
2020-12-06,1128,827,6,0,364,0,132,440,7765,1,1
2020-11-25,13822,993,4,0,491,0,144,248,11606,1,4
2020-08-14,4183,96,4,1,474,0,-1,3898,41747,1,0
2020-11-18,4183,96,4,1,474,0,-1,3898,21102,1,1


In [144]:
merge_jan.to_csv("filtered_dataset/jan_category_labeled.csv")

In [139]:
# change 4300->3898
merge_jan['category'].replace(4300, 3898, inplace=True)
merge_jan

Unnamed: 0_level_0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,category,userID,order,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-12-08,13526,1090,10,0,511,0,0,3898,36624,1,0
2020-12-06,13526,1090,10,0,511,0,0,3898,27527,1,0
2020-10-18,13526,1090,10,0,511,0,0,3898,7271,1,1
2020-11-12,13526,1090,10,0,511,0,0,3898,33269,1,2
2020-12-19,13526,1090,10,0,511,0,0,3898,39232,2,3
...,...,...,...,...,...,...,...,...,...,...,...
2020-09-13,13635,1260,6,0,342,0,29,3898,45422,1,3
2020-12-19,13635,1260,6,0,342,0,29,3898,40142,1,2
2020-08-14,4183,96,4,1,474,0,-1,3898,41747,1,0
2020-11-18,4183,96,4,1,474,0,-1,3898,21102,1,1


In [143]:
merge_dec['category'].replace(4300, 3898, inplace=True)
merge_dec

Unnamed: 0_level_0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,category,userID,order,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-11-26,13526,1090,10,0,511,0,0,3898,33539,1,0
2020-11-25,13526,1090,10,0,511,0,0,3898,32226,1,0
2020-11-17,13526,1090,10,0,511,0,0,3898,18945,1,0
2020-08-31,13526,1090,10,0,511,0,0,3898,3559,1,0
2020-08-15,13526,1090,10,0,511,0,0,3898,10210,1,0
...,...,...,...,...,...,...,...,...,...,...,...
2020-09-30,1128,827,6,0,364,0,132,440,9240,1,1
2020-10-25,1128,827,6,0,364,0,132,440,21622,1,4
2020-11-25,13822,993,4,0,491,0,144,248,11606,1,4
2020-11-18,4183,96,4,1,474,0,-1,3898,21102,1,1


In [47]:
# cut from grandparent categories
# df_items_del_lists = df_items.drop(['categories', 'parent_categories', 'grandparent_categories'], axis=1)
# df_items_del_lists
# df_items_del_lists

Unnamed: 0,itemID,brand,feature_1,feature_2,feature_3,feature_4,feature_5,grandparent_unique
0,22665,861,4,0,490,2,66,"{3898, 2012, 2838}"
1,28640,1366,10,1,537,0,101,{4300}
2,13526,1090,10,0,511,0,0,"{600, 3898, 2364}"
3,21399,1090,10,1,511,0,0,{2364}
4,8504,768,4,1,484,0,66,{1072}
...,...,...,...,...,...,...,...,...
32771,12036,1383,10,0,503,0,101,"{600, 3898, 2364}"
32772,22709,567,4,2,491,3,66,"{2928, 3898, 1072, 2566}"
32773,32322,1385,10,1,500,0,37,"{3898, 1735}"
32774,19118,1113,6,2,491,0,117,"{1072, 3898, 2928}"


In [None]:
# Joining df_orders_jan dataframe and df_items dataframe
# merge_jan = pd.merge(df_items, df_balanced_jan, how='inner', on='itemID')
# #change the date coulumn's datayte to datetime datatype
# merge_jan['date'] = pd.to_datetime(merge_jan['date'])
# merge_jan = merge_jan.set_index('date')

In [None]:
# merge_jan_no_cat = merge_jan.drop(columns=['categories', 'parent_categories'])

In [None]:
y = merge_jan_no_cat['label']
X = merge_jan_no_cat.drop(columns='label')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
X_train

In [None]:
# upload submission dataset
sub_jan = pd.read_csv("submission_jan.csv", sep="|")
sub_jan

In [None]:
# baseline(didn't consider time series)--xgboost
def xgb_train(X_train, y_train, X_test, y_test, verbose=True):
    model_xgb = xgb.XGBClassifier(
        max_depth=10, # raw8
        n_estimators=1000,
        min_child_weight=300, 
        colsample_bytree=0.8, 
        subsample=0.8, 
        eta=0.3,    
        seed=42        
    )

    model_xgb.fit(
        X_train, 
        y_train,
        eval_metric='auc',
        eval_set=[(X_train, y_train), (X_test, y_test)],
        verbose=verbose,
        early_stopping_rounds=10 
    )
    print(model_xgb.best_score)
    return model_xgb

In [None]:
model_xgb = xgb_train(X_train, y_train, X_test, y_test, verbose=False)

In [None]:
pre_merge = pd.merge(sub_jan.drop(columns='prediction'), X, on=['userID','itemID'], how="inner")
prediction = model_xgb.predict(pre_merge)
sub_xgb = sub_jan.copy()
sub_xgb['prediction'] = pd.Series(prediction)
# user and items haven't shown up predict 0
sub_xgb['prediction'] = sub_xgb['prediction'].fillna(0)
# submission.to_csv('submission_xgb.csv', index=False)
sub_xgb['prediction'].value_counts()

In [None]:
sub_xgb.to_csv('submission_xgb.csv')

In [None]:
# baseline1: fill 1
sub_jan_all1 = sub_jan.copy()
sub_jan_all1['prediction'] = sub_jan['prediction'].fillna(1)
sub_jan_all1.to_csv('submission_all1.csv')


In [None]:
# gold dataset
result_for_jan = pd.merge(sub_jan.drop(columns='prediction'), merge_jan, on=['userID','itemID'], how="inner")
gold = result_for_jan[['userID','itemID','label']]
gold.rename(columns={'label':'prediction'}, inplace=True)

In [None]:
gold

In [None]:
def count_points(pred, gold):
    df = pd.merge(pred, gold, on=['userID', 'itemID'], suffixes=('_pred', '_gold'))
    df['points'] = df.apply(_compute_points_for_row, axis=1)
    return df['points'].sum()

def _compute_points_for_row(row):
    y_pred, y_gold = row.prediction_pred, row.prediction_gold
    if y_pred == y_gold:
        # one point if "no order" (0) is predicted correctly; three points if order week is predicted correctly
        return 1 if y_pred == 0 else 3
    # one point if order is predicted correctly (but not the correct week), otherwise zero points
    return 1 if (y_pred > 0 and y_gold > 0) else 0

In [None]:
# bonus for all 1 model
pred, gold = sub_jan_all1, gold  # TODO: load your prediction and goldstandard

points = count_points(pred, gold)
max_points = count_points(gold, gold)
score = points / max_points
print(points)
print(max_points)
print(score)

In [None]:
# bonus for xgboost model
pred, gold = sub_xgb, gold  # TODO: load your prediction and goldstandard

points = count_points(pred, gold)
max_points = count_points(gold, gold)
score = points / max_points
print(points)
print(max_points)
print(score)

In [None]:
# baseline: random forest
# from sklearn.metrics import accuracy_score

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# rf_pred=rf.predict(X_test)
# rf_acc=rf.score(X_test, y_test)
# rf_acc