https://www.kaggle.com/code/thitchen/walmart

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import StratifiedShuffleSplit
import xgboost as xgb
import datetime
from pandas.api.types import CategoricalDtype

In [2]:
train = pd.read_csv('/Users/young/dataset_local/walmart-recruiting-trip-type-classification/train.csv',dtype=({'Upc':object,'FinelineNumber':object,'ScanCount':np.int16,'TripType':np.int16,'VisitNumber':np.int32}))
test = pd.read_csv('/Users/young/dataset_local/walmart-recruiting-trip-type-classification/test.csv',dtype=({'Upc':object,'FinelineNumber':object,'ScanCount':np.int16,'TripType':np.int16,'VisitNumber':np.int32}))

In [3]:
test['TripType'] = 0
test['TripType']

0         0
1         0
2         0
3         0
4         0
         ..
653641    0
653642    0
653643    0
653644    0
653645    0
Name: TripType, Length: 653646, dtype: int64

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647054 entries, 0 to 647053
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   TripType               647054 non-null  int16 
 1   VisitNumber            647054 non-null  int32 
 2   Weekday                647054 non-null  object
 3   Upc                    642925 non-null  object
 4   ScanCount              647054 non-null  int16 
 5   DepartmentDescription  645693 non-null  object
 6   FinelineNumber         642925 non-null  object
dtypes: int16(2), int32(1), object(4)
memory usage: 24.7+ MB


In [5]:
#훈련, 검증 데이터 유니크 분류
tr_VisitNumber = list(train.VisitNumber.unique())
te_VisitNumber = list(test.VisitNumber.unique())

In [6]:
#널 처리
train.Upc.fillna('0',inplace=True)
train.DepartmentDescription.fillna('NA',inplace=True)
train.FinelineNumber.fillna('-1',inplace=True)
test.Upc.fillna('0',inplace=True)
test.DepartmentDescription.fillna('NA',inplace=True)
test.FinelineNumber.fillna('-1',inplace=True)


In [7]:
#데이터타입 정의
train.Upc = train.Upc.astype(np.int64)
train.FinelineNumber = train.FinelineNumber.astype(np.int16)
train.ScanCount = train.ScanCount.astype(np.int32)
test.Upc = test.Upc.astype(np.int64)
test.FinelineNumber = test.FinelineNumber.astype(np.int16)
test.ScanCount = test.ScanCount.astype(np.int32)

In [8]:
#finelinenumber = 연관 상품 군집 번호
#오프라인 매장 특성상 visitnumber는 입장 순서대로 찍힘을 감안하여 정의한듯.
train.sort_values(by='VisitNumber',inplace=True)
train[train['DepartmentDescription']=='PHARMACY RX']


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
1155,44,496,Friday,0,1,PHARMACY RX,-1
1216,5,521,Friday,0,1,PHARMACY RX,-1
1373,5,585,Friday,0,1,PHARMACY RX,-1
1456,5,619,Friday,0,1,PHARMACY RX,-1
1457,5,619,Friday,0,1,PHARMACY RX,-1
...,...,...,...,...,...,...,...
636715,5,188839,Sunday,0,1,PHARMACY RX,-1
636716,5,188839,Sunday,0,1,PHARMACY RX,-1
636717,5,188839,Sunday,0,1,PHARMACY RX,-1
636847,5,188896,Sunday,0,1,PHARMACY RX,-1


In [9]:
train_visitnumber_triptype = train.groupby(['VisitNumber']).agg({'TripType': 'first'}).reset_index()
train_visitnumber_triptype

Unnamed: 0,VisitNumber,TripType
0,5,999
1,7,30
2,8,26
3,9,8
4,10,8
...,...,...
95669,191343,25
95670,191344,22
95671,191345,39
95672,191346,39


In [10]:
# The train and test dataframes are conbined together, so the features are dealt together
#훈련, 검증데이터에 피쳐를 동일하게 생성하여 분류에 사용할 것이기에, 합쳐서 피쳐 엔지니어링 진행하는듯.
train_test = pd.concat([train,test], ignore_index=True).sort_values('VisitNumber')
train_test.TripType = train_test.TripType.astype(np.int16)
train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1300700 entries, 647054 to 1300699
Data columns (total 7 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   TripType               1300700 non-null  int16 
 1   VisitNumber            1300700 non-null  int32 
 2   Weekday                1300700 non-null  object
 3   Upc                    1300700 non-null  int64 
 4   ScanCount              1300700 non-null  int32 
 5   DepartmentDescription  1300700 non-null  object
 6   FinelineNumber         1300700 non-null  int16 
dtypes: int16(2), int32(2), int64(1), object(2)
memory usage: 54.6+ MB


In [11]:
# Calculate the total number of items purchased per each visit
sCountPerVisit = train_test.groupby(['VisitNumber']).agg({'ScanCount': 'sum'})\
.rename(columns={'ScanCount': 'SCountPerVisit'}).reset_index()
sCountPerVisit.head()

Unnamed: 0,VisitNumber,SCountPerVisit
0,1,4
1,2,4
2,3,0
3,4,1
4,5,-1


In [12]:
# Calculate the total number of items per visit per Upc
sCountPerVisitPerUpc = train_test.groupby(['VisitNumber', 'Upc']).agg({'ScanCount': 'sum'})\
.rename(columns={'ScanCount': 'SCountPerVisitPerUpc'}).reset_index()
sCountPerVisitPerUpc.head()


Unnamed: 0,VisitNumber,Upc,SCountPerVisitPerUpc
0,1,1707710732,1
1,1,72503389714,1
2,1,88491211470,1
3,1,89470001026,1
4,2,2840015224,1


In [13]:
sCountPerVisitPerUpc.shape
sCountPerVisit.shape

(191348, 2)

In [14]:
# Convert weekday to number

wdict = {'Monday':1,
        'Tuesday':2,
        'Wednesday':3,
        'Thursday':4,
        'Friday':5,
        'Saturday':6,
        'Sunday':7}

train_test['wd'] = train_test.Weekday.apply(lambda x: wdict[x])

remove garbage such as triptype 999

In [15]:
# netout_visits include visits which the TotalItems <= 0
#scan이 없는 값은 쓰레기로 분류하여 걸러냄.
netout_visits = list(sCountPerVisit[sCountPerVisit.SCountPerVisit <= 0]['VisitNumber'])
# In the train set, these visits types are almost always 999
#이러한 방문의 경우 월마트에서 999로 처리됨을 추측할 수 있음. 따라서 버린다.
train_netout_visits = train_visitnumber_triptype[train_visitnumber_triptype.VisitNumber.isin(netout_visits)]
print(np.count_nonzero(train_netout_visits.TripType == 999) / train_netout_visits.shape[0] *100,'% of these visits are of TripType 999')

99.93546305259761 % of these visits are of TripType 999


In [16]:
# someout_visits include visits where the total ScanCount of some Upc items is less than 0
someout_visits = list(sCountPerVisitPerUpc[sCountPerVisitPerUpc.SCountPerVisitPerUpc < 0]['VisitNumber'])
# In the train set, these visits types are almost always 999
train_someout_visits = train_visitnumber_triptype[train_visitnumber_triptype.VisitNumber.isin(someout_visits)]
print(np.count_nonzero(train_someout_visits.TripType == 999) / train_someout_visits.shape[0] *100,'% of these visits are of TripType 999')

99.80995819080198 % of these visits are of TripType 999


In [17]:
# Therefore it is safe to assign 999 to these visits, so they are removed for now.
train_test = train_test.query('VisitNumber != @netout_visits & VisitNumber != @someout_visits')
train_test.shape

(1274023, 8)

Add basic features

In [18]:
# Add columns 'Pos' and 'Neg'.
# They are correlated but are useful to mark those return records, before aggregating ScanCount
train_test['Pos'] = (train_test.ScanCount > 0).astype(np.int16)
train_test['Neg'] = (train_test.ScanCount < 0).astype(np.int16)

In [19]:
# Aggregate ScanCount
#ScanCount 열을 부호(양수, 음수 또는 0)에 매핑한 다음 정수 데이터 형식(np.int16)으로 변환합니다. 
# 따라서 반품 열은 항목이 반품되었는지(-1), 구매했는지(1) 또는 스캔되지 않았는지(0)를 나타냅
train_test = train_test.groupby(['VisitNumber',
 'Upc',
 'DepartmentDescription',
 'Weekday',
 'FinelineNumber',
 'TripType'], as_index=False).sum().sort_values('VisitNumber')
# Add column Return, which is the sign of ScanCount
train_test['Return'] = train_test.ScanCount.map(lambda x: np.sign(x)).astype(np.int16)


In [20]:
# Add time of the day as a fraction of one day, 0 is the first visit of the day, 1 means the last visit of the day
# 'first' is the first VisitNumber of the day
#visitnumber이 순서대로 찍히는 점을 이용해서, 날짜별 마지막 visitnumber를 활용해서 방문 시간 계산
train_test['first'] = (train_test
               .groupby((train_test.Weekday != train_test.Weekday.shift()).cumsum())
               .VisitNumber
               .transform('first'))
# 'last' is the last VisitNumber of the day
train_test['last'] =  (train_test
               .groupby((train_test.Weekday != train_test.Weekday.shift()).cumsum())
               .VisitNumber
               .transform('last'))
train_test['time_of_day'] = (train_test['VisitNumber'] - train_test['first'] + 1) / (train_test['last'] - train_test['first'] + 1)

In [21]:
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246433,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000
1246429,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000
1246428,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000
1246430,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000


In [22]:
# Create a column 'day_counter'. There are 31 days in total
#순서대로 쌓인 점을 이용해서, 행별로 Weekday가 이전 행과 달라지지 않았따면 같은날로, 이전행과 달라졌다면 +1일로 계산.
train_test['day_counter'] = (train_test.Weekday != train_test.Weekday.shift()).cumsum()

In [23]:
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day,day_counter
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161,1
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161,1
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161,1
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161,1
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246433,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000,31
1246429,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000,31
1246428,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000,31
1246430,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000,31


In [24]:
# Calculate the total number of visits per day, to be used later
visitsPerDay = train_test.groupby('day_counter').VisitNumber.apply(lambda x: len(np.unique(x))).reset_index()\
.rename(columns={'VisitNumber' :'VisitsPerDay'}).astype({'day_counter': np.int16, 'VisitsPerDay': np.int16})
visitsPerDay.head()

Unnamed: 0,day_counter,VisitsPerDay
0,1,5827
1,2,6243
2,3,6387
3,4,5663
4,5,5305


In [25]:
# Calculate the sum of ScanCount per day, to be used later
sCountPerDay = train_test.groupby('day_counter').agg({'ScanCount': 'sum'}).reset_index()\
.rename(columns={'ScanCount': 'SCountPerDay'}).astype({'day_counter': np.int16, 'SCountPerDay': np.int32})
sCountPerDay.head()

Unnamed: 0,day_counter,SCountPerDay
0,1,43752
1,2,52682
2,3,60889
3,4,46133
4,5,39576


In [26]:
# Add sCountPerVisit to train_test
train_test = pd.merge(train_test, sCountPerVisit, how='left', on=['VisitNumber'])
train_test = pd.merge(train_test, sCountPerVisitPerUpc, how='left', on=['VisitNumber', 'Upc'])

In [27]:
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day,day_counter,SCountPerVisit,SCountPerVisitPerUpc
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161,1,4,1
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161,1,4,1
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161,1,4,1
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161,1,4,1
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246430,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000,31,7,1
1246431,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000,31,7,1
1246432,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000,31,7,1
1246433,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000,31,7,1


In [28]:
# Calculate the ScanCount/SCountPerVisit per each Upc per visit
train_test['Div'] = np.where(train_test['ScanCount']==0, 0, train_test['ScanCount'] / train_test['SCountPerVisit'])

In [29]:
# Use domain knowledge, split Upc into 2 parts, part1 for factory code, part2 for item code (-1 for missing Upc's)
#UPC 바코드를 10만단위로 앞 뒤로 잘라 공장/소비자 바코드로 쪼개기. (미국 표준인듯)
train_test['Fac_Upc'] = np.where(train_test.Upc==0,-1,train_test.Upc//100000)
train_test['Item_Upc'] = np.where(train_test.Upc==0,-1,train_test.Upc%100000)
train_test.shape

(1246435, 20)

In [30]:
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day,day_counter,SCountPerVisit,SCountPerVisitPerUpc,Div,Fac_Upc,Item_Upc
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,17077,10732
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,725033,89714
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,884912,11470
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,894700,1026
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322,1,4,1,0.250000,28400,15224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246430,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,881813,90024
1246431,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,78715,35983
1246432,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,42825,57050
1246433,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,665721,5763


Calculate entropy

In [31]:
# Calculate entropy of Upc
#Ent_UPC = 구매 엔트로피를 계산한 컬럼. 상품의 다양성을 뜻함
tr_upc_ent = train_test[['VisitNumber', 'Upc', 'SCountPerVisit', 'ScanCount', 'DepartmentDescription']]\
.groupby(['VisitNumber', 'Upc'])\
.agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'}).reset_index()
tr_upc_ent['Div'] = tr_upc_ent['ScanCount'] / tr_upc_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    tr_upc_ent['Ent_Upc'] = np.where(tr_upc_ent['Div']==0, 0, tr_upc_ent['Div'] * np.log2(tr_upc_ent['Div']) * -1)

tr_upc_ent = tr_upc_ent.groupby('VisitNumber').agg({'Ent_Upc': np.sum}).reset_index()
tr_upc_ent.shape

(178578, 2)

In [32]:
tr_upc_ent

Unnamed: 0,VisitNumber,Ent_Upc
0,1,2.000000
1,2,2.000000
2,4,0.000000
3,7,1.000000
4,8,4.280395
...,...,...
178573,191344,2.321928
178574,191345,3.616875
178575,191346,4.087463
178576,191347,1.000000


In [33]:
# Calculate entropy of DepartmentDescription
tr_dept_ent = train_test[['VisitNumber', 'DepartmentDescription','SCountPerVisit', 'ScanCount']]\
.groupby(['VisitNumber','DepartmentDescription'])\
.agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'}).reset_index()
tr_dept_ent['Div'] = tr_dept_ent['ScanCount'] / tr_dept_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    tr_dept_ent['Ent_Dept'] = np.where(tr_dept_ent['Div']==0, 0, tr_dept_ent['Div'] * np.log2(tr_dept_ent['Div']) * -1)
tr_dept_ent = tr_dept_ent.groupby('VisitNumber').agg({'Ent_Dept': np.sum}).reset_index()
tr_dept_ent.shape

(178578, 2)

In [34]:
# Calculate entropy of FinelineNumber
tr_fln_ent = train_test[['VisitNumber', 'FinelineNumber', 'SCountPerVisit', 'ScanCount', 'DepartmentDescription']]\
.groupby(['VisitNumber', 'FinelineNumber'])\
.agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'}).reset_index()
tr_fln_ent['Div'] = tr_fln_ent['ScanCount'] / tr_fln_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    tr_fln_ent['Ent_Fln'] = np.where(tr_fln_ent['Div']==0, 0, tr_fln_ent['Div'] * np.log2(tr_fln_ent['Div']) * -1)
tr_fln_ent = tr_fln_ent.groupby('VisitNumber').agg({'Ent_Fln': np.sum}).reset_index()
tr_fln_ent.shape

(178578, 2)

In [35]:
tr_fln_ent

Unnamed: 0,VisitNumber,Ent_Fln
0,1,2.000000
1,2,1.500000
2,4,0.000000
3,7,1.000000
4,8,3.838804
...,...,...
178573,191344,1.370951
178574,191345,3.499228
178575,191346,3.969816
178576,191347,1.000000


In [36]:
# Calculate entropy of Fac_Upc
tr_fac_ent = train_test[['VisitNumber', 'Fac_Upc', 'SCountPerVisit', 'ScanCount']]\
.groupby(['VisitNumber', 'Fac_Upc'])\
.agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'}).reset_index()
tr_fac_ent['Div'] = tr_fac_ent['ScanCount'] / tr_fac_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    tr_fac_ent['Ent_Fac'] = np.where(tr_fac_ent['Div']==0, 0, tr_fac_ent['Div'] * np.log2(tr_fac_ent['Div']) * -1)
tr_fac_ent = tr_fac_ent.groupby('VisitNumber').agg({'Ent_Fac': np.sum}).reset_index()
tr_fac_ent.shape

(178578, 2)

Calculate the number of unique items

In [37]:
tr_uni_dept = train_test.groupby('VisitNumber')['DepartmentDescription'].apply(lambda x: len(np.unique(x))).reset_index()
tr_uni_dept.rename(columns={'DepartmentDescription': 'Uni_Dept'}, inplace=True)
tr_uni_dept

Unnamed: 0,VisitNumber,Uni_Dept
0,1,3
1,2,3
2,4,1
3,7,2
4,8,7
...,...,...
178573,191344,2
178574,191345,8
178575,191346,8
178576,191347,2


In [38]:
# Calculate the number of unique FinelineNumber
tr_uni_fln = train_test.groupby('VisitNumber')['FinelineNumber'].apply(lambda x: len(np.unique(x))).reset_index()
tr_uni_fln.rename(columns = {'FinelineNumber': 'Uni_Fln'}, inplace = True)
tr_uni_fln.shape

(178578, 2)

In [39]:
# Calculate the number of unique Upc
tr_uni_upc = train_test.groupby('VisitNumber')['Upc'].apply(lambda x: len(np.unique(x))).reset_index()
tr_uni_upc.rename(columns = {'Upc': 'Uni_Upc'}, inplace = True)
tr_uni_upc.shape

(178578, 2)

In [40]:
# Calculate the number of unique factory code
tr_uni_fac = train_test.groupby('VisitNumber')['Fac_Upc'].apply(lambda x: len(np.unique(x))).reset_index()
tr_uni_fac.rename(columns = {'Fac_Upc': 'Uni_Fac'}, inplace = True)
tr_uni_fac.shape

(178578, 2)

Create one-hot dummy variables
= 원핫인코딩. 범주형 데이터를 각 컬럼으로 생성하여 1,0으로 구분


In [41]:
# Create dummy variable for DepartmentDescription
#스캔횟수를 1로 행을 펼친다. garbage 드롭, pd.get_dummies 함수로 범주형 데이터를 스파스 매트릭스로 펼친다.
tr_dept_dummy = train_test[['VisitNumber', 'DepartmentDescription', 'ScanCount', 'FinelineNumber']]
tr_dept_dummy = tr_dept_dummy.query('ScanCount > 0')
tr_dept_dummy = tr_dept_dummy.loc[np.repeat(tr_dept_dummy.index.values, tr_dept_dummy.ScanCount)]
tr_dept_dummy.drop(['FinelineNumber', 'ScanCount'], axis=1, inplace= True)
tr_dept_dummy = pd.get_dummies(tr_dept_dummy, prefix='dept', columns=['DepartmentDescription'])
tr_dept_dummy = tr_dept_dummy.groupby('VisitNumber').sum().reset_index()
tr_dept_dummy = tr_dept_dummy.astype('Sparse[int64, 0]')
tr_dept_dummy.columns.values.__len__()

70

In [42]:
tr_pop_Upc_day = train_test.groupby(['day_counter', 'Upc']).ScanCount.agg('sum').reset_index()
tr_pop_Upc_day.sort_values(['day_counter', 'ScanCount'], ascending = [1,0], inplace=True)
tr_pop_Upc_day['shifted'] = tr_pop_Upc_day.day_counter.shift(10)
tr_pop_Upc_day['keep'] = (tr_pop_Upc_day.day_counter != tr_pop_Upc_day.shifted)
tr_pop_Upc_day = tr_pop_Upc_day.query('keep')
tr_pop_Upc_day_list = list(tr_pop_Upc_day.Upc.unique())
tr_pop_Upc_day_list.__len__()

27

In [43]:
# Create popular Upc dummy variable
tr_pop_upc_dummy = train_test.query('Upc == @tr_pop_Upc_day_list')[['VisitNumber', 'Upc', 'ScanCount']]
tr_pop_upc_dummy = tr_pop_upc_dummy.loc[np.repeat(tr_pop_upc_dummy.index.values, tr_pop_upc_dummy.ScanCount)]
tr_pop_upc_dummy = tr_pop_upc_dummy[['VisitNumber', 'Upc']]
tr_pop_upc_dummy = pd.get_dummies(tr_pop_upc_dummy, prefix='pop_Upc', columns=['Upc'])
tr_pop_upc_dummy = tr_pop_upc_dummy.groupby('VisitNumber').sum().reset_index()
tr_pop_upc_dummy = tr_pop_upc_dummy.astype('Sparse[int64, 0]')
tr_pop_upc_dummy.shape

(45572, 28)

In [44]:
# Create weekday dummy
tr_weekday_dummy = train_test[['VisitNumber', 'Weekday']]
tr_weekday_dummy = tr_weekday_dummy.groupby('VisitNumber').Weekday.agg('first').reset_index()
tr_weekday_dummy = pd.get_dummies(tr_weekday_dummy, prefix='Weekday', columns=['Weekday'])
tr_weekday_dummy = tr_weekday_dummy.astype('Sparse[int64, 0]')


In [45]:
# Create month day dummy
tr_monthday_dummy = train_test[['VisitNumber', 'day_counter']]
tr_monthday_dummy = tr_monthday_dummy.groupby('VisitNumber').day_counter.agg('first').reset_index()
tr_monthday_dummy = pd.get_dummies(tr_monthday_dummy, prefix='mday', columns=['day_counter'])
tr_monthday_dummy = tr_monthday_dummy.astype('Sparse[int64, 0]')
tr_monthday_dummy.shape

(178578, 32)

Calculate tf-idf for some variables
- 자연 로그는 문서 모음에서 자주 발생하는 용어와 드물게 발생하는 용어에 대해 균형 잡힌 가중치를 제공하기 때문에 idf 점수를 계산하기 위해 tf-idf 알고리즘에서 일반적으로 사용됩니다.
- idf 점수는 총 문서 수를 특정 용어가 포함된 문서 수로 나눈 로그로 계산됩니다. 
- 이 공식은 상대적으로 적은 수의 문서에서 발생하는 용어에 높은 가중치를 부여하고 많은 문서에서 발생하는 용어에 낮은 가중치를 부여합니다.

- tf-idf를 사용하여 상품별 weighting을 진행했다. term frequency * inverse document frequency
- 유저의 각 스캔한 UPC 를 나열한 후, 해당상품 스캔횟수/ 해당 유저의 방문당 총 스캔횟수로 tf를 구했다.
- log(총방문횟수 / 해당 상품을 스캔한 유니크한 유저 수)로 idf를 구했다. idf는 보통 총 문서수/특정 단어 등장빈도이니, 적절한 응용인것 같다.


In [46]:
total_visits = np.unique(train_test.VisitNumber).__len__()

In [47]:
Upc_total_visits = train_test.groupby('Upc').VisitNumber.apply(lambda x: len(np.unique(x))).reset_index()\
.rename(columns={'VisitNumber': 'Upc_total_visits'})

tfidf_upc = train_test.groupby(['VisitNumber','Upc']).agg({'ScanCount': 'sum', 'SCountPerVisit': 'first'})\
.reset_index()\
.assign(tf = lambda x: x.ScanCount / x.SCountPerVisit)\
.merge(Upc_total_visits, how='left')\
.assign(idf = lambda x: np.log( total_visits/x.Upc_total_visits ))\
.assign(Upc_tfidf = lambda x: x.tf * x.idf)\
.groupby('VisitNumber').Upc_tfidf.agg([np.sum, np.std]).reset_index()\
.fillna(0)\
.rename(columns={'sum': 'Upc_tfidf_sum', 'std': 'Upc_tfidf_std'})

In [48]:
# Calculate tf-idf for Fac_Upc
Fac_Upc_total_visits = train_test.groupby('Fac_Upc').VisitNumber.apply(lambda x: len(np.unique(x))).reset_index()\
.rename(columns={'VisitNumber': 'Fac_Upc_total_visits'})
tfidf_fac_upc = train_test.groupby(['VisitNumber','Fac_Upc']).agg({'ScanCount': 'sum', 'SCountPerVisit': 'first'})\
.reset_index()\
.assign(tf = lambda x: x.ScanCount / x.SCountPerVisit)\
.merge(Fac_Upc_total_visits, how='left')\
.assign(idf = lambda x: np.log( total_visits/x.Fac_Upc_total_visits ))\
.assign(Fac_Upc_tfidf = lambda x: x.tf * x.idf)\
.groupby('VisitNumber').Fac_Upc_tfidf.agg([np.sum, np.std]).reset_index()\
.fillna(0)\
.rename(columns={'sum': 'Fac_Upc_tfidf_sum', 'std': 'Fac_Upc_tfidf_std'})
tfidf_fac_upc.shape

(178578, 3)

In [49]:
# Calculate tf-idf for FinelineNumber
fln_total_visits = train_test.groupby('FinelineNumber').VisitNumber.apply(lambda x: len(np.unique(x))).reset_index()\
.rename(columns={'VisitNumber': 'fln_total_visits'})
tfidf_fln = train_test.groupby(['VisitNumber','FinelineNumber']).agg({'ScanCount': 'sum', 'SCountPerVisit': 'first'})\
.reset_index()\
.assign(tf = lambda x: x.ScanCount / x.SCountPerVisit)\
.merge(fln_total_visits, how='left')\
.assign(idf = lambda x: np.log( total_visits/x.fln_total_visits ))\
.assign(fln_tfidf = lambda x: x.tf * x.idf)\
.groupby('VisitNumber').fln_tfidf.agg([np.sum, np.std]).reset_index()\
.fillna(0)\
.rename(columns={'sum': 'fln_tfidf_sum', 'std': 'fln_tfidf_std'})
tfidf_fln.shape

(178578, 3)

In [50]:
# Calculate tf-idf for DepartmentDescription
dept_total_visits = train_test.groupby('DepartmentDescription').VisitNumber.apply(lambda x: len(np.unique(x))).reset_index()\
.rename(columns={'VisitNumber': 'dept_total_visits'})
tfidf_dept = train_test.groupby(['VisitNumber','DepartmentDescription']).agg({'ScanCount': 'sum', 'SCountPerVisit': 'first'})\
.reset_index()\
.assign(tf = lambda x: x.ScanCount / x.SCountPerVisit)\
.merge(dept_total_visits, how='left')\
.assign(idf = lambda x: np.log( total_visits/x.dept_total_visits ))\
.assign(dept_tfidf = lambda x: x.tf * x.idf)\
.groupby('VisitNumber').dept_tfidf.agg([np.sum, np.std]).reset_index()\
.fillna(0)\
.rename(columns={'sum': 'dept_tfidf_sum', 'std': 'dept_tfidf_std'})
tfidf_dept.shape

(178578, 3)

Merge

In [51]:
print(len(tr_dept_ent))
print(len(tr_fln_ent))
print(len(tr_upc_ent))
print(len(tr_fac_ent))
print(len(tr_uni_dept))
print(len(tr_uni_fln))
print(len(tr_uni_upc))
print(len(tr_uni_fac))
print(len(tr_dept_dummy))
print(len(tr_pop_upc_dummy))
print(len(tr_weekday_dummy))
print(len(tr_monthday_dummy))
print(len(visitsPerDay))
print(len(sCountPerDay))
print(len(tfidf_upc))
print(len(tfidf_fac_upc))
print(len(tfidf_fln))
print(len(tfidf_dept))

178578
178578
178578
178578
178578
178578
178578
178578
178578
45572
178578
178578
31
31
178578
178578
178578
178578


In [52]:
tr_base = train_test.groupby('VisitNumber').agg({'time_of_day': 'first',
                                                 'TripType' : 'first',
                                                 'SCountPerVisit': 'first',
                                                 'day_counter': 'first',
                                                 'wd': 'first',
                                                 'Pos': 'sum',
                                                 'Neg': 'sum',
                                                 'Return': 'sum'}).reset_index()
tr_base = pd.merge(tr_base, tr_dept_ent, how='left')
tr_base = pd.merge(tr_base, tr_fln_ent, how='left')
tr_base = pd.merge(tr_base, tr_upc_ent, how='left')
tr_base = pd.merge(tr_base, tr_fac_ent, how='left')
tr_base = pd.merge(tr_base, tr_uni_dept, how='left')
tr_base = pd.merge(tr_base, tr_uni_fln, how='left')
tr_base = pd.merge(tr_base, tr_uni_upc, how='left')
tr_base = pd.merge(tr_base, tr_uni_fac, how='left')
tr_base = pd.merge(tr_base, tr_dept_dummy.sparse.to_dense(), how='left')
tr_base = pd.merge(tr_base, tr_pop_upc_dummy.sparse.to_dense(), how='left')
#tr_base = pd.merge(tr_base, tr_pop_fac_dummy.sparse.to_dense(), how='left')
#tr_base = pd.merge(tr_base, tr_pop_fln_dummy.sparse.to_dense(), how='left')
tr_base = pd.merge(tr_base, tr_weekday_dummy.sparse.to_dense(), how='left')
tr_base = pd.merge(tr_base, tr_monthday_dummy.sparse.to_dense(), how='left')
tr_base = pd.merge(tr_base, visitsPerDay, how='left')
tr_base = pd.merge(tr_base, sCountPerDay, how='left')
tr_base = pd.merge(tr_base, tfidf_upc)
tr_base = pd.merge(tr_base, tfidf_fac_upc)
tr_base = pd.merge(tr_base, tfidf_fln)
tr_base = pd.merge(tr_base, tfidf_dept)#.fillna(0).astype('Sparse[int64, 0]')
tr_base.shape

(178578, 161)

In [53]:
tr_base

Unnamed: 0,VisitNumber,time_of_day,TripType,SCountPerVisit,day_counter,wd,Pos,Neg,Return,Ent_Dept,...,VisitsPerDay,SCountPerDay,Upc_tfidf_sum,Upc_tfidf_std,Fac_Upc_tfidf_sum,Fac_Upc_tfidf_std,fln_tfidf_sum,fln_tfidf_std,dept_tfidf_sum,dept_tfidf_std
0,1,0.000161,0,4,1,5,4,0,4,1.500000,...,5827,43752,9.058118,0.258204,5.877505,0.295722,5.507225,0.361498,1.929213,0.248067
1,2,0.000322,0,4,1,5,4,0,4,1.500000,...,5827,43752,9.758758,0.421289,4.172925,1.597394,6.456928,0.980996,1.752860,0.268855
2,4,0.000644,0,1,1,5,1,0,1,0.000000,...,5827,43752,11.399634,0.000000,6.129201,0.000000,8.049729,0.000000,3.709119,0.000000
3,7,0.001127,30,2,1,5,2,0,2,1.000000,...,5827,43752,11.543475,0.388418,3.687834,1.243487,7.645857,0.991647,2.352288,0.510902
4,8,0.001288,26,28,1,5,21,2,21,1.769546,...,5827,43752,10.207754,0.239735,6.460186,0.321434,6.585358,0.316740,3.802210,1.033529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178573,191344,0.999420,22,5,31,7,5,0,5,0.721928,...,6435,62168,10.567857,0.151861,7.127289,0.879923,6.150503,1.083668,2.784494,0.735252
178574,191345,0.999565,39,17,31,7,13,0,13,2.771902,...,6435,62168,8.421376,0.270515,5.406571,0.215134,5.861140,0.228981,1.840476,0.091550
178575,191346,0.999710,39,17,31,7,17,0,17,2.771902,...,6435,62168,8.326484,0.088568,3.954139,0.113898,5.406917,0.080097,1.650918,0.098712
178576,191347,0.999855,8,2,31,7,2,0,2,1.000000,...,6435,62168,7.996911,0.679088,4.023909,0.700410,5.318720,1.061410,1.517831,0.062613


XGboost

In [54]:
# Helper function to create sparse pivot table
def to_sparse_pivot(fln, idd, item, count):
    visit_u = list(fln[idd].unique())
    fln_u = list(np.sort(fln[item].unique()))
    data = fln[count].tolist()
    visit_type = CategoricalDtype(categories=visit_u, ordered=True)
    row = fln[idd].astype(visit_type).cat.codes
    fln_type = CategoricalDtype(categories=fln_u, ordered=True)
    col = fln[item].astype(fln_type).cat.codes
    return csr_matrix((data, (row, col)), shape=(len(visit_u), len(fln_u)))

In [55]:
# Seperate feature and labels
tr_df = tr_base.query('TripType != 0')
te_df = tr_base.query('TripType == 0')
te_df = te_df.drop('TripType', axis=1)
tr_df_labels = tr_df.TripType
tr_df = tr_df.drop('TripType', axis=1)
feat = pd.concat([tr_df,te_df], ignore_index=True)
feat.shape

(178578, 160)

In [62]:
tr_agg = pd.concat([train_test.query('VisitNumber == @tr_VisitNumber'), 
                    train_test.query('VisitNumber != @tr_VisitNumber')], 
                   ignore_index=True)

In [72]:
fln_sparse = to_sparse_pivot(tr_agg[['VisitNumber', 'ScanCount', 'FinelineNumber']], 
                             'VisitNumber', 'FinelineNumber', 'ScanCount')
fac_sparse = to_sparse_pivot(tr_agg[['VisitNumber', 'ScanCount', 'Fac_Upc']], 
                             'VisitNumber', 'Fac_Upc', 'ScanCount')
dept_fac_uni = tr_agg.groupby(['VisitNumber', 'DepartmentDescription'])\
                .Fac_Upc.agg(lambda x: len(np.unique(x))).reset_index()
dept_fac_uni_sparse = to_sparse_pivot(dept_fac_uni, 'VisitNumber', 'DepartmentDescription', 'Fac_Upc')
dept_fln_uni = tr_agg.groupby(['VisitNumber', 'DepartmentDescription'])\
                .FinelineNumber.agg(lambda x: len(np.unique(x))).reset_index()
dept_fln_uni_sparse = to_sparse_pivot(dept_fln_uni, 'VisitNumber', 'DepartmentDescription', 'FinelineNumber')
dept_upc_uni = tr_agg.groupby(['VisitNumber', 'DepartmentDescription'])\
                .Upc.agg(lambda x: len(np.unique(x))).reset_index()
dept_upc_uni_sparse = to_sparse_pivot(dept_upc_uni, 'VisitNumber', 'DepartmentDescription', 'Upc')

In [77]:
# Construct feature matrix
feat_mtx= hstack([csr_matrix(feat.values), fln_sparse, fac_sparse, dept_fac_uni_sparse, dept_fln_uni_sparse,
                     dept_upc_uni_sparse])
feat_mtx.shape

(178578, 11977)

In [84]:
# Sepearte train and test matrix
tr_df_features = feat_mtx.tocsr()[:len(tr_df),:]
te_df_features = feat_mtx.tocsr()[len(tr_df):,:]

In [85]:
# Prepare train labels
le = preprocessing.LabelEncoder()
train_labels = le.fit_transform(tr_df_labels)

In [86]:
# Split into train and validation set
sss = StratifiedShuffleSplit(1, test_size=0.1, random_state=0)
for train_id, val_id in sss.split(tr_df_features,train_labels):
    xgtrain = xgb.DMatrix(tr_df_features[train_id], label=train_labels[train_id])
    xgval = xgb.DMatrix(tr_df_features[val_id ], label=train_labels[val_id])

In [88]:
# Set parameters
param = {'objective': 'multi:softprob', #multiclass classfication, output is probabilities
         'num_class': 38, #number of classes
         'eta': 0.0795628155661, #learning rate
         "eval_metric": "mlogloss",#evaluation metric
         'subsample': 0.929631734622,
         'colsample_bytree': 0.538628701606,
         'gamma' : 0, #regularization
         'min_child_weight': 4,#minimun weight
         'max_depth': 10,#maximun tree depth
         'max_delta_step': 3,#maximum weigh leaf
         'nthread': 8}

In [90]:
# Run the model
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
num_rounds = 2000
model = xgb.train(param, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)



[0]	train-mlogloss:3.42405	val-mlogloss:3.43013
[1]	train-mlogloss:3.21502	val-mlogloss:3.22938
[2]	train-mlogloss:3.01608	val-mlogloss:3.03950
[3]	train-mlogloss:2.81478	val-mlogloss:2.84594
[4]	train-mlogloss:2.62565	val-mlogloss:2.66762
[5]	train-mlogloss:2.45310	val-mlogloss:2.50473
[6]	train-mlogloss:2.28648	val-mlogloss:2.34726
[7]	train-mlogloss:2.13440	val-mlogloss:2.20568
[8]	train-mlogloss:2.00131	val-mlogloss:2.08488
[9]	train-mlogloss:1.88994	val-mlogloss:1.98292
[10]	train-mlogloss:1.78849	val-mlogloss:1.88979
[11]	train-mlogloss:1.69996	val-mlogloss:1.80921
[12]	train-mlogloss:1.62127	val-mlogloss:1.73856
[13]	train-mlogloss:1.55132	val-mlogloss:1.67610
[14]	train-mlogloss:1.48557	val-mlogloss:1.61808
[15]	train-mlogloss:1.42546	val-mlogloss:1.56413
[16]	train-mlogloss:1.36870	val-mlogloss:1.51397
[17]	train-mlogloss:1.31618	val-mlogloss:1.46810
[18]	train-mlogloss:1.26855	val-mlogloss:1.42606
[19]	train-mlogloss:1.22450	val-mlogloss:1.38692
[20]	train-mlogloss:1.18420	va

In [91]:
# Make prediction for the test set
pred = model.predict(xgb.DMatrix(te_df_features), iteration_range = (model.best_iteration-1 ,model.best_iteration))

In [121]:
result = pd.DataFrame(pred)
samsub = pd.read_csv('/Users/young/dataset_local/walmart-recruiting-trip-type-classification/sample_submission.csv')
result.columns = samsub.columns.values[1:]
result['VisitNumber'] = te_df['VisitNumber']
result = result[samsub.columns.values]
submission = pd.merge(pd.DataFrame(samsub.VisitNumber), result, how='left')
submission.fillna(value=0, inplace=True)
result_visitnumber_list = list(result.VisitNumber)
submission.loc[~submission.VisitNumber.isin(result_visitnumber_list), 'TripType_999'] = 1


In [156]:
result

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1.0,0.026096,0.026015,0.026391,0.026778,0.026332,0.026940,0.027362,0.026097,0.026293,...,0.026360,0.026380,0.026409,0.026383,0.026371,0.026757,0.026230,0.026013,0.026240,0.026082
1,2.0,0.026059,0.025983,0.026422,0.026082,0.026293,0.025450,0.026572,0.026128,0.026324,...,0.026391,0.026411,0.026172,0.026414,0.026402,0.027111,0.026446,0.026044,0.026271,0.025796
2,4.0,0.026043,0.026209,0.026405,0.026134,0.025558,0.026432,0.026555,0.026151,0.026307,...,0.026374,0.026394,0.026423,0.026397,0.026386,0.026082,0.026429,0.026991,0.026297,0.026466
3,,0.026034,0.026915,0.026327,0.026461,0.026476,0.026355,0.026477,0.026074,0.026230,...,0.026297,0.026317,0.026346,0.026320,0.026308,0.027014,0.026167,0.025951,0.025379,0.026389
4,,0.026100,0.026198,0.026028,0.025957,0.026266,0.026364,0.027366,0.026114,0.026296,...,0.026333,0.026383,0.026145,0.026386,0.026375,0.027083,0.026399,0.025959,0.026804,0.026085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89277,95649.0,0.026203,0.026193,0.026400,0.026298,0.026382,0.026370,0.025613,0.026557,0.026302,...,0.026369,0.026389,0.026418,0.027245,0.026381,0.026343,0.026487,0.026022,0.026232,0.026532
89278,95650.0,0.026607,0.026359,0.026398,0.026058,0.026339,0.026368,0.026366,0.026118,0.026300,...,0.026367,0.026387,0.026416,0.026390,0.026379,0.026209,0.026237,0.025716,0.026283,0.025902
89279,,0.026216,0.026374,0.026413,0.026073,0.026562,0.026440,0.026451,0.026119,0.026315,...,0.026382,0.027051,0.026431,0.026405,0.026394,0.026356,0.026253,0.026035,0.026298,0.025779
89280,,0.026615,0.025697,0.026406,0.026384,0.026347,0.026376,0.026374,0.026057,0.026308,...,0.026375,0.026395,0.026424,0.026398,0.026387,0.026349,0.026270,0.025508,0.026447,0.026538


In [137]:
# max_row_by_category = submission.groupby('VisitNumber').apply(lambda x: x.loc[x.iloc[:, 1:].idxmax(axis=1)])
max_row_by_category = submission.groupby('VisitNumber').apply(lambda x: max(np.unique(x.drop('VisitNumber',axis=1)))).reset_index()


In [154]:
submission.query('VisitNumber==300').groupby('VisitNumber').apply(lambda x: max(np.unique(x.drop('VisitNumber',axis=1)))).reset_index()
submission.query('VisitNumber==200').transpose()

Unnamed: 0,105
VisitNumber,200.0
TripType_3,0.026442
TripType_4,0.026201
TripType_5,0.026397
TripType_6,0.026126
TripType_7,0.02555
TripType_8,0.026424
TripType_9,0.026547
TripType_12,0.026117
TripType_14,0.026299


In [155]:
max_row_by_category.head(20)

Unnamed: 0,VisitNumber,0
0,1,0.027362
1,2,0.027111
2,3,1.0
3,4,0.027056
4,6,1.0
5,13,0.026919
6,14,0.027354
7,16,0.026757
8,18,0.027097
9,21,0.027097


In [113]:
pd.DataFrame(samsub.VisitNumber)

Unnamed: 0,VisitNumber
0,1
1,2
2,3
3,4
4,6
...,...
95669,191338
95670,191339
95671,191340
95672,191341
