https://www.kaggle.com/code/thitchen/walmart

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import StratifiedShuffleSplit
import xgboost as xgb
import datetime
from pandas.api.types import CategoricalDtype

In [2]:
# Read the raw train/test CSVs with one shared dtype mapping.
# Upc and FinelineNumber are read as objects; they are cast to integers in a later cell.
raw_column_dtypes = {
    'Upc': object,
    'FinelineNumber': object,
    'ScanCount': np.int16,
    'TripType': np.int16,
    'VisitNumber': np.int32,
}
train = pd.read_csv('/Users/young/dataset_local/walmart-recruiting-trip-type-classification/train.csv', dtype=raw_column_dtypes)
test = pd.read_csv('/Users/young/dataset_local/walmart-recruiting-trip-type-classification/test.csv', dtype=raw_column_dtypes)

In [3]:
# Give the test frame a placeholder TripType of 0 (train carries a real TripType
# column; the two frames are concatenated in a later cell), then display the column.
test['TripType'] = 0
test['TripType']

0         0
1         0
2         0
3         0
4         0
         ..
653641    0
653642    0
653643    0
653644    0
653645    0
Name: TripType, Length: 653646, dtype: int64

In [4]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 647054 entries, 0 to 647053
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   TripType               647054 non-null  int16 
 1   VisitNumber            647054 non-null  int32 
 2   Weekday                647054 non-null  object
 3   Upc                    642925 non-null  object
 4   ScanCount              647054 non-null  int16 
 5   DepartmentDescription  645693 non-null  object
 6   FinelineNumber         642925 non-null  object
dtypes: int16(2), int32(1), object(4)
memory usage: 24.7+ MB


In [5]:
# Unique visit numbers present in the training and test data
tr_VisitNumber = list(pd.unique(train['VisitNumber']))
te_VisitNumber = list(pd.unique(test['VisitNumber']))

In [6]:
# Handle nulls
# Fill through column assignment instead of `frame.col.fillna(..., inplace=True)`:
# the in-place call operates on a temporary Series (chained assignment), which
# emits a warning in pandas 2.x and never reaches the parent frame under Copy-on-Write.
missing_placeholders = {'Upc': '0', 'DepartmentDescription': 'NA', 'FinelineNumber': '-1'}
for frame in (train, test):
    for column, placeholder in missing_placeholders.items():
        frame[column] = frame[column].fillna(placeholder)


In [7]:
# Define the final numeric dtypes (the string placeholders filled earlier make the casts possible)
final_dtypes = {'Upc': np.int64, 'FinelineNumber': np.int16, 'ScanCount': np.int32}
for frame in (train, test):
    for column, target_dtype in final_dtypes.items():
        frame[column] = frame[column].astype(target_dtype)

In [8]:
# FinelineNumber = cluster number of related products.
# Given how a physical store works, VisitNumber appears to be assigned in order
# of entry (author's assumption), so the frame is ordered by it.
train = train.sort_values(by='VisitNumber')
train.loc[train['DepartmentDescription'] == 'PHARMACY RX']


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
1155,44,496,Friday,0,1,PHARMACY RX,-1
1216,5,521,Friday,0,1,PHARMACY RX,-1
1373,5,585,Friday,0,1,PHARMACY RX,-1
1456,5,619,Friday,0,1,PHARMACY RX,-1
1457,5,619,Friday,0,1,PHARMACY RX,-1
...,...,...,...,...,...,...,...
636715,5,188839,Sunday,0,1,PHARMACY RX,-1
636716,5,188839,Sunday,0,1,PHARMACY RX,-1
636717,5,188839,Sunday,0,1,PHARMACY RX,-1
636847,5,188896,Sunday,0,1,PHARMACY RX,-1


In [9]:
# One row per training visit with its TripType (taken from the visit's first row)
train_visitnumber_triptype = train.groupby('VisitNumber', as_index=False)['TripType'].first()
train_visitnumber_triptype

Unnamed: 0,VisitNumber,TripType
0,5,999
1,7,30
2,8,26
3,9,8
4,10,8
...,...,...
95669,191343,25
95670,191344,22
95671,191345,39
95672,191346,39


In [10]:
# The train and test dataframes are combined together, so the features are dealt with together.
# (The same features must exist for both sets at classification time, so feature
# engineering is done on the concatenated frame.)
train_test = (
    pd.concat([train, test], ignore_index=True)
    .sort_values('VisitNumber')
    .astype({'TripType': np.int16})
)
train_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1300700 entries, 647054 to 1300699
Data columns (total 7 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   TripType               1300700 non-null  int16 
 1   VisitNumber            1300700 non-null  int32 
 2   Weekday                1300700 non-null  object
 3   Upc                    1300700 non-null  int64 
 4   ScanCount              1300700 non-null  int32 
 5   DepartmentDescription  1300700 non-null  object
 6   FinelineNumber         1300700 non-null  int16 
dtypes: int16(2), int32(2), int64(1), object(2)
memory usage: 54.6+ MB


In [11]:
# Total number of items purchased in each visit (sum of ScanCount per VisitNumber)
sCountPerVisit = (
    train_test.groupby('VisitNumber')['ScanCount']
    .sum()
    .rename('SCountPerVisit')
    .reset_index()
)
sCountPerVisit.head()

Unnamed: 0,VisitNumber,SCountPerVisit
0,1,4
1,2,4
2,3,0
3,4,1
4,5,-1


In [12]:
# Total number of items per visit per Upc (sum of ScanCount per VisitNumber/Upc pair)
sCountPerVisitPerUpc = (
    train_test.groupby(['VisitNumber', 'Upc'])['ScanCount']
    .sum()
    .rename('SCountPerVisitPerUpc')
    .reset_index()
)
sCountPerVisitPerUpc.head()


Unnamed: 0,VisitNumber,Upc,SCountPerVisitPerUpc
0,1,1707710732,1
1,1,72503389714,1
2,1,88491211470,1
3,1,89470001026,1
4,2,2840015224,1


In [13]:
# Only the last expression of a cell is displayed: the first shape below is
# evaluated but not shown, so the recorded output is sCountPerVisit.shape.
sCountPerVisitPerUpc.shape
sCountPerVisit.shape

(191348, 2)

In [14]:
# Convert weekday to number (Monday=1 ... Sunday=7)
wdict = {
    day_name: day_number
    for day_number, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

# Direct indexing (not dict.get) so an unexpected weekday label still raises KeyError
train_test['wd'] = train_test['Weekday'].map(lambda day_name: wdict[day_name])

Remove garbage visits such as TripType 999

In [17]:
# netout_visits: visits whose total ScanCount is <= 0
# (visits with no net scans are treated as garbage and filtered out)
netout_mask = sCountPerVisit['SCountPerVisit'] <= 0
netout_visits = list(sCountPerVisit.loc[netout_mask, 'VisitNumber'])
# In the train set, these visit types are almost always 999
# (we can guess Walmart labels such visits as 999, so they are discarded)
train_netout_visits = train_visitnumber_triptype.loc[
    train_visitnumber_triptype['VisitNumber'].isin(netout_visits)
]
netout_999_pct = np.count_nonzero(train_netout_visits['TripType'] == 999) / len(train_netout_visits) * 100
print(netout_999_pct, '% of these visits are of TripType 999')

99.93546305259761 % of these visits are of TripType 999


In [18]:
# someout_visits: visits where the total ScanCount of at least one Upc is below 0
# (a visit appears once per offending Upc, so the list may contain duplicates)
someout_mask = sCountPerVisitPerUpc['SCountPerVisitPerUpc'] < 0
someout_visits = list(sCountPerVisitPerUpc.loc[someout_mask, 'VisitNumber'])
# In the train set, these visit types are almost always 999
train_someout_visits = train_visitnumber_triptype.loc[
    train_visitnumber_triptype['VisitNumber'].isin(someout_visits)
]
someout_999_pct = np.count_nonzero(train_someout_visits['TripType'] == 999) / len(train_someout_visits) * 100
print(someout_999_pct, '% of these visits are of TripType 999')

99.80995819080198 % of these visits are of TripType 999


In [19]:
# Therefore it is safe to assign 999 to these visits, so they are removed for now.
# An explicit isin() mask states the intent directly (the query string relied on
# `!= <list>` meaning "not in" and on `&` binding like `and`). The .copy() makes
# the result an independent frame, so adding columns to it later does not act on
# a slice of the unfiltered frame (SettingWithCopyWarning).
excluded_visits = netout_visits + someout_visits
train_test = train_test.loc[~train_test['VisitNumber'].isin(excluded_visits)].copy()
train_test.shape

(1274023, 8)

Add basic features

In [21]:
# Add indicator columns 'Pos' (ScanCount > 0) and 'Neg' (ScanCount < 0).
# They are correlated, but they mark purchase/return records before ScanCount is aggregated.
train_test = train_test.assign(
    Pos=train_test['ScanCount'].gt(0).astype(np.int16),
    Neg=train_test['ScanCount'].lt(0).astype(np.int16),
)

In [22]:
# Aggregate ScanCount
# Collapse duplicate line items so that each (visit, Upc, department, weekday,
# fineline, trip type) combination appears exactly once.
line_item_keys = [
    'VisitNumber',
    'Upc',
    'DepartmentDescription',
    'Weekday',
    'FinelineNumber',
    'TripType',
]
train_test = (
    train_test
    .groupby(line_item_keys, as_index=False)
    # ScanCount/Pos/Neg are additive across duplicate rows. 'wd' is a weekday code
    # that is constant within a group (Weekday is a grouping key); a blanket .sum()
    # would add it up (e.g. Friday 5 + 5 = 10), so take the first value instead.
    .agg({'ScanCount': 'sum', 'wd': 'first', 'Pos': 'sum', 'Neg': 'sum'})
    .sort_values('VisitNumber')
)
# Add column Return, which is the sign of the aggregated ScanCount:
# -1 = net return, 1 = net purchase, 0 = scans cancelled out.
train_test['Return'] = np.sign(train_test['ScanCount']).astype(np.int16)


In [24]:
# Add time of the day as a fraction of one day: values grow with VisitNumber
# within a day and reach 1 for the last visit of that day.
# This exploits VisitNumber being assigned sequentially: each day's first and
# last VisitNumber bound the position of a visit within the day.
# A new day block starts whenever Weekday differs from the previous row.
day_block = train_test['Weekday'].ne(train_test['Weekday'].shift()).cumsum()
visits_by_day = train_test.groupby(day_block)['VisitNumber']
first_visit_of_day = visits_by_day.transform('first')
last_visit_of_day = visits_by_day.transform('last')
# 'first' is the first VisitNumber of the day
train_test['first'] = first_visit_of_day
# 'last' is the last VisitNumber of the day
train_test['last'] = last_visit_of_day
day_span = train_test['last'] - train_test['first'] + 1
train_test['time_of_day'] = (train_test['VisitNumber'] - train_test['first'] + 1) / day_span

In [25]:
# Display the frame to inspect the first/last/time_of_day columns just added.
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246433,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000
1246429,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000
1246428,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000
1246430,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000


In [26]:
# Create a column 'day_counter'. There are 31 days in total.
# Rows are in visit order, so a row whose Weekday equals the previous row's
# belongs to the same day; a change of Weekday starts the next day (+1).
weekday_changed = train_test['Weekday'].ne(train_test['Weekday'].shift())
train_test['day_counter'] = weekday_changed.cumsum()

In [27]:
# Display the frame to inspect the day_counter column just added.
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day,day_counter
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161,1
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161,1
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161,1
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161,1
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246433,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000,31
1246429,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000,31
1246428,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000,31
1246430,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000,31


In [28]:
# Number of distinct visits on each day, to be used later
visitsPerDay = (
    train_test.groupby('day_counter')['VisitNumber']
    .nunique()
    .reset_index()
    .rename(columns={'VisitNumber': 'VisitsPerDay'})
    .astype({'day_counter': np.int16, 'VisitsPerDay': np.int16})
)
visitsPerDay.head()

Unnamed: 0,day_counter,VisitsPerDay
0,1,5827
1,2,6243
2,3,6387
3,4,5663
4,5,5305


In [29]:
# Sum of ScanCount on each day, to be used later
sCountPerDay = (
    train_test.groupby('day_counter')['ScanCount']
    .sum()
    .rename('SCountPerDay')
    .reset_index()
    .astype({'day_counter': np.int16, 'SCountPerDay': np.int32})
)
sCountPerDay.head()

Unnamed: 0,day_counter,SCountPerDay
0,1,43752
1,2,52682
2,3,60889
3,4,46133
4,5,39576


In [30]:
# Attach the per-visit and per-visit-per-Upc scan totals to every row of train_test
train_test = (
    train_test
    .merge(sCountPerVisit, how='left', on=['VisitNumber'])
    .merge(sCountPerVisitPerUpc, how='left', on=['VisitNumber', 'Upc'])
)

In [31]:
# Display the frame to inspect the merged SCountPerVisit / SCountPerVisitPerUpc columns.
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day,day_counter,SCountPerVisit,SCountPerVisitPerUpc
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161,1,4,1
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161,1,4,1
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161,1,4,1
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161,1,4,1
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246430,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000,31,7,1
1246431,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000,31,7,1
1246432,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000,31,7,1
1246433,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000,31,7,1


In [33]:
# Share of the visit's total ScanCount contributed by each row (0 where the row's ScanCount is 0)
scan_share = train_test['ScanCount'] / train_test['SCountPerVisit']
train_test['Div'] = np.where(train_test['ScanCount'] == 0, 0, scan_share)

In [35]:
# Use domain knowledge: split Upc into 2 parts, part 1 for the factory code and
# part 2 for the item code (-1 for missing Upc's, which were filled with 0).
# The barcode is cut at the 100,000 place (apparently the US standard layout).
upc_missing = train_test['Upc'] == 0
train_test['Fac_Upc'] = np.where(upc_missing, -1, train_test['Upc'] // 100000)
train_test['Item_Upc'] = np.where(upc_missing, -1, train_test['Upc'] % 100000)
train_test.shape

(1246435, 20)

In [36]:
# Display the frame to inspect the Div, Fac_Upc and Item_Upc columns.
train_test

Unnamed: 0,VisitNumber,Upc,DepartmentDescription,Weekday,FinelineNumber,TripType,ScanCount,wd,Pos,Neg,Return,first,last,time_of_day,day_counter,SCountPerVisit,SCountPerVisitPerUpc,Div,Fac_Upc,Item_Upc
0,1,1707710732,DAIRY,Friday,1526,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,17077,10732
1,1,72503389714,SHOES,Friday,3002,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,725033,89714
2,1,88491211470,GROCERY DRY GOODS,Friday,3555,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,884912,11470
3,1,89470001026,DAIRY,Friday,1431,0,1,5,1,0,1,1,6213,0.000161,1,4,1,0.250000,894700,1026
4,2,2840015224,DSD GROCERY,Friday,4408,0,1,5,1,0,1,1,6213,0.000322,1,4,1,0.250000,28400,15224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246430,191348,88181390024,BATH AND SHOWER,Sunday,1099,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,881813,90024
1246431,191348,7871535983,MENS WEAR,Sunday,4923,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,78715,35983
1246432,191348,4282557050,MENS WEAR,Sunday,8220,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,42825,57050
1246433,191348,66572105763,BATH AND SHOWER,Sunday,1505,0,1,7,1,0,1,184449,191348,1.000000,31,7,1,0.142857,665721,5763


Calculate entropy

In [45]:
# Calculate entropy of Upc
# Ent_Upc is the per-visit purchase entropy over Upc, i.e. how diverse the basket is.
tr_upc_ent = (
    train_test
    .groupby(['VisitNumber', 'Upc'])
    .agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'})
    .reset_index()
)
# Share of the visit's items that belongs to each Upc
tr_upc_ent['Div'] = tr_upc_ent['ScanCount'] / tr_upc_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    # np.where evaluates both branches, so log2(0) is still computed for zero
    # shares; the errstate guard silences that warning and the result is replaced by 0.
    tr_upc_ent['Ent_Upc'] = np.where(tr_upc_ent['Div']==0, 0, tr_upc_ent['Div'] * np.log2(tr_upc_ent['Div']) * -1)

# Name the aggregation as a string: passing np.sum to .agg is deprecated in recent pandas.
tr_upc_ent = tr_upc_ent.groupby('VisitNumber').agg({'Ent_Upc': 'sum'}).reset_index()
tr_upc_ent.shape

(178578, 2)

In [46]:
# Display the per-visit Upc entropy table.
tr_upc_ent

Unnamed: 0,VisitNumber,Ent_Upc
0,1,2.000000
1,2,2.000000
2,4,0.000000
3,7,1.000000
4,8,4.280395
...,...,...
178573,191344,2.321928
178574,191345,3.616875
178575,191346,4.087463
178576,191347,1.000000


In [47]:
# Calculate entropy of DepartmentDescription
# Ent_Dept is the per-visit entropy of the item shares across departments.
tr_dept_ent = (
    train_test
    .groupby(['VisitNumber', 'DepartmentDescription'])
    .agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'})
    .reset_index()
)
# Share of the visit's items that belongs to each department
tr_dept_ent['Div'] = tr_dept_ent['ScanCount'] / tr_dept_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    # np.where evaluates both branches, so log2(0) is still computed for zero
    # shares; the errstate guard silences that warning and the result is replaced by 0.
    tr_dept_ent['Ent_Dept'] = np.where(tr_dept_ent['Div']==0, 0, tr_dept_ent['Div'] * np.log2(tr_dept_ent['Div']) * -1)
# Name the aggregation as a string: passing np.sum to .agg is deprecated in recent pandas.
tr_dept_ent = tr_dept_ent.groupby('VisitNumber').agg({'Ent_Dept': 'sum'}).reset_index()
tr_dept_ent.shape

(178578, 2)

In [48]:
# Calculate entropy of FinelineNumber
# Ent_Fln is the per-visit entropy of the item shares across fineline numbers.
tr_fln_ent = (
    train_test
    .groupby(['VisitNumber', 'FinelineNumber'])
    .agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'})
    .reset_index()
)
# Share of the visit's items that belongs to each fineline number
tr_fln_ent['Div'] = tr_fln_ent['ScanCount'] / tr_fln_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    # np.where evaluates both branches, so log2(0) is still computed for zero
    # shares; the errstate guard silences that warning and the result is replaced by 0.
    tr_fln_ent['Ent_Fln'] = np.where(tr_fln_ent['Div']==0, 0, tr_fln_ent['Div'] * np.log2(tr_fln_ent['Div']) * -1)
# Name the aggregation as a string: passing np.sum to .agg is deprecated in recent pandas.
tr_fln_ent = tr_fln_ent.groupby('VisitNumber').agg({'Ent_Fln': 'sum'}).reset_index()
tr_fln_ent.shape

(178578, 2)

In [50]:
# Display the per-visit FinelineNumber entropy table.
tr_fln_ent

Unnamed: 0,VisitNumber,Ent_Fln
0,1,2.000000
1,2,1.500000
2,4,0.000000
3,7,1.000000
4,8,3.838804
...,...,...
178573,191344,1.370951
178574,191345,3.499228
178575,191346,3.969816
178576,191347,1.000000


In [51]:
# Calculate entropy of Fac_Upc
# Ent_Fac is the per-visit entropy of the item shares across factory codes.
tr_fac_ent = (
    train_test
    .groupby(['VisitNumber', 'Fac_Upc'])
    .agg({'SCountPerVisit': 'first', 'ScanCount': 'sum'})
    .reset_index()
)
# Share of the visit's items that belongs to each factory code
tr_fac_ent['Div'] = tr_fac_ent['ScanCount'] / tr_fac_ent['SCountPerVisit']
with np.errstate(divide='ignore'):
    # np.where evaluates both branches, so log2(0) is still computed for zero
    # shares; the errstate guard silences that warning and the result is replaced by 0.
    tr_fac_ent['Ent_Fac'] = np.where(tr_fac_ent['Div']==0, 0, tr_fac_ent['Div'] * np.log2(tr_fac_ent['Div']) * -1)
# Name the aggregation as a string: passing np.sum to .agg is deprecated in recent pandas.
tr_fac_ent = tr_fac_ent.groupby('VisitNumber').agg({'Ent_Fac': 'sum'}).reset_index()
tr_fac_ent.shape

(178578, 2)

Calculate the number of unique items

In [52]:
# Calculate the number of unique DepartmentDescription values per visit
tr_uni_dept = (
    train_test.groupby('VisitNumber')['DepartmentDescription']
    .nunique()
    .rename('Uni_Dept')
    .reset_index()
)
tr_uni_dept

Unnamed: 0,VisitNumber,Uni_Dept
0,1,3
1,2,3
2,4,1
3,7,2
4,8,7
...,...,...
178573,191344,2
178574,191345,8
178575,191346,8
178576,191347,2


In [53]:
# Number of distinct FinelineNumber values per visit
tr_uni_fln = (
    train_test.groupby('VisitNumber')['FinelineNumber']
    .nunique()
    .rename('Uni_Fln')
    .reset_index()
)
tr_uni_fln.shape

(178578, 2)

In [56]:
# Number of distinct Upc values per visit
tr_uni_upc = (
    train_test.groupby('VisitNumber')['Upc']
    .nunique()
    .rename('Uni_Upc')
    .reset_index()
)
tr_uni_upc.shape

(178578, 2)

In [57]:
# Number of distinct factory codes (Fac_Upc) per visit
tr_uni_fac = (
    train_test.groupby('VisitNumber')['Fac_Upc']
    .nunique()
    .rename('Uni_Fac')
    .reset_index()
)
tr_uni_fac.shape

(178578, 2)

Create one-hot dummy variables
= 원핫인코딩. 범주형 데이터를 각 컬럼으로 생성하여 1,0으로 구분


In [58]:
# Create dummy variables for DepartmentDescription.
# Rows with a positive ScanCount are expanded so that each row stands for one
# scanned unit, the department is one-hot encoded with pd.get_dummies, and the
# indicators are summed per visit and stored as sparse integer columns.
dept_rows = train_test.loc[
    train_test['ScanCount'] > 0,
    ['VisitNumber', 'DepartmentDescription', 'ScanCount'],
]
repeated_dept_index = np.repeat(dept_rows.index.to_numpy(), dept_rows['ScanCount'])
dept_units = dept_rows.loc[repeated_dept_index, ['VisitNumber', 'DepartmentDescription']]
tr_dept_dummy = (
    pd.get_dummies(dept_units, prefix='dept', columns=['DepartmentDescription'])
    .groupby('VisitNumber')
    .sum()
    .reset_index()
    .astype('Sparse[int64, 0]')
)
len(tr_dept_dummy.columns)

70

In [68]:
# Collect the Upc values that rank in the top 10 by ScanCount on any day.
# After sorting by day (ascending) and ScanCount (descending), a row is inside its
# day's top 10 exactly when the row 10 positions above it belongs to another day
# (or does not exist).
tr_pop_Upc_day = (
    train_test.groupby(['day_counter', 'Upc'])['ScanCount']
    .sum()
    .reset_index()
    .sort_values(['day_counter', 'ScanCount'], ascending=[True, False])
    .assign(shifted=lambda ranked: ranked['day_counter'].shift(10))
    .assign(keep=lambda ranked: ranked['day_counter'] != ranked['shifted'])
    .query('keep')
)
tr_pop_Upc_day_list = list(tr_pop_Upc_day['Upc'].unique())
len(tr_pop_Upc_day_list)

27

In [84]:
# Create dummy variables for the popular Upc values.
# Each matching row is repeated ScanCount times so that the per-visit sum of the
# one-hot columns counts scanned units.
pop_upc_rows = train_test.loc[
    train_test['Upc'].isin(tr_pop_Upc_day_list),
    ['VisitNumber', 'Upc', 'ScanCount'],
]
repeated_pop_index = np.repeat(pop_upc_rows.index.to_numpy(), pop_upc_rows['ScanCount'])
pop_upc_units = pop_upc_rows.loc[repeated_pop_index, ['VisitNumber', 'Upc']]
tr_pop_upc_dummy = (
    pd.get_dummies(pop_upc_units, prefix='pop_Upc', columns=['Upc'])
    .groupby('VisitNumber')
    .sum()
    .reset_index()
    .astype('Sparse[int64, 0]')
)
tr_pop_upc_dummy.shape

(45572, 28)

In [85]:
# Create weekday dummies: one row per visit, one indicator column per weekday
visit_weekday = train_test.groupby('VisitNumber')['Weekday'].first().reset_index()
tr_weekday_dummy = (
    pd.get_dummies(visit_weekday, prefix='Weekday', columns=['Weekday'])
    .astype('Sparse[int64, 0]')
)


(178578, 8)

In [91]:
# Create month-day dummies: one row per visit, one indicator column per day_counter value
visit_day = train_test.groupby('VisitNumber')['day_counter'].first().reset_index()
tr_monthday_dummy = (
    pd.get_dummies(visit_day, prefix='mday', columns=['day_counter'])
    .astype('Sparse[int64, 0]')
)
tr_monthday_dummy.shape

(178578, 32)

Calculate tf-idf for some variables

In [92]:
# Total number of distinct visits; serves as the "number of documents" for tf-idf
total_visits = len(np.unique(train_test['VisitNumber']))

In [106]:
# Number of distinct visits in which each Upc appears.
# Defined here so the cell also runs top-to-bottom on a fresh kernel: the name is
# otherwise only assigned in a later cell, which makes this display raise NameError.
Upc_total_visits = train_test.groupby('Upc').VisitNumber.apply(lambda x: len(np.unique(x))).reset_index()\
.rename(columns={'VisitNumber': 'Upc_total_visits'})
Upc_total_visits

Unnamed: 0,Upc,Upc_total_visits
0,0,5140
1,834,15
2,837,1
3,3032,1
4,3035,129
...,...,...
122832,978968151182,2
122833,978970627397,2
122834,978970627937,2
122835,978970666419,1


In [119]:
# The idf score is the log of the total number of documents divided by the number
# of documents that contain a given term. It gives a high weight to terms found in
# relatively few documents and a low weight to terms found in many.
#
# tf-idf (term frequency * inverse document frequency) is used here to weight products:
#   tf  = ScanCount of the Upc within the visit / total ScanCount of the visit
#   idf = log(total number of visits / number of distinct visits containing the Upc)
# A visit plays the role of a document and a Upc the role of a term.
# np.log is the natural logarithm, the usual choice for idf.

# Number of distinct visits in which each Upc appears (the "document frequency")
Upc_total_visits = train_test.groupby('Upc').VisitNumber.apply(lambda x: len(np.unique(x))).reset_index()\
.rename(columns={'VisitNumber': 'Upc_total_visits'})

# Aggregations are named as strings: passing np.sum / np.std to .agg is deprecated
# in recent pandas, and np.std would eventually switch to the population formula
# (ddof=0) instead of the sample standard deviation computed by 'std'.
tfidf_upc = (
    train_test
    .groupby(['VisitNumber', 'Upc'])
    .agg({'ScanCount': 'sum', 'SCountPerVisit': 'first'})
    .reset_index()
    .assign(tf=lambda x: x.ScanCount / x.SCountPerVisit)
    .merge(Upc_total_visits, how='left', on='Upc')
    .assign(idf=lambda x: np.log(total_visits / x.Upc_total_visits))
    .assign(Upc_tfidf=lambda x: x.tf * x.idf)
    .groupby('VisitNumber').Upc_tfidf.agg(['sum', 'std'])
    .reset_index()
    # std is NaN for visits with a single Upc; treat that as zero spread
    .fillna(0)
    .rename(columns={'sum': 'Upc_tfidf_sum', 'std': 'Upc_tfidf_std'})
)