In [1]:
import calendar
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
# Load the training set. Upc and FinelineNumber are forced to str so pandas
# keeps them as text instead of inferring a numeric dtype.
train = pd.read_csv(
    '~/Kaggle/walmart/train.csv',
    dtype={
        'Upc': str,
        'FinelineNumber': str,
    },
)

In [3]:
# Load the test set with the same dtype overrides used for the training set:
# Upc and FinelineNumber are read as text rather than numbers.
test = pd.read_csv(
    '~/Kaggle/walmart/test.csv',
    dtype={
        'Upc': str,
        'FinelineNumber': str,
    },
)

In [4]:
# The label to predict is taken to be the first column of the training frame.
target_column = train.columns.tolist()[0]
target_column

'TripType'

In [5]:
# Stack train and test into one frame so features are engineered identically
# for both. DataFrame.append was deprecated and later removed from pandas;
# pd.concat is the supported equivalent. Columns missing from test (the
# target) are filled with NaN, and re-selecting train.columns restores the
# training column order. The original row labels of each frame are kept, so
# the combined index contains duplicates.
data = pd.concat([train, test])[train.columns]

In [6]:
# Names of the engineered columns fed to the model; later cells extend this list.
feature_columns = list()

In [7]:
# Number of rows recorded for each VisitNumber, broadcast back onto every row
# of that visit.
data['NumberItems'] = data.groupby('VisitNumber')['VisitNumber'].transform('count')
# Register the feature only if it is not listed yet, so re-running this cell
# does not duplicate it (which would duplicate a column of the model matrix).
feature_columns += [name for name in ['NumberItems'] if name not in feature_columns]

In [8]:
# Map day names to numbers (Monday=0 ... Sunday=6, the order of calendar.day_name).
# NOTE: calendar.day_name yields names in the current locale, so this lookup
# assumes an English locale; names that do not match map to NaN.
day_number_by_name = {d: n for n, d in enumerate(calendar.day_name)}
data['WeekdayNumber'] = data['Weekday'].map(day_number_by_name)
# Monday..Friday are numbers 0..4; Saturday and Sunday are 5 and 6.
data['IsWeekday'] = data['WeekdayNumber'] < 5
# Register each feature only once so re-running this cell does not duplicate
# entries in feature_columns.
feature_columns += [
    name for name in ['WeekdayNumber', 'IsWeekday'] if name not in feature_columns
]

In [9]:
# Sum of ScanCount over all rows of a visit, broadcast back onto every row of
# that visit.
data['TotalScanCount'] = data.groupby('VisitNumber')['ScanCount'].transform('sum')
# Register the feature only if it is not listed yet, so re-running this cell
# does not duplicate it (which would duplicate a column of the model matrix).
feature_columns += [name for name in ['TotalScanCount'] if name not in feature_columns]

In [10]:
# Show the feature names accumulated by the cells above.
feature_columns

['NumberItems', 'WeekdayNumber', 'IsWeekday', 'TotalScanCount']

In [11]:
# Inspect the first rows of the combined frame, including the engineered columns.
data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,NumberItems,WeekdayNumber,IsWeekday,TotalScanCount
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,1,4,True,-1
1,30,7,Friday,60538815980,1,SHOES,8931,2,4,True,2
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504,2,4,True,2
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,23,4,True,28
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,23,4,True,28


In [12]:
# Inspect the last rows of the combined frame; test rows were appended after
# the training rows, so TripType is expected to be missing here.
data.tail()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,NumberItems,WeekdayNumber,IsWeekday,TotalScanCount
653641,,191348,Sunday,66572105763,1,BATH AND SHOWER,1505,7,6,False,7
653642,,191348,Sunday,88181390024,1,BATH AND SHOWER,1099,7,6,False,7
653643,,191348,Sunday,4282557050,1,MENS WEAR,8220,7,6,False,7
653644,,191348,Sunday,80469193740,1,SWIMWEAR/OUTERWEAR,114,7,6,False,7
653645,,191348,Sunday,7871535983,1,MENS WEAR,4923,7,6,False,7


In [13]:
# Rows with a non-null target are used for training; rows whose target is
# null (the appended test rows, which have no label) are used for prediction.
train_mask = data[target_column].notnull()
# DataFrame.as_matrix() was deprecated and later removed from pandas; .values
# is the replacement that works on both old and new releases. A single .loc
# selection replaces the chained data[mask][cols] indexing.
# IsWeekday is boolean while the other features are numeric, so the features
# are cast to float to get one numeric dtype instead of an object array.
X_train = data.loc[train_mask, feature_columns].astype(float).values
y_train = data.loc[train_mask, target_column].values
X_test = data.loc[~train_mask, feature_columns].astype(float).values

In [14]:
%%time
# Fit an xgboost classifier with its default hyperparameters on the training
# matrix; %%time reports how long the whole cell takes.
# NOTE(review): y_train holds the raw target values (floats, because the
# combined frame has NaN targets for test rows); some xgboost releases expect
# class labels encoded as 0..n_classes-1 -- confirm against the installed version.
gbm = xgb.XGBClassifier().fit(X_train, y_train)

CPU times: user 10min 47s, sys: 2.27 s, total: 10min 49s
Wall time: 10min 54s


In [15]:
%time
predictions = gbm.predict_proba(X_test)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.91 µs


In [16]:
# Display the array returned by predict_proba for the test rows.
predictions

array([[ 0.0036874 ,  0.00396368,  0.05629753, ...,  0.01655755,
         0.001817  ,  0.016499  ],
       [ 0.0036874 ,  0.00396368,  0.05629753, ...,  0.01655755,
         0.001817  ,  0.016499  ],
       [ 0.0036874 ,  0.00396368,  0.05629753, ...,  0.01655755,
         0.001817  ,  0.016499  ],
       ..., 
       [ 0.00027287,  0.00223752,  0.02518542, ...,  0.03384478,
         0.00808891,  0.00726072],
       [ 0.00027287,  0.00223752,  0.02518542, ...,  0.03384478,
         0.00808891,  0.00726072],
       [ 0.00027287,  0.00223752,  0.02518542, ...,  0.03384478,
         0.00808891,  0.00726072]], dtype=float32)