In [1]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import calendar
import numpy as np
import pandas as pd
import xgboost as xgb

In [2]:
train = pd.read_csv('~/Kaggle/walmart/train.csv', dtype={'Upc': str, 'FinelineNumber': str})

In [3]:
test = pd.read_csv('~/Kaggle/walmart/test.csv', dtype={'Upc': str, 'FinelineNumber': str})

In [4]:
target_column = train.columns[0]
target_column

'TripType'

In [5]:
data = train.append(test)[train.columns]

In [6]:
feature_columns = []

In [7]:
data['NumberItems'] = data.groupby('VisitNumber')['VisitNumber'].transform('count')
feature_columns += ['NumberItems']

In [8]:
day_number_by_name = {d: n for n, d in enumerate(calendar.day_name)}
data['WeekdayNumber'] = data['Weekday'].map(day_number_by_name)
data['IsWeekday'] = data['WeekdayNumber'] < 5
feature_columns += ['WeekdayNumber', 'IsWeekday']

In [9]:
data['TotalScanCount'] = data.groupby('VisitNumber')['ScanCount'].transform('sum')
feature_columns += ['ScanCount', 'TotalScanCount']

In [10]:
departments = list(data['DepartmentDescription'].unique())
feature_columns += departments

for department in departments:
    data[department] = data['DepartmentDescription'] == department

In [11]:
feature_columns

['NumberItems',
 'WeekdayNumber',
 'IsWeekday',
 'ScanCount',
 'TotalScanCount',
 'FINANCIAL SERVICES',
 'SHOES',
 'PERSONAL CARE',
 'PAINT AND ACCESSORIES',
 'DSD GROCERY',
 'MEAT - FRESH & FROZEN',
 'DAIRY',
 'PETS AND SUPPLIES',
 'HOUSEHOLD CHEMICALS/SUPP',
 nan,
 'IMPULSE MERCHANDISE',
 'PRODUCE',
 'CANDY, TOBACCO, COOKIES',
 'GROCERY DRY GOODS',
 'BOYS WEAR',
 'FABRICS AND CRAFTS',
 'JEWELRY AND SUNGLASSES',
 'MENS WEAR',
 'ACCESSORIES',
 'HOME MANAGEMENT',
 'FROZEN FOODS',
 'SERVICE DELI',
 'INFANT CONSUMABLE HARDLINES',
 'PRE PACKED DELI',
 'COOK AND DINE',
 'PHARMACY OTC',
 'LADIESWEAR',
 'COMM BREAD',
 'BAKERY',
 'HOUSEHOLD PAPER GOODS',
 'CELEBRATION',
 'HARDWARE',
 'BEAUTY',
 'AUTOMOTIVE',
 'BOOKS AND MAGAZINES',
 'SEAFOOD',
 'OFFICE SUPPLIES',
 'LAWN AND GARDEN',
 'SHEER HOSIERY',
 'WIRELESS',
 'BEDDING',
 'BATH AND SHOWER',
 'HORTICULTURE AND ACCESS',
 'HOME DECOR',
 'TOYS',
 'INFANT APPAREL',
 'LADIES SOCKS',
 'PLUS AND MATERNITY',
 'ELECTRONICS',
 'GIRLS WEAR, 4-6X  AND 

In [12]:
data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,NumberItems,WeekdayNumber,IsWeekday,...,SWIMWEAR/OUTERWEAR,OTHER DEPARTMENTS,MEDIA AND GAMING,FURNITURE,OPTICAL - LENSES,SEASONAL,LARGE HOUSEHOLD GOODS,1-HR PHOTO,CONCEPT STORES,HEALTH AND BEAUTY AIDS
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000,1,4,True,...,False,False,False,False,False,False,False,False,False,False
1,30,7,Friday,60538815980,1,SHOES,8931,2,4,True,...,False,False,False,False,False,False,False,False,False,False
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504,2,4,True,...,False,False,False,False,False,False,False,False,False,False
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565,23,4,True,...,False,False,False,False,False,False,False,False,False,False
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017,23,4,True,...,False,False,False,False,False,False,False,False,False,False


In [13]:
data.tail()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber,NumberItems,WeekdayNumber,IsWeekday,...,SWIMWEAR/OUTERWEAR,OTHER DEPARTMENTS,MEDIA AND GAMING,FURNITURE,OPTICAL - LENSES,SEASONAL,LARGE HOUSEHOLD GOODS,1-HR PHOTO,CONCEPT STORES,HEALTH AND BEAUTY AIDS
653641,,191348,Sunday,66572105763,1,BATH AND SHOWER,1505,7,6,False,...,False,False,False,False,False,False,False,False,False,False
653642,,191348,Sunday,88181390024,1,BATH AND SHOWER,1099,7,6,False,...,False,False,False,False,False,False,False,False,False,False
653643,,191348,Sunday,4282557050,1,MENS WEAR,8220,7,6,False,...,False,False,False,False,False,False,False,False,False,False
653644,,191348,Sunday,80469193740,1,SWIMWEAR/OUTERWEAR,114,7,6,False,...,True,False,False,False,False,False,False,False,False,False
653645,,191348,Sunday,7871535983,1,MENS WEAR,4923,7,6,False,...,False,False,False,False,False,False,False,False,False,False


In [14]:
train_mask = data[target_column].notnull()
X_train = data[train_mask][feature_columns].as_matrix()
y_train = data[train_mask][target_column].as_matrix()
X_test = data[~train_mask][feature_columns].as_matrix()

In [15]:
%%time
gbm = xgb.XGBClassifier().fit(X_train, y_train)

CPU times: user 14min 42s, sys: 1.39 s, total: 14min 44s
Wall time: 14min 44s


In [16]:
%time
predictions = gbm.predict_proba(X_test)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.2 µs


In [17]:
columns = ['TripType_{}'.format(int(t)) for t in gbm._le.inverse_transform(range(predictions.shape[1]))]

In [18]:
submission = pd.concat([test['VisitNumber'], pd.DataFrame(predictions, columns=columns)], axis=1)

In [19]:
submission = submission.groupby('VisitNumber').mean()

In [20]:
submission.to_csv('submission.csv')