In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import calendar
import numpy as np
import pandas as pd
import xgboost as xgb

In [None]:
# Load the labelled visits. Upc and FinelineNumber are kept as text so the
# raw digits survive parsing; they are converted to integers in later cells.
train = pd.read_csv(
    '~/Kaggle/walmart/train.csv',
    dtype=dict(Upc=str, FinelineNumber=str),
)

In [None]:
# Load the unlabelled visits with the same text dtypes used for train.csv.
test = pd.read_csv(
    '~/Kaggle/walmart/test.csv',
    dtype=dict(Upc=str, FinelineNumber=str),
)

In [None]:
# The label is taken to be the first column of train.csv
# (presumably TripType -- confirm against the displayed value below).
target_column = train.columns.tolist()[0]
target_column

In [None]:
# Stack train and test so every feature is engineered identically on both.
# pd.concat replaces DataFrame.append (removed in pandas 2.0), and
# ignore_index=True gives the combined frame a unique row index instead of
# repeating the labels 0..n-1 from each input.
# Re-selecting train.columns restores the training column order; test rows get
# NaN in the target column, which is how they are told apart later on.
data = pd.concat([train, test], ignore_index=True)[train.columns]

In [None]:
# Running list of the column names fed to the model; later cells add to it.
feature_columns = list()

In [None]:
# Number of rows recorded for each visit, broadcast back onto every row of
# that visit.
rows_per_visit = data.groupby('VisitNumber')['VisitNumber'].transform('count')
data['NumberItems'] = rows_per_visit
feature_columns.append('NumberItems')

In [None]:
# Map day names to 0..6 in calendar.day_name order (Monday first), then flag
# Monday-Friday rows. Names missing from the mapping become NaN and are
# therefore not flagged as weekdays.
day_number_by_name = dict(zip(calendar.day_name, range(7)))
weekday_number = data['Weekday'].map(day_number_by_name)
data['WeekdayNumber'] = weekday_number
data['IsWeekday'] = weekday_number.lt(5)
feature_columns.extend(['WeekdayNumber', 'IsWeekday'])

In [None]:
# Disabled experiment: length of the raw UPC string as a feature.
# data['LenUpc'] = data['Upc'].fillna('').apply(len)
# feature_columns += ['LenUpc']

# Upc was read as text; missing codes become '0' and every code is then
# parsed as an integer so it can be used as a numeric feature.
upc_text = data['Upc'].fillna('0')
data['Upc'] = upc_text.map(int)
feature_columns.append('Upc')

In [None]:
# Sum of ScanCount over all rows of a visit, broadcast back onto each row.
scans_by_visit = data.groupby('VisitNumber')['ScanCount']
data['TotalScanCount'] = scans_by_visit.transform('sum')
feature_columns.extend(['ScanCount', 'TotalScanCount'])

In [None]:
# One boolean indicator column per department, named after the department.
# Missing descriptions are dropped before enumerating: NaN never compares
# equal to anything, so keeping it would only add an all-False feature column
# labelled NaN. Rows with a missing description end up False in every
# indicator column.
departments = list(data['DepartmentDescription'].dropna().unique())
feature_columns += departments

for department in departments:
    data[department] = data['DepartmentDescription'] == department

In [None]:
# Disabled experiment: length of the raw fineline string as a feature.
# data['LenFinelineNumber'] = data['FinelineNumber'].fillna('').apply(len)
# feature_columns += ['LenFinelineNumber']

# FinelineNumber was read as text; missing values become '0' and every value
# is then parsed as an integer.
fineline_text = data['FinelineNumber'].fillna('0')
data['FinelineNumber'] = fineline_text.map(int)
feature_columns.append('FinelineNumber')

In [None]:
# Show the accumulated list of feature column names.
feature_columns

In [None]:
# Peek at the first rows of the combined frame (train rows were stacked first).
data.head()

In [None]:
# Peek at the last rows of the combined frame (test rows were stacked last).
data.tail()

In [None]:
# Rows that came from train.csv carry a label; rows from test.csv have NaN in
# the target column, so the null mask separates the two sets again.
train_mask = data[target_column].notnull()
train_rows = data[train_mask]
test_rows = data[~train_mask]

# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
X_train = train_rows[feature_columns].values
y_train = train_rows[target_column].values
X_test = test_rows[feature_columns].values

# Weight each training row by 1 / NumberItems so that the rows of one visit
# sum to a weight of 1 (NumberItems is the per-visit row count). The weights
# are restricted to the training rows so they line up one-to-one with X_train;
# previously they covered train and test rows together.
sample_weight = (1 / train_rows['NumberItems']).values
assert len(sample_weight) == len(y_train)

In [None]:
%%time
gbm = xgb.XGBClassifier().fit(X_train, y_train, sample_weight=sample_weight)

In [None]:
%time
predictions = gbm.predict_proba(X_test)

In [None]:
# Name each probability column after the trip type it stands for.
# The labels are taken from the training targets in sorted order, which is the
# order the classifier's label encoding assigns to its output columns. This
# avoids reaching into the private `gbm._le` attribute, which later xgboost
# releases no longer have.
trip_types = np.unique(y_train)
assert len(trip_types) == predictions.shape[1], 'class count does not match prediction width'
columns = ['TripType_{}'.format(int(t)) for t in trip_types]

In [None]:
# Put the visit id next to its row-level class probabilities. Both pieces are
# aligned on their row index, so this relies on `test` and the freshly built
# probability frame sharing the same 0..n-1 index.
probability_frame = pd.DataFrame(predictions, columns=columns)
submission = pd.concat([test['VisitNumber'], probability_frame], axis=1)

In [None]:
# Collapse to one row per visit by averaging the probabilities of its rows.
by_visit = submission.groupby('VisitNumber')
submission = by_visit.mean()

In [None]:
# Write the per-visit probabilities; the index (VisitNumber after the
# group-by above) is written as the first CSV column.
submission.to_csv(path_or_buf='submission.csv')