In [82]:
import numpy as np
import pandas as pd

## 1. Get the Data 

In [83]:
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

In [84]:
train[:5]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [85]:
train.dtypes

TripType                   int64
VisitNumber                int64
Weekday                   object
Upc                      float64
ScanCount                  int64
DepartmentDescription     object
FinelineNumber           float64
dtype: object

## 2. Feature Engineering

In [86]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in ['Weekday', 'DepartmentDescription']:
    train[c] = lbl_enc.fit_transform(train[c])
    test[c] = lbl_enc.transform(test[c])
    
train[:3]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,0,68113152929,-1,21,1000
1,30,7,0,60538815980,1,63,8931
2,30,7,0,7410811099,1,51,4504


In [87]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [88]:
train = train.astype(int)
test = test.astype(int)

In [89]:
train_builded = pd.get_dummies(train, columns=['DepartmentDescription'], prefix=['Department'])
test_builded = pd.get_dummies(test, columns=['DepartmentDescription'], prefix=['Department'])

In [90]:
test_builded['Department_27'] = 0

In [91]:
test_builded[:5]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,FinelineNumber,Department_0,Department_1,Department_2,Department_3,Department_4,...,Department_60,Department_61,Department_62,Department_63,Department_64,Department_65,Department_66,Department_67,Department_68,Department_27
0,1,0,72503389714,1,3002,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,1707710732,1,1526,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,89470001026,1,1431,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,88491211470,1,3555,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,2840015224,1,4408,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [92]:
train_builded.fillna(0, inplace=True)
test_builded.fillna(0, inplace=True)

#### Make grouped data 

In [93]:
cols_tr = list(train_builded.columns)
[cols_tr.remove(c) for c in ['Upc', 'FinelineNumber']]
cols_te = list(test_builded.columns)
[cols_te.remove(c) for c in ['Upc', 'FinelineNumber']]

[None, None]

In [94]:
grouped_train = train_builded[cols_tr].groupby(['VisitNumber', 'TripType', 'Weekday']).sum().reset_index()
grouped_test = test_builded[cols_te].groupby(['VisitNumber', 'Weekday']).sum().reset_index()

## 3. Cross Validation 

In [101]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    classes = np.sort(train.TripType.unique())
    Y_true = y_true.apply(lambda x: np.where(classes==x)[0][0]).values
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), Y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota

In [117]:
from sklearn.cross_validation import train_test_split

xtrain, xcv, ytrain, ycv = train_test_split(grouped_train[cols_tr[2:]], grouped_train['TripType'], test_size = 0.052,  random_state = 42)

#### 1. Try Random Forest Classifier

In [118]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=21,
    max_features=17,
    n_jobs=2,
    oob_score=True
)
rf.fit(xtrain, ytrain)
predictionRF = rf.predict_proba(xcv)

In [119]:
multiclass_log_loss(ycv, predictionRF)

1.2368335090867641

The best score is: 1.2235092995470789

#### 2. Try XGBoost multiclass (gbtree)

#### XGB1 

In [285]:
# Set params

params = {"objective": "multi:softprob",
          "booster": "gbtree",
          "eta": 0.39,
          "max_depth": 100,
          "min_child_weight": 0.1,
          "subsample": 1,
          "colsample_bytree": 0.4,
          "silent": 1,
          "num_class": 38,
          "seed": 4,
          "eval_metric": 'mlogloss'
          }
num_trees = 5
stop = 20

In [286]:
labels = np.sort(ytrain.unique())

In [287]:
ytrain_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytrain])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

In [288]:
import xgboost as xgb
from sklearn.cross_validation import train_test_split

dtrain = xgb.DMatrix(xtrain.values, label=ytrain_labeled)
dvalid = xgb.DMatrix(xcv.values, label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:2.338542	train-mlogloss:2.239893
[1]	eval-mlogloss:1.922698	train-mlogloss:1.738426
[2]	eval-mlogloss:1.741028	train-mlogloss:1.487123
[3]	eval-mlogloss:1.549916	train-mlogloss:1.246863
[4]	eval-mlogloss:1.476564	train-mlogloss:1.129317


In [289]:
dcv = xgb.DMatrix(xcv.values)
XGB1 = gbm.predict(dcv)
indices = XGB1 < 0
XGB1[indices] = 0

In [290]:
multiclass_log_loss(ycv, XGB1)

1.4765637368218332

**The best score is:** 0.864797

#### XGB2 

In [293]:
# Set params
import xgboost as xgb
from sklearn.cross_validation import train_test_split

params = {"objective": "multi:softprob",
          "booster": "gbtree",
          "eta": 0.2,
          "max_depth": 15,
          "min_child_weight": 15,
          "subsample": 1,
          "colsample_bytree": 0.9,
          "silent": 1,
          "num_class": 38,
          "seed": 5,
          "eval_metric": 'mlogloss'
          }
num_trees = 35
stop = 20

labels = np.sort(ytrain.unique())
ytrain_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytrain])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

dtrain = xgb.DMatrix(xtrain.values, label=ytrain_labeled)
dvalid = xgb.DMatrix(xcv.values, label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:2.226817	train-mlogloss:2.203922
[1]	eval-mlogloss:1.945919	train-mlogloss:1.916536
[2]	eval-mlogloss:1.759757	train-mlogloss:1.722979
[3]	eval-mlogloss:1.624922	train-mlogloss:1.580562
[4]	eval-mlogloss:1.520053	train-mlogloss:1.471578
[5]	eval-mlogloss:1.436654	train-mlogloss:1.383192
[6]	eval-mlogloss:1.362805	train-mlogloss:1.303988
[7]	eval-mlogloss:1.301918	train-mlogloss:1.238419
[8]	eval-mlogloss:1.252194	train-mlogloss:1.183875
[9]	eval-mlogloss:1.207868	train-mlogloss:1.134984
[10]	eval-mlogloss:1.172417	train-mlogloss:1.093741
[11]	eval-mlogloss:1.142192	train-mlogloss:1.059772
[12]	eval-mlogloss:1.114445	train-mlogloss:1.027803
[13]	eval-mlogloss:1.092733	train-mlogloss:1.000501
[14]	eval-mlogloss:1.073508	train-mlogloss:0.977025
[15]	eval-mlogloss:1.055005	train-mlogloss:0.955363
[16]	eval-mlogloss:1.038287	train-mlogloss:0.936060
[17]	eval-mlogloss:1.024724	train-mlogloss:0.919284
[18]	eval-mlog

In [295]:
dcv = xgb.DMatrix(xcv.values)
XGB2 = gbm.predict(dcv)
indices = XGB2 < 0
XGB2[indices] = 0

multiclass_log_loss(ycv, XGB2)

0.91863965193154362

#### XGB3 

In [296]:
# Set params
import xgboost as xgb
from sklearn.cross_validation import train_test_split

params = {"objective": "multi:softprob",
          "booster": "gbtree",
          "eta": 0.05,
          "max_depth": 5,
          "min_child_weight": 10,
          "subsample": 1,
          "colsample_bytree": 1,
          "silent": 1,
          "num_class": 38,
          "seed": 1,
          "eval_metric": 'mlogloss'
          }
num_trees = 700
stop = 20

labels = np.sort(ytrain.unique())
ytrain_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytrain])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

dtrain = xgb.DMatrix(xtrain.values, label=ytrain_labeled)
dvalid = xgb.DMatrix(xcv.values, label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:3.255256	train-mlogloss:3.251645
[1]	eval-mlogloss:3.029601	train-mlogloss:3.025386
[2]	eval-mlogloss:2.860281	train-mlogloss:2.853860
[3]	eval-mlogloss:2.722080	train-mlogloss:2.715069
[4]	eval-mlogloss:2.605189	train-mlogloss:2.597332
[5]	eval-mlogloss:2.504069	train-mlogloss:2.495484
[6]	eval-mlogloss:2.415589	train-mlogloss:2.406080
[7]	eval-mlogloss:2.335830	train-mlogloss:2.325943
[8]	eval-mlogloss:2.265009	train-mlogloss:2.254093
[9]	eval-mlogloss:2.200358	train-mlogloss:2.188171
[10]	eval-mlogloss:2.140880	train-mlogloss:2.128107
[11]	eval-mlogloss:2.086666	train-mlogloss:2.072885
[12]	eval-mlogloss:2.036414	train-mlogloss:2.021779
[13]	eval-mlogloss:1.989563	train-mlogloss:1.974323
[14]	eval-mlogloss:1.945999	train-mlogloss:1.930036
[15]	eval-mlogloss:1.905346	train-mlogloss:1.888112
[16]	eval-mlogloss:1.867122	train-mlogloss:1.849287
[17]	eval-mlogloss:1.831250	train-mlogloss:1.812668
[18]	eval-mlog

In [298]:
dcv = xgb.DMatrix(xcv.values)
XGB3 = gbm.predict(dcv)
indices = XGB3 < 0
XGB3[indices] = 0

multiclass_log_loss(ycv, XGB3)

0.88041952343850272

#### XGB4 

In [300]:
# Set params
import xgboost as xgb
from sklearn.cross_validation import train_test_split

params = {"objective": "multi:softprob",
          "booster": "gbtree",
          "eta": 0.02,
          "max_depth": 5,
          "min_child_weight": 8,
          "subsample": 1,
          "colsample_bytree": 1,
          "silent": 1,
          "num_class": 38,
          "seed": 1,
          "eval_metric": 'mlogloss'
          }
num_trees = 1500
stop = 20

labels = np.sort(ytrain.unique())
ytrain_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytrain])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

dtrain = xgb.DMatrix(xtrain.values, label=ytrain_labeled)
dvalid = xgb.DMatrix(xcv.values, label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:3.481587	train-mlogloss:3.480039
[1]	eval-mlogloss:3.357503	train-mlogloss:3.354848
[2]	eval-mlogloss:3.252057	train-mlogloss:3.249119
[3]	eval-mlogloss:3.160614	train-mlogloss:3.156922
[4]	eval-mlogloss:3.078926	train-mlogloss:3.074862
[5]	eval-mlogloss:3.005603	train-mlogloss:3.000935
[6]	eval-mlogloss:2.938670	train-mlogloss:2.933552
[7]	eval-mlogloss:2.877141	train-mlogloss:2.871518
[8]	eval-mlogloss:2.820802	train-mlogloss:2.814225
[9]	eval-mlogloss:2.767588	train-mlogloss:2.760532
[10]	eval-mlogloss:2.718107	train-mlogloss:2.710480
[11]	eval-mlogloss:2.671660	train-mlogloss:2.663514
[12]	eval-mlogloss:2.627557	train-mlogloss:2.618994
[13]	eval-mlogloss:2.585880	train-mlogloss:2.576887
[14]	eval-mlogloss:2.546400	train-mlogloss:2.537098
[15]	eval-mlogloss:2.508804	train-mlogloss:2.499040
[16]	eval-mlogloss:2.472780	train-mlogloss:2.462722
[17]	eval-mlogloss:2.438577	train-mlogloss:2.428027
[18]	eval-mlog

In [301]:
dcv = xgb.DMatrix(xcv.values)
XGB4 = gbm.predict(dcv)
indices = XGB4 < 0
XGB4[indices] = 0

multiclass_log_loss(ycv, XGB4)

0.88091455060306534

#### XGB5 

In [303]:
# Set params
import xgboost as xgb
from sklearn.cross_validation import train_test_split

params = {"objective": "multi:softprob",
          "booster": "gbtree",
          "eta": 0.5,
          "max_depth": 155,
          "min_child_weight": 0.1,
          "subsample": 1,
          "colsample_bytree": 0.4,
          "silent": 1,
          "num_class": 38,
          "seed": 4,
          "eval_metric": 'mlogloss'
          }
num_trees = 7
stop = 20

labels = np.sort(ytrain.unique())
ytrain_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytrain])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

dtrain = xgb.DMatrix(xtrain.values, label=ytrain_labeled)
dvalid = xgb.DMatrix(xcv.values, label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:2.267635	train-mlogloss:2.137937
[1]	eval-mlogloss:1.741737	train-mlogloss:1.473946
[2]	eval-mlogloss:1.611978	train-mlogloss:1.269854
[3]	eval-mlogloss:1.491831	train-mlogloss:1.076345
[4]	eval-mlogloss:1.398134	train-mlogloss:0.950537
[5]	eval-mlogloss:1.335281	train-mlogloss:0.839885
[6]	eval-mlogloss:1.286330	train-mlogloss:0.743229


In [304]:
dcv = xgb.DMatrix(xcv.values)
XGB5 = gbm.predict(dcv)
indices = XGB5 < 0
XGB5[indices] = 0

multiclass_log_loss(ycv, XGB5)

1.2863292725661273

#### 3. Try an ensemble 

In [964]:
vtest = [XGB1, XGB2]
wtest = [XGB5, XGB3, XGB4]

# good fore the best prediction
# ensemble = (alpha * predictionsXGB ** 1.083 + beta * predictionsXGB ** 1.07) / 2




ensemble = (0.052 * XGB1 ** 1.64 + \
            0.15 * XGB2 ** 0.97 + \
            0.04 * XGB5 ** 3.5 + \
            0.41 * XGB3 ** 0.9 * XGB2 ** 0.04 * XGB1 ** 0.03 + \
            0.5 * XGB4 * XGB3 ** 0.1) ** 1.07
multiclass_log_loss(ycv, ensemble)

0.87486347004225207

**The best score is** 0.87486347004225207

## 4. Predict classes probabilities

#### 1. With RF

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=250,
    max_depth=20,
    max_features=16,
    # n_jobs=2,
    oob_score=True
)
rf.fit(featured_train[cols], featured_train['TripType'])
submitRF = rf.predict_proba(featured_test[cols])

In [111]:
submitRF

array([[  8.47336322e-05,   0.00000000e+00,   2.28909884e-04, ...,
          2.09900422e-03,   1.47259690e-03,   1.31402960e-02],
       [  2.36784164e-03,   7.34320380e-04,   1.58061994e-02, ...,
          1.96299542e-02,   5.01692983e-03,   1.39586652e-02],
       [  0.00000000e+00,   0.00000000e+00,   3.05188199e-05, ...,
          2.66920877e-05,   1.71591992e-05,   9.75645995e-01],
       ..., 
       [  5.03898222e-02,   2.12782057e-04,   5.36054290e-03, ...,
          1.45702693e-04,   5.26427125e-05,   3.88865881e-02],
       [  0.00000000e+00,   0.00000000e+00,   2.21086773e-04, ...,
          7.42077791e-03,   2.72459177e-02,   2.88190941e-03],
       [  1.09909018e-05,   4.05482709e-04,   2.61374524e-03, ...,
          3.88168257e-03,   6.54779274e-03,   3.77339845e-03]])

#### With XGBoost

In [275]:
test_columns = list(featured_test.columns)
test_columns.remove('VisitNumber')

In [276]:
dtest = xgb.DMatrix(featured_test[test_columns].as_matrix())
test_probs = gbm.predict(dtest)
indices = test_probs < 0
test_probs[indices] = 0

In [277]:
submit_XGB = test_probs

In [278]:
submit_XGB

array([[  8.48208128e-06,   1.76258447e-06,   8.20940695e-05, ...,
          2.89405347e-03,   4.16647701e-04,   2.63160635e-02],
       [  1.79485178e-05,   1.49473735e-05,   2.56662024e-04, ...,
          4.10267012e-03,   1.52383989e-04,   9.38953366e-03],
       [  2.37433273e-08,   2.53936534e-08,   1.57850184e-06, ...,
          9.41496907e-08,   3.33652139e-07,   9.99707401e-01],
       ..., 
       [  1.19914056e-03,   2.15627806e-05,   4.04380087e-04, ...,
          7.10017848e-05,   4.44339821e-05,   3.76716368e-02],
       [  2.46197231e-07,   1.03446958e-07,   5.35441677e-06, ...,
          4.48310422e-03,   1.24673927e-02,   5.12472878e-04],
       [  7.59113874e-08,   1.53634989e-08,   8.82566624e-08, ...,
          6.74853276e-04,   3.06470611e-04,   2.17852241e-04]], dtype=float32)

## 5. Submit probas 

In [279]:
cl_names =['VisitNumber']
for cls in rf.classes_:
    cl_names.append('TripType_' + str(cls))

In [280]:
submit = pd.DataFrame(columns=cl_names)
submit['VisitNumber'] = featured_test['VisitNumber']
submit[cl_names[1:]] = submit_XGB
submit[:5]

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,8.482081e-06,1.762584e-06,8.209407e-05,0.0001402805,0.006414,0.01098,0.006802,0.0006161014,1.30773e-06,...,0.0005314536,0.00137811,0.3152148,0.05435668,0.000330255,0.005865734,0.007142,0.002894053,0.0004166477,0.026316
1,2,1.794852e-05,1.494737e-05,0.000256662,0.000936985,0.031214,0.040327,0.003835,0.002634516,4.183709e-06,...,0.002561774,0.001491819,0.04334561,0.09160373,0.0001948045,0.001775917,0.008924,0.00410267,0.000152384,0.00939
2,3,2.374333e-08,2.539365e-08,1.578502e-06,1.37212e-06,4.7e-05,7.3e-05,3e-06,1.946458e-07,8.891202e-10,...,0.0001400732,5.527077e-07,1.380433e-06,7.497422e-07,2.84247e-07,3.239047e-07,2e-06,9.414969e-08,3.336521e-07,0.999707
3,4,0.0001306773,5.348821e-06,0.0004838282,0.0001563608,0.009294,0.079691,0.821099,2.252554e-05,8.890865e-06,...,0.0004102861,4.788312e-05,0.0002549397,0.0001961299,3.21165e-05,5.132495e-05,0.000596,4.645357e-05,3.616352e-05,0.035303
4,6,4.104282e-08,6.492356e-09,5.132719e-07,5.875433e-07,1.3e-05,4e-06,1.3e-05,1.437678e-07,7.095804e-10,...,2.697614e-07,6.958407e-08,5.995254e-07,1.604568e-07,1.304756e-07,8.175431e-07,4e-06,6.307836e-08,3.218855e-07,0.999933


In [281]:
submit.to_csv('submit.csv', index=False)