In [1]:
import numpy as np
import pandas as pd

## 1. Get the Data 

In [151]:
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

featured_train = pd.read_csv('data/grouped_train.csv', low_memory=False)
featured_test = pd.read_csv('data/grouped_test.csv', low_memory=False)

In [3]:
train[:5]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [4]:
train.dtypes

TripType                   int64
VisitNumber                int64
Weekday                   object
Upc                      float64
ScanCount                  int64
DepartmentDescription     object
FinelineNumber           float64
dtype: object

## 2. Feature Engineering

In [5]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in ['Weekday', 'DepartmentDescription']:
    train[c] = lbl_enc.fit_transform(train[c])
    test[c] = lbl_enc.transform(test[c])
    
train[:3]

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,0,68113152929,-1,21,1000
1,30,7,0,60538815980,1,63,8931
2,30,7,0,7410811099,1,51,4504


In [6]:
# Show unique department descriptions
departments = np.sort(train.DepartmentDescription.unique())

In [7]:
def build_features(data):
    for dep in departments:
        data['Department_' + str(dep)] = 0
        
    for index, row in data.iterrows():
        data.loc[index, 'Department_' + str(int(row.DepartmentDescription))] = 1

In [8]:
build_features(train)
build_features(test)

In [9]:
cols_tr = list(train.columns)
[cols_tr.remove(c) for c in ['Weekday', 'DepartmentDescription', 'Upc', 'FinelineNumber']]
cols_te = list(train.columns)
[cols_te.remove(c) for c in ['Weekday', 'TripType', 'DepartmentDescription', 'Upc', 'FinelineNumber']]

[None, None, None, None, None]

In [10]:
featured_train = train[cols_tr].groupby(['VisitNumber', 'TripType']).sum().reset_index()
featured_test = test[cols_te].groupby(['VisitNumber']).sum().reset_index()

In [158]:
cols = list(featured_train.columns)
cols.remove('VisitNumber')

## 3. Cross Validation 

In [35]:
def multiclass_log_loss(clf, y_true, y_pred, eps=1e-15):
    Y_true = ycv.apply(lambda x: np.where(rf.classes_==x)[0][0]).values
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), Y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota

In [166]:
from sklearn.cross_validation import train_test_split

xtrain, xcv, ytrain, ycv = train_test_split(featured_train[cols[1:]], featured_train['TripType'], test_size = 0.052,  random_state = 42)

#### 1. Try Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    max_features=16,
    n_jobs=2,
    oob_score=True
)
rf.fit(xtrain, ytrain)
predictionRF = rf.predict_proba(xcv)

In [23]:
multiclass_log_loss(rf, ycv, predictionRF)

1.2526831494360087

The best score is: 1.2526831494360087

#### 2. Try Logistic Regression 

In [285]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial'
)

lr.fit(xtrain, ytrain)
predictionLR = lr.predict_proba(xcv)

In [286]:
multiclass_log_loss(lr, ycv, predictionLR)

3.4526601797370922

#### 3. Try XGBoost multiclass

In [227]:
# Set params

params = {"objective": "multi:softprob",
          #"booster": "gbtree",
          "eta": 0.1,
          "max_depth": 5,
          "subsample": 0.7,
          "colsample_bytree": 0.7,
          "silent": 1,
          "num_class": 38,
          "eval_metric": 'mlogloss'
          }
num_trees = 500
stop = 20

In [228]:
labels = np.sort(ytrain.unique())

In [229]:
ytrain_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytrain])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

In [230]:
import xgboost as xgb
from sklearn.cross_validation import train_test_split

dtrain = xgb.DMatrix(xtrain.as_matrix(), label=ytrain_labeled)
dvalid = xgb.DMatrix(xcv.as_matrix(), label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:3.060632	train-mlogloss:3.048442
[1]	eval-mlogloss:2.733049	train-mlogloss:2.718265
[2]	eval-mlogloss:2.542099	train-mlogloss:2.524906
[3]	eval-mlogloss:2.373853	train-mlogloss:2.356514
[4]	eval-mlogloss:2.252983	train-mlogloss:2.232906
[5]	eval-mlogloss:2.144183	train-mlogloss:2.122234
[6]	eval-mlogloss:2.054298	train-mlogloss:2.030032
[7]	eval-mlogloss:1.970899	train-mlogloss:1.943988
[8]	eval-mlogloss:1.900977	train-mlogloss:1.872876
[9]	eval-mlogloss:1.829679	train-mlogloss:1.799765
[10]	eval-mlogloss:1.767051	train-mlogloss:1.735574
[11]	eval-mlogloss:1.713817	train-mlogloss:1.681560
[12]	eval-mlogloss:1.665678	train-mlogloss:1.631624
[13]	eval-mlogloss:1.621144	train-mlogloss:1.585907
[14]	eval-mlogloss:1.580132	train-mlogloss:1.543851
[15]	eval-mlogloss:1.542087	train-mlogloss:1.505094
[16]	eval-mlogloss:1.506959	train-mlogloss:1.468696
[17]	eval-mlogloss:1.472891	train-mlogloss:1.433193
[18]	eval-mlog

**The best score is:** 0.873472

## 4. Predict classes probabilities

#### 1. With RF

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=250,
    max_depth=20,
    max_features=16,
    # n_jobs=2,
    oob_score=True
)
rf.fit(featured_train[cols], featured_train['TripType'])
submitRF = rf.predict_proba(featured_test[cols])

In [111]:
submitRF

array([[  8.47336322e-05,   0.00000000e+00,   2.28909884e-04, ...,
          2.09900422e-03,   1.47259690e-03,   1.31402960e-02],
       [  2.36784164e-03,   7.34320380e-04,   1.58061994e-02, ...,
          1.96299542e-02,   5.01692983e-03,   1.39586652e-02],
       [  0.00000000e+00,   0.00000000e+00,   3.05188199e-05, ...,
          2.66920877e-05,   1.71591992e-05,   9.75645995e-01],
       ..., 
       [  5.03898222e-02,   2.12782057e-04,   5.36054290e-03, ...,
          1.45702693e-04,   5.26427125e-05,   3.88865881e-02],
       [  0.00000000e+00,   0.00000000e+00,   2.21086773e-04, ...,
          7.42077791e-03,   2.72459177e-02,   2.88190941e-03],
       [  1.09909018e-05,   4.05482709e-04,   2.61374524e-03, ...,
          3.88168257e-03,   6.54779274e-03,   3.77339845e-03]])

#### With XGBoost

In [231]:
test_columns = list(featured_test.columns)
test_columns.remove('VisitNumber')

In [232]:
dtest = xgb.DMatrix(featured_test[test_columns].as_matrix())
test_probs = gbm.predict(dtest)
indices = test_probs < 0
test_probs[indices] = 0

In [233]:
submit_XGB = test_probs

In [234]:
submit_XGB

array([[  1.20367549e-05,   4.27226769e-06,   1.45024285e-04, ...,
          2.91725248e-03,   5.21892798e-04,   2.45908406e-02],
       [  5.25757168e-05,   2.38046650e-05,   2.99324805e-04, ...,
          5.44686150e-03,   2.41568676e-04,   7.46783568e-03],
       [  1.99037615e-07,   9.66535794e-08,   2.67540440e-06, ...,
          5.93166874e-07,   1.36867095e-06,   9.99633312e-01],
       ..., 
       [  1.75808999e-03,   2.70718810e-05,   4.62111580e-04, ...,
          1.00745805e-04,   7.68516766e-05,   4.32697423e-02],
       [  4.70975465e-06,   9.00235648e-07,   3.46708184e-05, ...,
          1.36217745e-02,   1.56321861e-02,   1.19461806e-03],
       [  9.24449807e-07,   1.71101945e-07,   1.25795327e-06, ...,
          1.67042471e-03,   4.71482461e-04,   4.81910189e-04]], dtype=float32)

## 5. Submit probas 

In [235]:
cl_names =['VisitNumber']
for cls in rf.classes_:
    cl_names.append('TripType_' + str(cls))

In [236]:
submit = pd.DataFrame(columns=cl_names)
submit['VisitNumber'] = featured_test['VisitNumber']
submit[cl_names[1:]] = submit_XGB
submit[:5]

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,1.203675e-05,4.272268e-06,0.000145,0.00021,0.009562,0.015328,0.011489,0.0006573525,3.77623e-06,...,0.0005331435,0.001088904,0.301934,0.06305782,0.0002870736,0.006547401,0.009046,0.002917252,0.000522,0.024591
1,2,5.257572e-05,2.380467e-05,0.000299,0.00076,0.049258,0.043971,0.00443,0.002088601,1.009277e-05,...,0.002284032,0.001154415,0.037092,0.07990259,0.0001633226,0.002516618,0.010685,0.005446861,0.000242,0.007468
2,3,1.990376e-07,9.665358e-08,3e-06,2e-06,6.3e-05,8.7e-05,1.4e-05,7.759482e-07,1.869699e-08,...,0.000140089,1.323068e-06,3e-06,3.095897e-06,7.616079e-07,9.969655e-07,4e-06,5.931669e-07,1e-06,0.999633
3,4,0.0001588964,1.209551e-05,0.000631,0.000237,0.009547,0.097613,0.79757,4.851065e-05,3.610074e-05,...,0.0003519841,7.419621e-05,0.000467,0.0002430175,5.264738e-05,8.953391e-05,0.000948,9.658957e-05,7.5e-05,0.039259
4,6,3.410809e-07,5.584445e-08,1e-06,1e-06,3.8e-05,1.3e-05,3.5e-05,6.882497e-07,1.586814e-08,...,7.422587e-07,3.658921e-07,2e-06,9.353922e-07,4.810268e-07,1.88556e-06,9e-06,3.780883e-07,1e-06,0.999824


In [237]:
submit.to_csv('submit.csv', index=False)