In [147]:
import numpy as np
import pandas as pd

In [3]:
import gc
gc.enable()

## 1. Get the Data 

In [149]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [150]:
train[:5]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [151]:
train.dtypes

TripType                   int64
VisitNumber                int64
Weekday                   object
Upc                      float64
ScanCount                  int64
DepartmentDescription     object
FinelineNumber           float64
dtype: object

## 2. Feature Engineering

In [152]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in ['Weekday', 'DepartmentDescription']:
    train[c] = lbl_enc.fit_transform(train[c])
    test[c] = lbl_enc.transform(test[c])
    
train[:3]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,0,68113152929,-1,21,1000
1,30,7,0,60538815980,1,63,8931
2,30,7,0,7410811099,1,51,4504


In [153]:
train.isnull().sum(axis=0)

TripType                    0
VisitNumber                 0
Weekday                     0
Upc                      4129
ScanCount                   0
DepartmentDescription       0
FinelineNumber           4129
dtype: int64

In [154]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='mean')
train = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)
test = pd.DataFrame(imputer.fit_transform(test), columns=test.columns)

In [155]:
train = train.astype(int)
test = test.astype(int)

In [156]:
train.dtypes

TripType                 int32
VisitNumber              int32
Weekday                  int32
Upc                      int32
ScanCount                int32
DepartmentDescription    int32
FinelineNumber           int32
dtype: object

In [166]:
%%time
train_builded = pd.get_dummies(train, columns=['TripType', 'DepartmentDescription'], prefix=['TripType', 'Department'])
test_builded = pd.get_dummies(test, columns=['DepartmentDescription'], prefix=['Department'])

train_builded['DepartmentDescription'] = train['DepartmentDescription']
test_builded['DepartmentDescription'] = test['DepartmentDescription']

Wall time: 3.29 s


In [167]:
train[:5]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,0,-2147483648,-1,21,1000
1,30,7,0,-2147483648,1,63,8931
2,30,7,0,-2147483648,1,51,4504
3,26,8,0,-2147483648,2,50,3565
4,26,8,0,2006613744,2,50,1017


In [None]:
train_pivot = pd.pivot_table(train, values='ScanCount', index='VisitNumber', columns='FinelineNumber')

In [None]:
%%time
train_builded.fillna(0, inplace=True)
test_builded.fillna(0, inplace=True)

In [160]:
train_builded[:3]

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,FinelineNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,...,Department_60,Department_61,Department_62,Department_63,Department_64,Department_65,Department_66,Department_67,Department_68,DepartmentDescription
0,5,0,-2147483648,-1,1000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21
1,7,0,-2147483648,1,8931,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,63
2,7,0,-2147483648,1,4504,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,51


In [161]:
cols_tr = list(train_builded.columns)
cols_te = list(test.columns)
cols_to_remove = ['Upc', 'FinelineNumber']
[cols_tr.remove(c) for c in cols_to_remove]
[cols_te.remove(c) for c in cols_to_remove]

train_builded = train_builded[cols_tr]
test_builded = test[cols_te]

In [162]:
for col in cols_tr:
    if col not in cols_te:
        test_builded[col] = 0
        cols_te.append(col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [163]:
for col in cols_te:
    if col not in cols_tr:
        train_builded[col] = 0
        cols_tr.append(col)

In [164]:
print len(cols_tr)
print len(cols_te)

111
111


In [165]:
%%time
train_builded = train_builded.reindex_axis(sorted(cols_tr), axis=1)
test_builded = test_builded.reindex_axis(sorted(cols_tr), axis=1)

Wall time: 854 ms


In [77]:
%%time
sorted_depts = sorted(train_builded.FinelineNumber.unique())
department_means = []
for i in range(len(train_builded.DepartmentDescription.unique())):
    dep = sorted_depts[i]
    group_len = len(train_builded[train_builded.DepartmentDescription == dep])
    one_group = train_builded[train_builded.DepartmentDescription == dep].groupby('DepartmentDescription', as_index=False).sum() / group_len
    department_means.append(one_group[one_group.columns[71:109]].values[0])

def get_triptypes_list(dep):
    idx = sorted_depts.index(dep)
    return department_means[idx]

def get_triptypes_pandas(series):
    array = []
    for el in series.values:
        array.append(get_triptypes_list(el))
    return np.array(array)

Wall time: 2.49 s


In [78]:
%%time
train_builded[train_builded.columns[71:109]] = get_triptypes_pandas(train_builded.DepartmentDescription)
test_builded[test_builded.columns[71:109]] = get_triptypes_pandas(test_builded.DepartmentDescription)

Wall time: 20.5 s


In [79]:
train_builded[:5]

Unnamed: 0,DepartmentDescription,Department_0,Department_1,Department_10,Department_11,Department_12,Department_13,Department_14,Department_15,Department_16,...,TripType_43,TripType_44,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_999,VisitNumber,Weekday
0,21,0,0,0,0,0,0,0,0,0,...,0.00858,0.01203,0.004663,0.001306,0.006062,0.0,0.005689,0.213466,5,0
1,63,0,0,0,0,0,0,0,0,0,...,0.018036,0.068208,0.006558,0.002131,0.004755,0.008198,0.104279,0.048533,7,0
2,51,0,0,0,0,0,0,0,0,0,...,0.012462,0.044247,0.021969,0.001739,0.004908,0.063142,0.008173,0.015654,7,0
3,50,0,0,0,0,0,0,0,0,0,...,0.034302,0.077907,0.005814,0.000581,0.006977,0.013953,0.115698,0.04593,8,0
4,50,0,0,0,0,0,0,0,0,0,...,0.034302,0.077907,0.005814,0.000581,0.006977,0.013953,0.115698,0.04593,8,0


#### Make grouped data 

In [80]:
train_builded['Counter'] = 1
test_builded['Counter'] = 1

newcols = list(train_builded.columns)
newcols.remove('DepartmentDescription')
train_builded = train_builded[newcols]
test_builded = test_builded[newcols]

In [81]:
train_builded[:3]

Unnamed: 0,Department_0,Department_1,Department_10,Department_11,Department_12,Department_13,Department_14,Department_15,Department_16,Department_17,...,TripType_44,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_999,VisitNumber,Weekday,Counter
0,0,0,0,0,0,0,0,0,0,0,...,0.01203,0.004663,0.001306,0.006062,0.0,0.005689,0.213466,5,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0.068208,0.006558,0.002131,0.004755,0.008198,0.104279,0.048533,7,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0.044247,0.021969,0.001739,0.004908,0.063142,0.008173,0.015654,7,0,1


In [82]:
%%time
train_grouped = train_builded.groupby(['VisitNumber', 'Weekday'], as_index=False).sum()
test_grouped = test_builded.groupby(['VisitNumber', 'Weekday'], as_index=False).sum()

Wall time: 2.56 s


In [84]:
train_grouped[:4]

Unnamed: 0,VisitNumber,Weekday,Department_0,Department_1,Department_10,Department_11,Department_12,Department_13,Department_14,Department_15,...,TripType_42,TripType_43,TripType_44,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_999,Counter
0,5,0,0,0,0,0,0,0,0,0,...,0.011098,0.00858,0.01203,0.004663,0.001306,0.006062,0.0,0.005689,0.213466,1
1,7,0,0,0,0,0,0,0,0,0,...,0.121983,0.030497,0.112455,0.028527,0.003871,0.009663,0.07134,0.112452,0.064187,2
2,8,0,1,0,0,0,0,0,0,0,...,1.85594,0.60015,1.452755,0.144358,0.029211,0.371744,0.538846,1.933242,0.981146,23
3,9,0,0,0,0,0,0,0,0,0,...,0.057619,0.024438,0.055311,0.035097,0.013697,0.139349,0.163891,0.054436,0.057399,3


In [87]:
train_grouped[train_grouped.columns[72:110]] = train_grouped[train_grouped.columns[72:110]].apply(lambda x: x / train_grouped.Counter)
test_grouped[test_grouped.columns[72:110]] = test_grouped[test_grouped.columns[72:110]].apply(lambda x: x / test_grouped.Counter)

In [101]:
train_grouped['Sat'] = train_grouped['Weekday'].apply(lambda x: int(x == 2))
train_grouped['Sun'] = train_grouped['Weekday'].apply(lambda x: int(x == 3))
train_grouped['Fri'] = train_grouped['Weekday'].apply(lambda x: int(x == 0))
train_grouped['WE'] = train_grouped['Weekday'].apply(lambda x: int(x == 2 or x == 3))

test_grouped['Sat'] = test_grouped['Weekday'].apply(lambda x: int(x == 2))
test_grouped['Sun'] = test_grouped['Weekday'].apply(lambda x: int(x == 3))
test_grouped['Fri'] = test_grouped['Weekday'].apply(lambda x: int(x == 0))
test_grouped['WE'] = test_grouped['Weekday'].apply(lambda x: int(x == 2 or x == 3))

In [108]:
train_answers = pd.read_csv('submit_train.csv')

for i in range(38):
    train_grouped['Answer_' + str(i)] = train_answers.ix[:, i+1:i+2]

## 3. Cross Validation 

In [117]:
columns = list(train_grouped.columns)
columns.remove('VisitNumber')
columns.remove('Counter')

In [123]:
xtrain = train_grouped[columns]
ytrain = train.groupby(['VisitNumber', 'TripType'], as_index=False).sum().TripType
# xtest = test_grouped[test_grouped.columns[1:-1]]

In [4]:
xtrain = pd.read_csv('data/xtrain.csv')
ytrain = pd.read_csv('data/ytrain.csv', names=['TripType']).TripType
xtest = pd.read_csv('data/xtest.csv')

In [124]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    classes = np.sort(train.TripType.unique())
    Y_true = y_true.apply(lambda x: np.where(classes==x)[0][0]).values
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), Y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota

#### 3.1b. Try Random Forest

In [125]:
%%time
from sklearn.cross_validation import train_test_split

xtr, xcv, ytr, ycv = train_test_split(xtrain, ytrain, test_size = 0.052,  random_state = 42)

Wall time: 181 ms


In [132]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
#     max_depth=10,
#     max_features=17,
    n_jobs=3,
    oob_score=True
)
rf.fit(xtr, ytr)

Wall time: 24.8 s


In [133]:
predictionRF_train = rf.predict_proba(xcv)
print multiclass_log_loss(ycv, predictionRF_train)

1.23035867596


In [25]:
predictionRF_test = rf.predict_proba(xtest)

#### 2. Try XGBoost multiclass (gbtree)

In [128]:
# Set params

params = {"objective": "multi:softprob",
          "booster": "gbtree",
          "eta": 0.02,
          "max_depth": 12,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "num_class": 38,
          "eval_metric": 'mlogloss'
          }
num_trees = 1000
stop = 20

In [129]:
labels = np.sort(ytrain.unique())

In [130]:
ytr_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytr])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

In [131]:
import xgboost as xgb

dtrain = xgb.DMatrix(xtr.values, label=ytr_labeled)
dvalid = xgb.DMatrix(xcv.values, label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:3.448480	train-mlogloss:3.438576
[1]	eval-mlogloss:3.309149	train-mlogloss:3.287743
[2]	eval-mlogloss:3.179650	train-mlogloss:3.147759
[3]	eval-mlogloss:3.072754	train-mlogloss:3.031006
[4]	eval-mlogloss:2.982092	train-mlogloss:2.931877
[5]	eval-mlogloss:2.892929	train-mlogloss:2.834865
[6]	eval-mlogloss:2.813817	train-mlogloss:2.747912
[7]	eval-mlogloss:2.742533	train-mlogloss:2.669620
[8]	eval-mlogloss:2.680436	train-mlogloss:2.599904
[9]	eval-mlogloss:2.622680	train-mlogloss:2.534653
[10]	eval-mlogloss:2.566970	train-mlogloss:2.472409
[11]	eval-mlogloss:2.514904	train-mlogloss:2.413132
[12]	eval-mlogloss:2.465389	train-mlogloss:2.357791
[13]	eval-mlogloss:2.419327	train-mlogloss:2.305848
[14]	eval-mlogloss:2.375197	train-mlogloss:2.255863
[15]	eval-mlogloss:2.332348	train-mlogloss:2.207612
[16]	eval-mlogloss:2.293153	train-mlogloss:2.162899
[17]	eval-mlogloss:2.255188	train-mlogloss:2.119705
[18]	eval-mlog

KeyboardInterrupt: 

In [190]:
dcv = xgb.DMatrix(xcv.values)
XGB1 = gbm.predict(dcv)
indices = XGB1 < 0
XGB1[indices] = 0

In [191]:
multiclass_log_loss(ycv, XGB1)

0.63641752444569921

**The best score is:** 0.45266287447996978

## 5. Predict classes probabilities

#### Prepare submit with LR 

In [216]:
rf.fit(train_stacked_grouped[cv_columns_tr], train_stacked_grouped.TripType)
predictionRF = rf.predict_proba(test_stacked_grouped[cv_columns_te])

In [217]:
submit_XGB = predictionRF
#test_stacked_grouped[:3]

#### With XGBoost

In [50]:
dtest = xgb.DMatrix(test_stacked_grouped[cv_columns_te].as_matrix())
predictionXGB = gbm.predict(dtest)
indices = predictionXGB < 0
predictionXGB[indices] = 0

In [51]:
submit_XGB = predictionXGB

In [52]:
submit_XGB

array([[  1.64106523e-03,   1.30538503e-03,   1.37398229e-03, ...,
          1.74849585e-03,   1.49152183e-03,   1.07993763e-02],
       [  1.00550351e-04,   9.42308543e-05,   9.79431061e-05, ...,
          1.39050113e-04,   1.05309147e-04,   1.89659011e-04],
       [  4.80866984e-05,   4.82663418e-05,   5.00364113e-05, ...,
          4.78701841e-05,   4.76965142e-05,   9.98090565e-01],
       ..., 
       [  1.03858393e-03,   1.03626784e-03,   1.06251438e-03, ...,
          1.03483291e-03,   1.03107875e-03,   1.03711940e-01],
       [  4.93862433e-04,   5.00966038e-04,   5.21119975e-04, ...,
          1.05963531e-03,   1.96996005e-03,   6.51085167e-04],
       [  5.03390438e-05,   5.16264081e-05,   5.40690926e-05, ...,
          8.16527972e-05,   7.47916711e-05,   8.40847642e-05]], dtype=float32)

## 5. Submit probas 

In [53]:
cl_names = ['VisitNumber']
for cls in rf.classes_:
    cl_names.append('TripType_' + str(int(cls)))

In [54]:
submit = pd.DataFrame(columns=cl_names)
submit['VisitNumber'] = test_stacked_grouped['VisitNumber']
submit[cl_names[1:]] = submit_XGB
submit[:5]

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.001641,0.001305,0.001374,0.001411,0.003621,0.003603,0.002,0.001282,0.001255,...,0.00138,0.00162,0.816814,0.002736,0.001812,0.001567,0.00199,0.001748,0.001492,0.010799
1,2,0.000101,9.4e-05,9.8e-05,0.000128,0.000231,0.000301,0.00019,0.000101,8.7e-05,...,0.000147,0.000124,0.0001,0.000148,0.000106,9.3e-05,0.000101,0.000139,0.000105,0.00019
2,3,4.8e-05,4.8e-05,5e-05,4.9e-05,5.6e-05,8.9e-05,6.1e-05,4.9e-05,4.8e-05,...,8.6e-05,4.9e-05,4.8e-05,4.8e-05,4.9e-05,4.8e-05,4.8e-05,4.8e-05,4.8e-05,0.998091
3,4,0.000421,0.000423,0.000481,0.000431,0.000496,0.00051,0.83641,0.000429,0.000522,...,0.00043,0.000427,0.000436,0.000429,0.00043,0.000424,0.000455,0.000423,0.000458,0.075559
4,6,4.8e-05,4.8e-05,5e-05,4.9e-05,5.5e-05,8.9e-05,6.1e-05,4.9e-05,4.8e-05,...,4.9e-05,4.9e-05,4.8e-05,4.8e-05,4.9e-05,4.8e-05,4.8e-05,4.8e-05,5.6e-05,0.998081


In [55]:
submit.to_csv('submit.csv', index=False)