In [1]:
import setuptools
%load_ext Cython

In [2]:
%%cython -a
import cython
cimport cython
import numpy as np
cimport numpy as np
import pandas as pd

In [3]:
import gc
gc.enable()

## 1. Get the Data 

In [4]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [5]:
train[:5]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [6]:
train.dtypes

TripType                   int64
VisitNumber                int64
Weekday                   object
Upc                      float64
ScanCount                  int64
DepartmentDescription     object
FinelineNumber           float64
dtype: object

## 2. Feature Engineering

In [7]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in ['Weekday', 'DepartmentDescription']:
    train[c] = lbl_enc.fit_transform(train[c])
    test[c] = lbl_enc.transform(test[c])
    
train[:3]

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,0,68113152929,-1,21,1000
1,30,7,0,60538815980,1,63,8931
2,30,7,0,7410811099,1,51,4504


In [8]:
train.isnull().sum(axis=0)

TripType                    0
VisitNumber                 0
Weekday                     0
Upc                      4129
ScanCount                   0
DepartmentDescription       0
FinelineNumber           4129
dtype: int64

In [9]:
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='mean')
train = pd.DataFrame(imputer.fit_transform(train), columns=train.columns)
test = pd.DataFrame(imputer.fit_transform(test), columns=test.columns)

In [10]:
train = train.astype(int)
test = test.astype(int)

In [11]:
train.dtypes

TripType                 int32
VisitNumber              int32
Weekday                  int32
Upc                      int32
ScanCount                int32
DepartmentDescription    int32
FinelineNumber           int32
dtype: object

In [12]:
%%time
train_builded = pd.get_dummies(train, columns=['DepartmentDescription', 'FinelineNumber'], prefix=['Department', 'Fineline'], sparse=True)
test_builded = pd.get_dummies(test, columns=['DepartmentDescription', 'FinelineNumber'], prefix=['Department', 'Fineline'], sparse=True)

Wall time: 2min 28s


In [13]:
%%time
train_builded.fillna(0, inplace=True)
test_builded.fillna(0, inplace=True)

Wall time: 37.8 s


In [14]:
cols_tr = list(train_builded.columns)
cols_te = list(test_builded.columns)
cols_tr.remove('Upc')
cols_te.remove('Upc')

train_builded = train_builded[cols_tr]
test_builded = test_builded[cols_te]

In [15]:
train_builded.columns

Index([u'TripType', u'VisitNumber', u'Weekday', u'ScanCount', u'Department_0',
       u'Department_1', u'Department_2', u'Department_3', u'Department_4',
       u'Department_5',
       ...
       u'Fineline_9964', u'Fineline_9966', u'Fineline_9967', u'Fineline_9970',
       u'Fineline_9971', u'Fineline_9974', u'Fineline_9975', u'Fineline_9991',
       u'Fineline_9997', u'Fineline_9998'],
      dtype='object', length=5268)

In [16]:
for col in cols_tr:
    if col not in cols_te:
        test_builded[col] = 0
        cols_te.append(col)

In [17]:
for col in cols_te:
    if col not in cols_tr:
        train_builded[col] = 0
        cols_tr.append(col)

In [18]:
print len(cols_tr)
print len(cols_te)

5426
5426


In [19]:
%%time
train_builded = train_builded.reindex_axis(sorted(cols_tr), axis=1)
test_builded = test_builded.reindex_axis(sorted(cols_tr), axis=1)

Wall time: 908 ms


#### Make grouped data 

In [20]:
def build_part(data, isfinal):
    grouped_part = pd.DataFrame(columns=list(data.columns)[1:])
    rows = data.iloc[0][1:].copy()
    rows = 0
    j = data.iloc[0]['VisitNumber']
    index_end = len(data)
#     print data.index[0]
#     print len(data)

    for i, row in data.iterrows():
        if (row['VisitNumber'] == j):
            rows += row[1:]
        else: 
            grouped_part.loc[len(grouped_part)] = rows
            rows = row[1:]
            j = row['VisitNumber']
            if(not isfinal and i - data.index[0] > len(data) - 100):
                    index_end = i
                    break
        if(isfinal and i == data.index[len(data)-1]):
            grouped_part.loc[len(grouped_part)] = rows
    return grouped_part, index_end

def build_grouped(data):
    print 'build data with numbers (' + str(data.ix[0].VisitNumber) + ',' + str(data.ix[len(data)-1].VisitNumber) + ')'
        
    grouped = pd.DataFrame(columns=data.columns[1:])
    start = 0
    for k in range(50):
        end = len(data) if k == 49 else (k + 1) * (len(data) / 50)
#         print k, start, end
        part = data[start:end]
        
        grouped_part, start = build_part(part, k == 49)
        grouped = grouped.append(grouped_part)
    
    return grouped

##### Train 

In [21]:
cols = list(train_builded.columns)
num_cols = ['VisitNumber', 'TripType', 'Weekday', 'ScanCount']
[cols.remove(c) for c in num_cols]
train_numbers = train_builded[num_cols].groupby(['VisitNumber', 'Weekday', 'TripType'], as_index=False).sum()
grouped_cols = ['VisitNumber'] + cols

In [22]:
%%time
tr_b_1 = train_builded[grouped_cols][:99981].reset_index(drop=True)
tr_b_2 = train_builded[grouped_cols][99981:199999].reset_index(drop=True)
tr_b_3 = train_builded[grouped_cols][199999:299979].reset_index(drop=True)
tr_b_4 = train_builded[grouped_cols][299979:399991].reset_index(drop=True)
tr_b_5 = train_builded[grouped_cols][399991:499994].reset_index(drop=True)
tr_b_6 = train_builded[grouped_cols][499994:599991].reset_index(drop=True)
tr_b_7 = train_builded[grouped_cols][599991:].reset_index(drop=True)

Wall time: 1min 20s


In [24]:
%%time
tr_g_1 = build_grouped(tr_b_1).to_sparse()
tr_g_2 = build_grouped(tr_b_2).to_sparse()
tr_g_3 = build_grouped(tr_b_3).to_sparse()
tr_g_4 = build_grouped(tr_b_4).to_sparse()
tr_g_5 = build_grouped(tr_b_5).to_sparse()
tr_g_6 = build_grouped(tr_b_6).to_sparse()
tr_g_7 = build_grouped(tr_b_7).to_sparse()

build data with numbers (5,28987)
build data with numbers (28988,59429)
build data with numbers (59431,90437)
build data with numbers (90438,118089)
build data with numbers (118090,148190)
build data with numbers (148192,179080)
build data with numbers (179081,191347)
Wall time: 41min


In [25]:
%%time
result = tr_g_1.append(tr_g_2)
result = result.append(tr_g_3)
result = result.append(tr_g_4)
result = result.append(tr_g_5)
result = result.append(tr_g_6)
result = result.append(tr_g_7)

Wall time: 4min 58s


In [26]:
# pd.concat([train_numbers, grouped], axis=1, join='inner')
train_grouped = result.reset_index(drop=True)
# train_grouped['TripType'] = train_triptype

In [31]:
%%time
from sklearn.preprocessing import Binarizer

scaler = Binarizer()
train_scaled = scaler.fit_transform(train_grouped)

Wall time: 4min 30s


##### Test 

In [32]:
cols = list(test_builded.columns)
num_cols = ['VisitNumber', 'TripType', 'Weekday', 'ScanCount']
[cols.remove(c) for c in num_cols]
test_numbers = test_builded[num_cols].groupby(['VisitNumber', 'Weekday', 'TripType'], as_index=False).sum()
grouped_cols = ['VisitNumber'] + cols

In [33]:
%%time
te_b_1 = test_builded[grouped_cols][:99998].reset_index(drop=True)
te_b_2 = test_builded[grouped_cols][99998:199996].reset_index(drop=True)
te_b_3 = test_builded[grouped_cols][199996:299950].reset_index(drop=True)
te_b_4 = test_builded[grouped_cols][299950:388934].reset_index(drop=True)
te_b_5 = test_builded[grouped_cols][388934:478991].reset_index(drop=True)
te_b_6 = test_builded[grouped_cols][478991:569982].reset_index(drop=True)
te_b_7 = test_builded[grouped_cols][569982:].reset_index(drop=True)

Wall time: 2min 54s


In [34]:
%%time
te_g_1 = build_grouped(te_b_1).to_sparse()
te_g_2 = build_grouped(te_b_2).to_sparse()
te_g_3 = build_grouped(te_b_3).to_sparse()
te_g_4 = build_grouped(te_b_4).to_sparse()
te_g_5 = build_grouped(te_b_5).to_sparse()
te_g_6 = build_grouped(te_b_6).to_sparse()
te_g_7 = build_grouped(te_b_7).to_sparse()

build data with numbers (1,29352)
build data with numbers (29353,59894)
build data with numbers (59895,90831)
build data with numbers (90832,114963)
build data with numbers (114965,142337)
build data with numbers (142338,168446)
build data with numbers (168447,191348)
Wall time: 1h 41s


In [35]:
%%time
result = te_g_1.append(te_g_2)
result = result.append(te_g_3)
result = result.append(te_g_4)
result = result.append(te_g_5)
result = result.append(te_g_6)
result = result.append(te_g_7)

Wall time: 22min 48s


In [36]:
test_grouped = result.reset_index(drop=True)

In [37]:
%%time
from sklearn.preprocessing import Binarizer

scaler = Binarizer()
test_scaled = scaler.fit_transform(test_grouped)

Wall time: 6min 27s


In [55]:
xtest[:2]

Unnamed: 0,Department_0,Department_1,Department_10,Department_11,Department_12,Department_13,Department_14,Department_15,Department_16,Department_17,...,Fineline_9969,Fineline_9970,Fineline_9971,Fineline_9974,Fineline_9975,Fineline_998,Fineline_9991,Fineline_9997,Fineline_9998,Fineline_9999
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 3. Cross Validation 

In [149]:
%%time
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=300, random_state=42)
truncated_train = svd.fit_transform(sparse.csr_matrix(train_scaled))
truncated_test = svd.transform(sparse.csr_matrix(test_scaled))

Wall time: 36 s


In [150]:
xtrain = pd.DataFrame(truncated_train)
ytrain = train_numbers.TripType

In [151]:
xtest = pd.DataFrame(truncated_test)

In [38]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    classes = np.sort(train.TripType.unique())
    Y_true = y_true.apply(lambda x: np.where(classes==x)[0][0]).values
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), Y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota

In [152]:
%%time
from sklearn.cross_validation import train_test_split

xtr, xcv, ytr, ycv = train_test_split(truncated_train, train_numbers.TripType, test_size = 0.052,  random_state = 42)

Wall time: 173 ms


In [57]:
print xtrain.shape
print ytrain.shape

(95674, 5422)
(95674L,)


#### 3.1. Try Logistic Regression

In [59]:
%%time
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    penalty='l2',
#     C=0.45,
#     fit_intercept=False,
    solver='lbfgs',
    multi_class='ovr'
)

lr.fit(xtrain, ytrain)

Wall time: 32min 16s


In [67]:
%%time
predictionLR_train = lr.predict_proba(xtrain)
print multiclass_log_loss(ytrain, predictionLR_train)

0.805446228137
Wall time: 1.49 s


In [69]:
predictionLR_test = lr.predict_proba(xtest)

#### 3.2 Prepare stacked dataset (train)

In [199]:
train_stacked_builded = pd.get_dummies(train, columns=['DepartmentDescription'], prefix=['Department'])

In [200]:
train_stacked_columns = list(train_stacked_builded.columns)
[train_stacked_columns.remove(c) for c in ['Upc', 'FinelineNumber']]

[None, None]

In [201]:
train_stacked_grouped = train_stacked_builded[train_stacked_columns].groupby(['VisitNumber', 'TripType','Weekday']).sum().reset_index()

In [202]:
for i in range(len(train.TripType.unique())):
    train_stacked_columns.append('Prediction_' + str(i))

for column in train_stacked_columns[-38:]:
    train_stacked_grouped[column] = 0

In [203]:
train_stacked_grouped[train_stacked_columns[-38:]] = predictionLR_train

In [204]:
train_stacked_grouped[:5]

Unnamed: 0,VisitNumber,TripType,Weekday,ScanCount,Department_0,Department_1,Department_2,Department_3,Department_4,Department_5,...,Prediction_28,Prediction_29,Prediction_30,Prediction_31,Prediction_32,Prediction_33,Prediction_34,Prediction_35,Prediction_36,Prediction_37
0,5,999,0,-1,0,0,0,0,0,0,...,0.00087,0.000849,0.000646,0.003617,9.5e-05,0.000255,0.000539,0.000502,0.000198,0.783389
1,7,30,0,2,0,0,0,0,0,0,...,0.065265,0.001754,0.001004,0.022227,0.001296,0.024676,0.004835,0.023995,0.001948,0.326371
2,8,26,0,28,1,0,0,0,0,0,...,8.9e-05,0.005156,0.001222,0.004028,0.139068,0.000745,0.000878,0.00154,0.015092,0.001455
3,9,8,0,3,0,0,0,0,0,0,...,0.010078,0.402235,0.007193,0.037941,0.000906,0.0002,0.002348,0.002424,0.000473,0.009539
4,10,8,0,3,0,0,0,0,0,0,...,0.011797,0.004242,0.003271,0.079791,0.000499,0.000482,0.00141,0.004004,0.000261,0.005539


#### 3.2 Prepare stacked dataset (test)

In [205]:
test_stacked_builded = pd.get_dummies(test, columns=['DepartmentDescription'], prefix=['Department'])

In [206]:
test_stacked_columns = list(test_stacked_builded.columns)
[test_stacked_columns.remove(c) for c in ['Upc', 'FinelineNumber']]

[None, None]

In [207]:
test_stacked_grouped = test_stacked_builded[test_stacked_columns].groupby(['VisitNumber', 'Weekday']).sum().reset_index()

In [208]:
for i in range(len(train.TripType.unique())):
    test_stacked_columns.append('Prediction_' + str(i))

for column in test_stacked_columns[-38:]:
    test_stacked_grouped[column] = 0

In [209]:
test_stacked_grouped[test_stacked_columns[-38:]] = predictionLR_test

In [210]:
test_stacked_grouped['Department_27'] = 0

In [180]:
test_stacked_grouped.columns

Index([u'VisitNumber', u'Weekday', u'ScanCount', u'Department_0',
       u'Department_1', u'Department_2', u'Department_3', u'Department_4',
       u'Department_5', u'Department_6',
       ...
       u'Prediction_29', u'Prediction_30', u'Prediction_31', u'Prediction_32',
       u'Prediction_33', u'Prediction_34', u'Prediction_35', u'Prediction_36',
       u'Prediction_37', u'Department_27'],
      dtype='object', length=110)

In [211]:
train_stacked_grouped = train_stacked_grouped.reindex_axis(sorted(train_stacked_grouped.columns), axis=1)
test_stacked_grouped = test_stacked_grouped.reindex_axis(sorted(test_stacked_grouped.columns), axis=1)

In [212]:
test_stacked_grouped[:5]

Unnamed: 0,Department_0,Department_1,Department_10,Department_11,Department_12,Department_13,Department_14,Department_15,Department_16,Department_17,...,Prediction_37,Prediction_4,Prediction_5,Prediction_6,Prediction_7,Prediction_8,Prediction_9,ScanCount,VisitNumber,Weekday
0,0,0,0,0,0,0,0,0,0,2,...,0.067496,0.000211,0.004612,0.011434,0.000601,1.7e-05,0.000724,4,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0.808969,0.007174,0.059962,0.00129,0.000834,3.2e-05,0.015866,4,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0.178412,0.012269,0.557474,0.061175,0.000331,1.5e-05,0.001503,0,3,0
3,0,0,0,0,0,0,0,0,0,0,...,0.08063,0.017234,0.174363,0.601123,0.000457,0.000153,0.00203,1,4,0
4,0,0,0,0,0,0,0,0,0,0,...,0.121425,0.014054,0.07909,0.58836,0.000843,2.9e-05,0.000578,0,6,0


In [213]:
cv_columns_tr = list(train_stacked_grouped.columns)
cv_columns_tr.remove('TripType')
cv_columns_tr.remove('VisitNumber')

cv_columns_te = list(test_stacked_grouped.columns)
cv_columns_te.remove('VisitNumber')

In [214]:
train_stacked_grouped[cv_columns_tr][:3]

Unnamed: 0,Department_0,Department_1,Department_10,Department_11,Department_12,Department_13,Department_14,Department_15,Department_16,Department_17,...,Prediction_36,Prediction_37,Prediction_4,Prediction_5,Prediction_6,Prediction_7,Prediction_8,Prediction_9,ScanCount,Weekday
0,0,0,0,0,0,0,0,0,0,0,...,0.000198,0.783389,0.004681,0.001074692,0.08676956,0.000341,5.7e-05,0.000424,-1,0
1,0,0,0,0,0,0,0,0,0,0,...,0.001948,0.326371,0.001962,0.06245008,0.09519805,0.000565,2.8e-05,0.001172,2,0
2,1,0,0,0,0,0,0,0,0,1,...,0.015092,0.001455,0.000102,8.071655e-07,3.815989e-07,2.5e-05,1.6e-05,0.000488,28,0


In [185]:
%%time
from sklearn.cross_validation import train_test_split

xtr, xcv, ytr, ycv = train_test_split(train_stacked_grouped[cv_columns_tr], train_stacked_grouped.TripType, test_size = 0.052,  random_state = 42)

Wall time: 254 ms


#### 1. Try Random Forest Classifier

In [215]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
#     max_depth=10,
#     max_features=17,
    n_jobs=2,
    oob_score=True
)
rf.fit(xtrain, ytrain)

Wall time: 731 ms


In [160]:
predictionRF_test = rf.predict_proba(xtest)

In [156]:
multiclass_log_loss(ycv, predictionRF)

1.789514262348288

The best score is: 0.65149983569634395

#### 2. Try XGBoost multiclass (gbtree)

In [186]:
# Set params

params = {"objective": "multi:softprob",
          "booster": "gbtree",
          "eta": 0.2,
          "max_depth": 12,
          "subsample": 0.9,
          "colsample_bytree": 0.7,
          "silent": 1,
          "num_class": 38,
          "eval_metric": 'mlogloss'
          }
num_trees = 30
stop = 20

In [187]:
labels = np.sort(ytrain.unique())

In [188]:
ytr_labeled = pd.Series([np.where(labels==x)[0][0] for x in ytr])
ycv_labeled = pd.Series([np.where(labels==x)[0][0] for x in ycv])

In [189]:
import xgboost as xgb
from sklearn.cross_validation import train_test_split

dtrain = xgb.DMatrix(xtr.values, label=ytr_labeled)
dvalid = xgb.DMatrix(xcv.values, label=ycv_labeled)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm = xgb.train(params, dtrain, num_trees, evals=watchlist, early_stopping_rounds=stop, verbose_eval=True)

Will train until train error hasn't decreased in 20 rounds.
[0]	eval-mlogloss:1.700911	train-mlogloss:1.606363
[1]	eval-mlogloss:1.486748	train-mlogloss:1.339669
[2]	eval-mlogloss:1.338123	train-mlogloss:1.151152
[3]	eval-mlogloss:1.222521	train-mlogloss:1.001925
[4]	eval-mlogloss:1.130430	train-mlogloss:0.882713
[5]	eval-mlogloss:1.057677	train-mlogloss:0.788074
[6]	eval-mlogloss:0.997477	train-mlogloss:0.706803
[7]	eval-mlogloss:0.948791	train-mlogloss:0.638983
[8]	eval-mlogloss:0.905460	train-mlogloss:0.578327
[9]	eval-mlogloss:0.867126	train-mlogloss:0.527538
[10]	eval-mlogloss:0.837288	train-mlogloss:0.485476
[11]	eval-mlogloss:0.809795	train-mlogloss:0.446223
[12]	eval-mlogloss:0.786671	train-mlogloss:0.413725
[13]	eval-mlogloss:0.768427	train-mlogloss:0.384550
[14]	eval-mlogloss:0.750099	train-mlogloss:0.356669
[15]	eval-mlogloss:0.734830	train-mlogloss:0.333665
[16]	eval-mlogloss:0.720036	train-mlogloss:0.312193
[17]	eval-mlogloss:0.707503	train-mlogloss:0.293349
[18]	eval-mlog

In [190]:
dcv = xgb.DMatrix(xcv.values)
XGB1 = gbm.predict(dcv)
indices = XGB1 < 0
XGB1[indices] = 0

In [191]:
multiclass_log_loss(ycv, XGB1)

0.63641752444569921

**The best score is:** 0.45266287447996978

## 5. Predict classes probabilities

#### Prepare submit with LR 

In [216]:
rf.fit(train_stacked_grouped[cv_columns_tr], train_stacked_grouped.TripType)
predictionRF = rf.predict_proba(test_stacked_grouped[cv_columns_te])

In [217]:
submit_XGB = predictionRF
#test_stacked_grouped[:3]

#### With XGBoost

In [192]:
dtest = xgb.DMatrix(test_stacked_grouped[cv_columns_te].as_matrix())
predictionXGB = gbm.predict(dtest)
indices = predictionXGB < 0
predictionXGB[indices] = 0

In [193]:
submit_XGB = predictionXGB

In [194]:
submit_XGB

array([[  1.26707437e-03,   1.24593813e-03,   1.26244873e-03, ...,
          8.39606673e-03,   1.45061198e-03,   2.10321005e-02],
       [  1.98764680e-03,   1.95156259e-03,   2.53459415e-03, ...,
          4.81983833e-03,   2.16702512e-03,   1.71710372e-01],
       [  6.18355043e-05,   5.76344464e-05,   5.99063424e-05, ...,
          5.88029179e-05,   6.03839690e-05,   9.95433390e-01],
       ..., 
       [  1.77400128e-04,   1.74212633e-04,   1.77757800e-04, ...,
          1.77744412e-04,   1.82523479e-04,   1.81616924e-03],
       [  7.02941208e-04,   6.91214984e-04,   7.43283832e-04, ...,
          6.43221196e-03,   1.08075151e-02,   1.97649351e-03],
       [  3.31287098e-04,   3.08221439e-04,   3.20372754e-04, ...,
          1.32542208e-03,   7.11742265e-04,   2.16437201e-03]], dtype=float32)

## 5. Submit probas 

In [218]:
cl_names = ['VisitNumber']
for cls in rf.classes_:
    cl_names.append('TripType_' + str(int(cls)))

In [220]:
submit = pd.DataFrame(columns=cl_names)
submit['VisitNumber'] = test_stacked_grouped['VisitNumber']
submit[cl_names[1:]] = submit_XGB
submit[:5]

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0,0.01,0.0,0.0,0.0,0.01,0.07,0.01,0,...,0.01,0.0,0.52,0.07,0.01,0,0,0.01,0.01,0.01
1,2,0,0.03,0.01,0.02,0.05,0.12,0.07,0.0,0,...,0.01,0.04,0.03,0.04,0.0,0,0,0.0,0.0,0.4
2,3,0,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.93
3,4,0,0.0,0.0,0.0,0.0,0.0,0.99,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.01
4,6,0,0.0,0.0,0.0,0.0,0.0,0.07,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.93


In [221]:
submit.to_csv('submit.csv', index=False)