In [246]:
import numpy as np
import pandas as pd

## 1. Get the Data 

In [247]:
# Load the raw training and test tables from the local data/ folder.
# low_memory=False asks pandas to parse each file in one pass instead of in chunks.
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [248]:
# Peek at the first five rows of the training table
train.head(5)

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [249]:
# Column dtypes as inferred by read_csv. Upc and FinelineNumber come back as
# float64 (see the output below) -- presumably because they contain missing
# values; TODO confirm before relying on them as identifiers.
train.dtypes

TripType                   int64
VisitNumber                int64
Weekday                   object
Upc                      float64
ScanCount                  int64
DepartmentDescription     object
FinelineNumber           float64
dtype: object

## 2. Feature Engineering

In [250]:
# Encode the string-valued categorical columns as integer codes.
# For each column the encoder is fitted on the training values and the same
# fitted mapping is then applied to the test column, so both frames share codes.
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

categorical_columns = ('Weekday', 'DepartmentDescription')
for column in categorical_columns:
    train[column] = lbl_enc.fit_transform(train[column])
    test[column] = lbl_enc.transform(test[column])

# Quick look at the encoded result
train.head(3)

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,0,68113152929,-1,21,1000
1,30,7,0,60538815980,1,63,8931
2,30,7,0,7410811099,1,51,4504


In [251]:
# Collect the distinct (label-encoded) department codes found in train, in ascending order
unique_departments = train['DepartmentDescription'].unique()
departments = np.sort(unique_departments)

In [252]:
def build_features(data, department_ids=None):
    """Add one 0/1 indicator column per department to ``data``, in place.

    For every department code ``d`` a column named ``'Department_<d>'`` is
    created (or overwritten). It holds 1 on the rows whose
    ``DepartmentDescription`` equals ``d`` and 0 everywhere else. Rows whose
    code is not among ``department_ids`` get 0 in every indicator column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``DepartmentDescription`` column. It is modified in place.
    department_ids : iterable, optional
        Department codes to build indicators for. When omitted, the
        notebook-level ``departments`` array is used.

    Returns
    -------
    None
    """
    if department_ids is None:
        department_ids = departments

    # One vectorised comparison per department instead of a Python-level loop
    # over every row. Besides being much faster, this is correct when the
    # index contains repeated labels: a label-based ``.loc[index, col] = 1``
    # would mark *all* rows sharing that label.
    for dep in department_ids:
        is_dep = data['DepartmentDescription'] == dep
        data['Department_' + str(dep)] = is_dep.astype('int64')

In [None]:
# Add the department indicator columns to both tables (each frame is modified in place)
for frame in (train, test):
    build_features(frame)

In [None]:
# Raw columns that are not used as model inputs.
dropped_cols = ['Weekday', 'DepartmentDescription', 'Upc', 'FinelineNumber']

# Training columns: everything that is left, including the TripType target,
# which is needed as a group key when the rows are aggregated per visit.
cols_tr = [c for c in train.columns if c not in dropped_cols]

# Test columns: the same list without the target. This is built as a new list
# on purpose: list.remove() works in place and returns None, so assigning its
# result would leave cols_te empty-handed and strip TripType from cols_tr.
cols_te = [c for c in cols_tr if c != 'TripType']

In [None]:
# Per-visit feature totals: sum ScanCount and the department indicators over
# all rows that share a (VisitNumber, TripType) pair.
departments_count_tr = train[cols_tr].groupby(['VisitNumber', 'TripType']).sum().reset_index()

# Per-visit label. TripType is a class label, so adding it up over a visit's
# rows would produce a meaningless number; take the visit's first value instead.
# Assumes every row of a visit carries the same TripType -- TODO confirm.
triptypes_tr = train.groupby('VisitNumber')['TripType'].first().reset_index()

In [None]:
# departments_count_tr already carries VisitNumber and TripType as ordinary
# columns (they were the group keys before reset_index), so it is the complete
# per-visit training table. Concatenating triptypes_tr next to it would
# duplicate both columns and turn featured_train['TripType'] into a
# two-column frame instead of a single label vector.
featured_train = departments_count_tr.copy()

# Same per-visit aggregation for the test rows (no target column there).
featured_test = test[cols_te].groupby(['VisitNumber']).sum().reset_index()

In [216]:
# `cols` is not defined by any earlier cell, so build it here: the model input
# columns are VisitNumber, ScanCount and the Department_* indicators, i.e.
# everything except the target and the raw categorical / identifier columns.
cols = [c for c in train.columns
        if c not in ('TripType', 'Weekday', 'DepartmentDescription', 'Upc', 'FinelineNumber')]

# Count missing values per input column
train[cols].isnull().sum(axis=0)

VisitNumber      0
ScanCount        0
Department_0     0
Department_1     0
Department_2     0
Department_3     0
Department_4     0
Department_5     0
Department_6     0
Department_7     0
Department_8     0
Department_9     0
Department_10    0
Department_11    0
Department_12    0
Department_13    0
Department_14    0
Department_15    0
Department_16    0
Department_17    0
Department_18    0
Department_19    0
Department_20    0
Department_21    0
Department_22    0
Department_23    0
Department_24    0
Department_25    0
Department_26    0
Department_27    0
                ..
Department_39    0
Department_40    0
Department_41    0
Department_42    0
Department_43    0
Department_44    0
Department_45    0
Department_46    0
Department_47    0
Department_48    0
Department_49    0
Department_50    0
Department_51    0
Department_52    0
Department_53    0
Department_54    0
Department_55    0
Department_56    0
Department_57    0
Department_58    0
Department_59    0
Department_6

## 3. Cross Validation 

In [None]:
def multiclass_log_loss(clf, y_true, y_pred, eps=1e-15):
    """Multi-class logarithmic loss of a matrix of predicted probabilities.

    Parameters
    ----------
    clf : fitted classifier
        Only ``clf.classes_`` is read: it gives the class label that belongs
        to each column of ``y_pred``.
    y_true : array-like of shape (n_samples,)
        True class labels, expressed in the same values as ``clf.classes_``.
    y_pred : array-like of shape (n_samples, n_classes)
        Predicted probabilities, one column per entry of ``clf.classes_``.
        The input is not modified.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` so the logarithm
        stays finite.

    Returns
    -------
    float
        Mean negative log-probability assigned to the true class.

    Raises
    ------
    ValueError
        If ``y_pred`` has no rows.
    KeyError
        If ``y_true`` contains a label that is not in ``clf.classes_``.
    """
    # np.clip returns a new array, so the caller's y_pred stays untouched.
    probabilities = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    rows = probabilities.shape[0]
    if rows == 0:
        raise ValueError("y_pred must contain at least one row")

    # Map every true label to its column, using the classifier that was passed
    # in (not a notebook-level model) and the labels that were passed in.
    column_of = {label: col for col, label in enumerate(np.asarray(clf.classes_).tolist())}
    true_columns = np.array([column_of[label] for label in np.asarray(y_true).tolist()])

    # Clipping can push the row sums away from 1; renormalise.
    probabilities /= probabilities.sum(axis=1)[:, np.newaxis]

    # Only the probability given to the true class contributes to the loss.
    true_class_log_probs = np.log(probabilities[np.arange(rows), true_columns])
    return -1.0 / rows * np.sum(true_class_log_probs)

In [None]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
# Fall back to it only when running on a very old installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 5% of the visits for validation; the split is seeded for reproducibility.
xtrain, xcv, ytrain, ycv = train_test_split(
    featured_train[cols_te],
    featured_train['TripType'],
    test_size=0.05,
    random_state=42,
)

#### 1. Try Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Small random forest as a first baseline. The forest is seeded (like the
# train/validation split) so the validation log loss is the same on every run.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(xtrain, ytrain)

# Class probabilities for the held-out rows, one column per entry of rf.classes_
predictionRF = rf.predict_proba(xcv)

In [None]:
# Validation log loss of the random forest
multiclass_log_loss(clf=rf, y_true=ycv, y_pred=predictionRF)

#### 2. Try Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression

# L2-penalised multinomial logistic regression, optimised with L-BFGS
lr_params = {
    'penalty': 'l2',
    'solver': 'lbfgs',
    'multi_class': 'multinomial',
}
lr = LogisticRegression(**lr_params).fit(xtrain, ytrain)

# Class probabilities for the held-out rows
predictionLR = lr.predict_proba(xcv)

In [None]:
# Validation log loss of the logistic regression
multiclass_log_loss(clf=lr, y_true=ycv, y_pred=predictionLR)

## 4. Predict class probabilities

In [51]:
from sklearn.ensemble import RandomForestClassifier

# `cols` is not defined by any earlier cell, so build it here: every column
# except the target and the raw categorical / identifier columns.
cols = [c for c in train.columns
        if c not in ('TripType', 'Weekday', 'DepartmentDescription', 'Upc', 'FinelineNumber')]

# NOTE(review): this fits on the row-level `train` frame, while the
# cross-validation section works on the per-visit `featured_train` table --
# confirm which granularity the final model is meant to use.
# The forest is seeded so the predicted probabilities are reproducible.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(train[cols], train.TripType)
predictionRF = rf.predict_proba(test[cols])

In [52]:
# Peek at the first five rows of the test table
test.head(5)

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,0,72503389714,1,63,3002
1,1,0,1707710732,1,17,1526
2,1,0,89470001026,1,17,1431
3,1,0,88491211470,1,25,3555
4,2,0,2840015224,1,18,4408


In [53]:
# First five rows of the predicted probability matrix
# (one row per test row, one column per class in rf.classes_)
predictionRF[:5]

array([[ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.2,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0.5,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.1,
         0.1,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.3,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0.5,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.3,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0.5,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0.6,  0. ,  0. ,  0.3,  0. ,  

## 5. Submit probabilities

In [None]:
# Assemble the submission table: one row per test row.
submit = pd.DataFrame()

# range() works on both Python 2 and Python 3 (xrange exists only on Python 2).
submit['Id'] = range(len(test))

# NOTE(review): this cell used to read `submitLR`, which no cell in the
# notebook defines. predictionRF is the only test-set prediction computed
# above, so it is used here -- confirm that this is the intended model.
# NOTE(review): column 1 is the probability of the second class in
# rf.classes_ only; confirm that a single column is what should be submitted.
submit['Answer'] = predictionRF[:, 1]
submit[:5]