In [1]:
import numpy as np
import pandas as pd

## 1. Get the Data 

In [2]:
train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)

In [3]:
train[:5]

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017


In [4]:
train.dtypes

TripType                   int64
VisitNumber                int64
Weekday                   object
Upc                      float64
ScanCount                  int64
DepartmentDescription     object
FinelineNumber           float64
dtype: object

## 2. Feature Engineering

In [5]:
# Replace labels with floats
from sklearn.preprocessing import LabelEncoder
lbl_enc = LabelEncoder()

for c in ['Weekday', 'DepartmentDescription']:
    train[c] = lbl_enc.fit_transform(train[c])
    test[c] = lbl_enc.transform(test[c])
    
train[:3]

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
  return aux[:-1][aux[1:] == aux[:-1]]


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,0,68113152929,-1,21,1000
1,30,7,0,60538815980,1,63,8931
2,30,7,0,7410811099,1,51,4504


In [6]:
# Show unique department descriptions
departments = np.sort(train.DepartmentDescription.unique())

In [7]:
def build_features(data):
    for dep in departments:
        data['Department_' + str(dep)] = 0
        
    for index, row in data.iterrows():
        data.loc[index, 'Department_' + str(int(row.DepartmentDescription))] = 1

In [8]:
build_features(train)
build_features(test)

In [9]:
cols_tr = list(train.columns)
[cols_tr.remove(c) for c in ['Weekday', 'DepartmentDescription', 'Upc', 'FinelineNumber']]
cols_te = list(train.columns)
[cols_te.remove(c) for c in ['Weekday', 'TripType', 'DepartmentDescription', 'Upc', 'FinelineNumber']]

[None, None, None, None, None]

In [10]:
featured_train = train[cols_tr].groupby(['VisitNumber', 'TripType']).sum().reset_index()
featured_test = test[cols_te].groupby(['VisitNumber']).sum().reset_index()

In [19]:
cols = cols_te
cols.remove('VisitNumber')

In [20]:
train[cols].isnull().sum(axis=0)

ScanCount        0
Department_0     0
Department_1     0
Department_2     0
Department_3     0
Department_4     0
Department_5     0
Department_6     0
Department_7     0
Department_8     0
Department_9     0
Department_10    0
Department_11    0
Department_12    0
Department_13    0
Department_14    0
Department_15    0
Department_16    0
Department_17    0
Department_18    0
Department_19    0
Department_20    0
Department_21    0
Department_22    0
Department_23    0
Department_24    0
Department_25    0
Department_26    0
Department_27    0
Department_28    0
                ..
Department_39    0
Department_40    0
Department_41    0
Department_42    0
Department_43    0
Department_44    0
Department_45    0
Department_46    0
Department_47    0
Department_48    0
Department_49    0
Department_50    0
Department_51    0
Department_52    0
Department_53    0
Department_54    0
Department_55    0
Department_56    0
Department_57    0
Department_58    0
Department_59    0
Department_6

## 3. Cross Validation 

In [12]:
def multiclass_log_loss(clf, y_true, y_pred, eps=1e-15):
    Y_true = ycv.apply(lambda x: np.where(rf.classes_==x)[0][0]).values
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), Y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota

In [21]:
from sklearn.cross_validation import train_test_split

xtrain, xcv, ytrain, ycv = train_test_split(featured_train[cols], featured_train['TripType'], test_size = 0.052,  random_state = 42)

#### 1. Try Random Forest Classifier

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    max_features=16,
    n_jobs=2,
    oob_score=True
)
rf.fit(xtrain, ytrain)
predictionRF = rf.predict_proba(xcv)

In [23]:
multiclass_log_loss(rf, ycv, predictionRF)

1.2526831494360087

The best score is: 1.2526831494360087

#### 2. Try Logistic Regression 

In [285]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    penalty='l2',
    solver='lbfgs',
    multi_class='multinomial'
)

lr.fit(xtrain, ytrain)
predictionLR = lr.predict_proba(xcv)

In [286]:
multiclass_log_loss(lr, ycv, predictionLR)

3.4526601797370922

## 4. Predict classes probabilities

In [1]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=1250,
    max_depth=20,
    max_features=16,
    # n_jobs=2,
    oob_score=True
)
rf.fit(featured_train[cols], featured_train['TripType'])
predictionRF = rf.predict_proba(featured_test[cols])

NameError: name 'featured_train' is not defined

In [53]:
predictionRF[:3]

array([[ 0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.2,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0.5,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.1,
         0.1,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.3,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0.5,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.3,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0.1,  0.5,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ],
       [ 0. ,  0. ,  0. ,  0. ,  0.1,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0.6,  0. ,  0. ,  0.3,  0. ,  

## 5. Submit probas 

In [None]:
cl_names =['VisitNumber']
for cls in rf.classes_:
    cl_names.append(cls)

In [None]:
submit = pd.DataFrame(names=cl_names)
submit['Id'] = xrange(len(test))
submit['Answer'] = submitLR[:, 1]
submit[:5]