In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import pylab as pl
import sklearn.cross_validation, sklearn.metrics

%matplotlib inline
pd.set_option('display.mpl_style', 'default')
pl.rcParams['figure.figsize'] = (20, 10)

train = pd.read_csv('train.csv', low_memory=False)
test = pd.read_csv('test.csv', low_memory=False)
types = np.unique(train.TripType.values)
types_dictionary = {t: i for i, t in  enumerate(types)}

In [2]:
def get_prior_probabilities(data):
    """Return the empirical frequency of every known trip type in ``data``.

    Parameters
    ----------
    data : object exposing a ``TripType`` column (e.g. a pandas DataFrame)
        Rows to count.

    Returns
    -------
    numpy.ndarray
        Float array aligned with the module-level ``types`` array: entry ``k``
        is the number of rows whose ``TripType`` equals ``types[k]`` divided by
        the total number of rows matched across all entries of ``types``.
    """
    row_counts = np.array(
        [(data.TripType == trip_type).sum() for trip_type in types],
        dtype=float,
    )
    return row_counts / row_counts.sum()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000
1,30,7,Friday,60538815980,1,SHOES,8931
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017
5,26,8,Friday,2006618783,2,PAINT AND ACCESSORIES,1017
6,26,8,Friday,2006613743,1,PAINT AND ACCESSORIES,1017
7,26,8,Friday,7004802737,1,PAINT AND ACCESSORIES,2802
8,26,8,Friday,2238495318,1,PAINT AND ACCESSORIES,4501
9,26,8,Friday,2238400200,-1,PAINT AND ACCESSORIES,3565


In [33]:
def normalize_proba(proba, eps=10**-15):
    """Clamp a probability into ``[eps, 1 - eps]`` so its logarithm stays finite.

    Parameters
    ----------
    proba : float or array-like
        Probability (or probabilities) to clamp; arrays are clamped elementwise.
    eps : float, optional
        Distance kept from both 0 and 1.

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``proba`` with every value limited to the closed interval
        ``[eps, 1 - eps]``.
    """
    # np.clip is the elementwise clamp. np.max/np.min are reductions whose
    # second positional argument is an axis, so they cannot take a bound there.
    return np.clip(proba, eps, 1 - eps)

def validate_logloss(visit_type_array, prediction_matrix):
    """Compute the multiclass log loss of ``prediction_matrix``.

    Parameters
    ----------
    visit_type_array : numpy.ndarray
        2-D array whose column 0 holds the visit identifier and column 1 the
        true trip type of each row.
    prediction_matrix : numpy.ndarray
        2-D array indexed ``[row, class]``; row ``i`` corresponds to row ``i``
        of ``visit_type_array`` and the class index is looked up through the
        module-level ``types_dictionary``.

    Returns
    -------
    float
        Negated sum of the log of the (clamped) probability assigned to the
        true class of every row, divided by the number of distinct visit
        identifiers in column 0.
    """
    distinct_visits = np.unique(visit_type_array[:, 0]).shape[0]

    log_likelihood = 0.
    for row_idx, record in enumerate(visit_type_array):
        true_class = types_dictionary[record[1]]
        log_likelihood += np.log(normalize_proba(prediction_matrix[row_idx, true_class]))

    return -log_likelihood / distinct_visits

In [36]:
# Toggle: True runs K-fold cross-validation of the prior-only baseline on the
# training data; False fits the priors on all of `train` and writes a submission.
validation = False
if validation:
    log_loss_cv = 0.
    n_folds = 5
    # Plain shuffled K-fold over *rows*: it is neither stratified by TripType
    # nor grouped by VisitNumber, so rows of one visit can land in both the
    # training and the held-out part of a fold.
    folds = sklearn.cross_validation.KFold(n=len(train), n_folds=n_folds, shuffle=True)
    for i, (train_indices, test_indices) in enumerate(folds):
        data_train = train.iloc[train_indices]
        data_test = train.iloc[test_indices]

        prior_probabilities = get_prior_probabilities(data_train)

        # One (VisitNumber, TripType) row per visit of the HELD-OUT fold. The
        # predictions must be scored against these labels, not the training ones.
        visit_types = (data_test[['VisitNumber', 'TripType']]
                       .groupby('VisitNumber').TripType.median()
                       .reset_index().values)
        # Every visit receives the same prediction: the training-fold priors.
        prediction_matrix = np.tile(prior_probabilities, (len(visit_types), 1))

        log_loss = validate_logloss(visit_types, prediction_matrix)
        print('log_loss on', i+1, 'fold =', log_loss)
        log_loss_cv += log_loss
    # Release the per-fold intermediates before reporting the average.
    del data_train, data_test, visit_types, prediction_matrix, prior_probabilities, log_loss
    log_loss_cv /= n_folds

    print('LOG LOSS CV =', log_loss_cv)

else:
    prior_probabilities = get_prior_probabilities(train)
    # Distinct visits of the test set, computed once and reused for both the
    # matrix height and the VisitNumber column so the two always agree.
    visit_numbers = list(test.groupby('VisitNumber').groups.keys())
    prediction_matrix = np.tile(prior_probabilities, (len(visit_numbers), 1))
    prediction_df = pd.DataFrame(prediction_matrix, columns=['TripType_' + str(i) for i in types])
    # VisitNumber goes first, followed by one probability column per trip type.
    prediction_df.insert(0, 'VisitNumber', visit_numbers)
    del prediction_matrix, prior_probabilities
    prediction_df.to_csv('prediction.csv', index=False)
    