In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import pylab as pl
import sklearn.cross_validation, sklearn.metrics

%matplotlib inline
pd.set_option('display.mpl_style', 'default')
pl.rcParams['figure.figsize'] = (20, 10)

train = pd.read_csv('train.csv', low_memory=False)
test = pd.read_csv('test.csv', low_memory=False)
types = np.unique(train.TripType.values)
types_dictionary = {t: i for i, t in  enumerate(types)}

In [2]:
def get_prior_probabilities(data):
    prior_probabilities = np.zeros(len(types))
    prior_probabilities += np.array([len(data[data.TripType == trip_type]) for trip_type in types])
    prior_probabilities /= prior_probabilities.sum()
    return prior_probabilities

In [20]:
validation = True
if validation:
    y = train.groupby('VisitNumber').TripType.median().values
    
    for train_indices, test_indices in sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=1, test_size=0.1):
        data_train = train.iloc[train_indices]
        data_test = train.iloc[test_indices]
    
        prior_probabilities = get_prior_probabilities(data_train)
        y_true = data_test.groupby('VisitNumber').TripType.median().values
        
        prediction_matrix = np.asarray([prior_probabilities]*len(y_true))
        prediction_matrix = np.delete(prediction_matrix, 7, axis=1)
        
        log_loss = sklearn.metrics.log_loss(y_true, prediction_matrix) 
        
        print('log_loss =', log_loss)
        
    del data_train, data_test, prediction_matrix, prior_probabilities, log_loss
    
elif not validation:
    prior_probabilities = get_prior_probabilities(train)
    prediction_matrix = np.asarray([prior_probabilities]*len(test.groupby('VisitNumber').groups))
    prediction_df = pd.DataFrame(prediction_matrix, columns=['TripType_' + str(i) for i in types])
    prediction_df['VisitNumber'] = list(test.groupby('VisitNumber').groups.keys())
    prediction_df = prediction_df[['VisitNumber'] + [x for x in prediction_df.columns if x != 'VisitNumber']]
    del prediction_matrix, prior_probabilities
    prediction_df.to_csv('prediction.csv', index=False)
    

log_loss = 3.07533931456
