In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import pylab as pl
import sklearn.cross_validation, sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer, FeatureHasher
import scipy.stats

%matplotlib inline
# Plot styling: pandas' matplotlib style option plus a large default figure size.
# NOTE(review): 'display.mpl_style' looks like an option of older pandas releases —
# confirm it exists in the environment this notebook is run in.
pd.set_option('display.mpl_style', 'default')
pl.rcParams['figure.figsize'] = (20, 10)

# Input tables, read from the current working directory.
train = pd.read_csv('train.csv', low_memory=False)
test = pd.read_csv('test.csv', low_memory=False)

# Sorted distinct TripType labels of the training table, and a lookup from each
# label to its position in that sorted array.
types = np.unique(train['TripType'].values)
types_dictionary = dict(zip(types, range(len(types))))

In [None]:
def at(l):
    """Return the first element of ``l`` by position.

    Used as a groupby aggregator: it keeps only the first row's value of a
    group and ignores the rest. ``l`` must support ``.iloc`` (e.g. a Series).
    """
    first_position = 0
    return l.iloc[first_position]
def mklist(l):
    """Collect every element of the iterable ``l`` into a new list, in order."""
    return [element for element in l]

# Collapse the line-item rows into one row per VisitNumber.
# Columns aggregated with `at` keep only the value from the first row of each
# visit; columns aggregated with `mklist` keep all of the visit's values as a list.
# NOTE(review): ScanCount and DepartmentDescription therefore reflect only the
# first line of a visit — confirm that this is intended.
train = (
    train
    .groupby('VisitNumber')
    .agg({
        'TripType': at,
        'Weekday': at,
        'ScanCount': at,
        'DepartmentDescription': at,
        'FinelineNumber': mklist,
        'Upc': mklist,
    })
    .reset_index()
)

# Same aggregation for the test table, which has no TripType column to aggregate.
test = (
    test
    .groupby('VisitNumber')
    .agg({
        'Weekday': at,
        'ScanCount': at,
        'DepartmentDescription': at,
        'FinelineNumber': mklist,
        'Upc': mklist,
    })
    .reset_index()
)


In [None]:
def get_prior_probabilities(data, trip_types=None):
    """Return the empirical frequency of each trip type in ``data``.

    Parameters
    ----------
    data : DataFrame
        Must expose a ``TripType`` column.
    trip_types : sequence, optional
        Labels to compute frequencies for, in output order. Defaults to the
        module-level ``types`` array.

    Returns
    -------
    numpy.ndarray
        One float per entry of ``trip_types``; the entries sum to 1.

    Raises
    ------
    ValueError
        If no row of ``data`` matches any of the requested trip types, since
        the frequencies would otherwise be NaN (0 / 0).
    """
    if trip_types is None:
        trip_types = types

    # Read the label column once instead of once per class.
    labels = data.TripType
    counts = np.array([(labels == trip_type).sum() for trip_type in trip_types], dtype=float)

    total = counts.sum()
    if total == 0:
        raise ValueError('cannot compute prior probabilities: no rows match the given trip types')
    return counts / total

In [None]:
def construct_dummy_features(data, dummy_columns):
    """One-hot encode ``dummy_columns`` of ``data`` and return the new frame.

    The listed columns are replaced by sparse indicator columns; missing
    values do not get an indicator column of their own. All other columns
    are passed through.
    """
    encoding_options = dict(columns=dummy_columns, dummy_na=False, sparse=True)
    return pd.get_dummies(data, **encoding_options)

In [None]:
# Encode train and test together so both end up with the same dummy columns,
# then split them apart again by position.
n_train = len(train)  # captured before `train` is rebound below

# pd.concat is used instead of DataFrame.append, which newer pandas no longer provides.
# The original row labels are kept (no ignore_index) exactly as append did.
data = pd.concat([train, test])
data = data[['VisitNumber', 'TripType', 'Weekday', 'ScanCount', 'DepartmentDescription']]
data = construct_dummy_features(data, ['Weekday', 'DepartmentDescription'])

train = data.iloc[:n_train]
# The test rows have no label; drop() returns a new frame rather than deleting
# a column from a slice of `data`.
test = data.iloc[n_train:].drop('TripType', axis=1)
del data, n_train

In [None]:
def fit_predict(train_array, train_labels, test_array, prior_probabilities=None, prior_weight=0.1):
    """Fit a random forest on the training data and return class probabilities for the test data.

    Parameters
    ----------
    train_array : array-like
        Feature matrix used for fitting.
    train_labels : array-like
        Class label for each row of ``train_array``.
    test_array : array-like
        Feature matrix to predict on.
    prior_probabilities : array-like, optional
        One prior probability per predicted class. When given, every predicted
        row is blended with it as ``(prediction + prior * prior_weight) / (1 + prior_weight)``.
    prior_weight : float, optional
        Weight of the prior in that blend (default 0.1).

    Returns
    -------
    numpy.ndarray
        The output of ``predict_proba`` for ``test_array`` (one row per test
        row, one column per class the classifier was fitted on), optionally
        blended with the prior.
    """
    clf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=1, max_features=0.1, n_jobs=-1)
    clf.fit(train_array, train_labels)
    prediction_matrix = clf.predict_proba(test_array)

    if prior_probabilities is not None:
        # Broadcasting adds the 1-D prior to every row, so there is no need to
        # replicate it once per prediction.
        weighted_prior = np.asarray(prior_probabilities, dtype=float) * prior_weight
        prediction_matrix = (prediction_matrix + weighted_prior) / (1. + prior_weight)
    return prediction_matrix

## MAIN: run hold-out validation or build the submission file

In [None]:
# True: score the model on a stratified 10% hold-out of `train`.
# False: fit on all of `train` and write predictions for `test` to a CSV file.
validation = True

# Every column except the label and the visit identifier is a model input.
feature_columns = [col for col in train.columns if col not in ['TripType', 'VisitNumber']]

if validation:
    y = train.TripType.values

    for train_indices, test_indices in sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=1, test_size=0.1):
        data_train = train.iloc[train_indices]
        data_test = train.iloc[test_indices]

        # To blend predictions with class priors, pass
        # prior_probabilities=get_prior_probabilities(data_train) to fit_predict.
        prediction_matrix = fit_predict(data_train[feature_columns].values,
                                        data_train.TripType.values,
                                        data_test[feature_columns].values)

        y_true = data_test.TripType.values

        # predict_proba returns one column per class seen during fitting, in sorted
        # label order, while log_loss infers its label set from y_true. Drop the
        # columns of classes that do not occur in the hold-out fold so both agree,
        # instead of deleting a hard-coded column index.
        fitted_classes = np.unique(data_train.TripType.values)
        held_out_classes = np.unique(y_true)
        present_in_hold_out = (fitted_classes[:, None] == held_out_classes[None, :]).any(axis=1)
        prediction_matrix = prediction_matrix[:, present_in_hold_out]

        log_loss = sklearn.metrics.log_loss(y_true, prediction_matrix)

        print('log_loss =', log_loss)

    del y, y_true, data_train, data_test, prediction_matrix, log_loss
    del fitted_classes, held_out_classes, present_in_hold_out

else:
    # To blend predictions with class priors, pass
    # prior_probabilities=get_prior_probabilities(train) to fit_predict.
    prediction_matrix = fit_predict(train[feature_columns].values,
                                    train.TripType.values,
                                    test[feature_columns].values)

    prediction_df = pd.DataFrame(prediction_matrix, columns=['TripType_' + str(i) for i in types])
    # Assign by position (.values): the rows of prediction_matrix are in the same
    # order as the rows of `test`, regardless of how `test` happens to be indexed.
    prediction_df.insert(0, 'VisitNumber', test.VisitNumber.values)

    del prediction_matrix
    filename = 'prediction.csv'
    prediction_df.to_csv(filename, index=False)
    print(filename, 'created')
    