In [3]:
from os import path
import gzip
import csv
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.models import Sequential
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from time import time

import pandas as pd
import numpy as np

def preprocess_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

def dating(x):
    date, time = x.split(' ')
    y, m, d = map(int, date.split('-'))
    h = int(time.split(':')[0])
    return [y, m, d, h]


def create_submission(preds):
    with gzip.open('submission{0}.gz'.format(time()), 'wt') as outf:
        fo = csv.writer(outf, lineterminator='\n')
        fo.writerow('Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,EXTORTION,FAMILY OFFENSES,FORGERY/COUNTERFEITING,FRAUD,GAMBLING,KIDNAPPING,LARCENY/THEFT,LIQUOR LAWS,LOITERING,MISSING PERSON,NON-CRIMINAL,OTHER OFFENSES,PORNOGRAPHY/OBSCENE MAT,PROSTITUTION,RECOVERED VEHICLE,ROBBERY,RUNAWAY,SECONDARY CODES,SEX OFFENSES FORCIBLE,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS'.split(','))
        for i, pred in enumerate(preds):
            fo.writerow([i] + list(pred))

def load_data(trainpath='input/train.csv', testpath = 'input/test.csv'):
    from sklearn import preprocessing
    def load_(filepath):
        msg_path = filepath.split('.')[0]+'.msg'
        if path.isfile(msg_path):
            df_data = pd.read_msgpack(msg_path)
        else:
            df_data = pd.read_csv(filepath)
            dates = pd.DataFrame([dating(date) for date in df_data['Dates']], columns=['Year', 'Month', 'Day', 'Hour'])
            df_data.drop(['Dates'],  axis=1, inplace=True)
            df_data = pd.concat((df_data, dates), axis=1)
            df_data.replace({'DayOfWeek':{'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4, 'Friday': 5, 'Saturday': 6, 'Sunday': 7}}, inplace=True)
            
            vec = preprocessing.LabelEncoder()
            df_data['Address'] = vec.fit_transform(df_data['Address'])
            df_data['PdDistrict'] = vec.fit_transform(df_data['PdDistrict'])
            df_data.to_msgpack(msg_path)
        return df_data
    
    df_train = load_(trainpath).apply(np.random.permutation)
    X_train = df_train.drop(['Category', 'Resolution', 'Descript'], axis=1)#preprocess_data(
    Y_train = preprocessing.LabelEncoder().fit_transform(df_train['Category'])
    
    X_test = load_(testpath).drop(['Id'],  axis=1) # preprocess_data(
    return X_train, Y_train, X_test

X_train, Y_train, X_test = load_data()

input_dim = X_train.shape
output_dim = Y_train.shape
print('Input dimensions: {}, output dimensions: {}'.format(input_dim, output_dim))

Using Theano backend.


Input dimensions: (878049, 9), output dimensions: (878049L,)


In [None]:
Y_train = np_utils.to_categorical(Y_train)

In [None]:
def build_model(input_dim, output_dim, hn=32, dp=np.float32(0.5), layers=1):
    model = Sequential()
    model.add(Dense(hn, input_dim = input_dim))
    model.add(PReLU())
    model.add(Dropout(0.5))

    for i in range(layers):
        model.add(Dense(hn))
        model.add(PReLU())
        model.add(BatchNormalization(hn))
        model.add(Dropout(0.5))

    model.add(Dense(output_dim))
    model.add(Activation('softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model


EPOCHS = 1
BATCHES = 128
HN = 64
RUN_FOLDS = False
nb_folds = 4
kfolds = KFold(len(y), nb_folds)
av_ll = 0.
f = 0
if RUN_FOLDS:
    for train, valid in kfolds:
        print('---' * 20)
        print('Fold', f)
        print('---' * 20)
        f += 1
        X_train = X[train]
        X_valid = X[valid]
        Y_train = Y[train]
        Y_valid = Y[valid]
        y_valid = y[valid]

        print("Building model...")
        model = build_model(input_dim, output_dim, HN)

        print("Training model...")

        model.fit(X_train, Y_train, nb_epoch=EPOCHS, batch_size=BATCHES, validation_data=(X_valid, Y_valid), verbose=0)
        valid_preds = model.predict_proba(X_valid)
        ll = metrics.log_loss(y_valid, valid_preds)
        print("LL:", ll)
        av_ll += ll
    print('Average LL:', av_ll / nb_folds)
    
print("Generating submission...")

model = build_model(input_dim, output_dim, HN)
model.fit(X, Y, nb_epoch=EPOCHS, batch_size=BATCHES, verbose=0)

print('Predicting over testing data...')
preds = model.predict_proba(X_test, verbose=0)

create_subsmission(preds)

In [None]:
import xgboost as xgb

xg_train = xgb.DMatrix(X_train, label=Y_train)
xg_test = xgb.DMatrix(X_test, label=Y_test)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 6

watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist );
# get prediction
pred = bst.predict( xg_test );

print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist );
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
yprob = bst.predict( xg_test ).reshape( test_Y.shape[0], 6 )
ylabel = np.argmax(yprob, axis=1)

print ('predicting, classification error=%f' % (sum( int(ylabel[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))

In [8]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

clf = RandomForestClassifier(warm_start=True, oob_score=True, max_features="sqrt")
clf.fit(X_train, Y_train)

  warn("Some inputs do not have OOB scores. "


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=True)

In [4]:
from sklearn import neighbors

n_neighbors = 1
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='distance') # 'uniform', 'distance'
clf.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='distance')

In [11]:
p = clf.predict(X_test)
p

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [70]:
from sklearn import gaussian_process

categories = set(Y_train)
predictions = dict()

for cat in categories:
    selector = np.array((Y_train == cat), dtype = int)
    x = X_train[selector]
    y = Y_train[selector]
    
    gp = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
    gp.fit(x, y)

    Y_pred, sigma2_pred = gp.predict(X_test, eval_MSE=True)
    predictions[cat] = Y_pred

[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 1 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[1 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 1 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 1 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]
[0 0 1 ..., 0 0 0]


In [11]:
from sklearn import gaussian_process
from sklearn.metrics import log_loss
from sklearn import cross_validation

n = 7000
x_train = X_train[:n]
y_hot = np_utils.to_categorical(Y_train)[:n]

x_train, x_test, y_hot, y_test = cross_validation.train_test_split(x_train, y_hot, test_size=0.25, random_state=0)

gp = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
gp.fit(x_train, y_hot)
y_pred, sigma2_pred = gp.predict(x_test, eval_MSE=True)
log_loss(y_test, y_pred)

17.779953560468538

In [12]:
y_hot.shape

(878049L, 39L)

In [87]:
np_utils.to_categorical(Y_train).shape
logloss

(878049L, 39L)