In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
%matplotlib inline

from datetime import datetime, time
from sklearn.neighbors import KNeighborsClassifier
from sknn.mlp import Classifier, Layer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier
from sklearn.cross_validation import KFold, train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV

In [2]:
# Load the training CSV into `df`; later cells read this frame and get_vars() modifies it in place.
df = pd.read_csv('train.csv.gz')

In [3]:
# df.total_time.map(lambda x: np.log10(x+1)).hist()

In [4]:
# Split each "first/second" colour label once and keep both parts
# (rows with a single colour have no second part).
color_parts = df.Color.str.split('/')
df_color = pd.DataFrame({'color1': color_parts.str[0], 'color2': color_parts.str[1]})

# Count how often each individual colour appears in either position ...
vc_color = pd.concat((df_color['color1'], df_color['color2'])).value_counts()
# ... and keep the labels seen more than 300 times; get_vars() reads this name.
common_colors = vc_color[vc_color > 300].index

In [5]:
def get_vars(df, cols_x, cols_y=None):
    """Engineer features on ``df`` in place and return the model arrays.

    ``df`` is modified by this call: exact duplicate rows are dropped,
    ``DateTime`` is parsed to datetimes, and the columns ``weekday``,
    ``weekend``, ``viralata``, ``common_colors``, ``days_multiplyer``,
    ``total_time``, ``mixed_color``, ``year``, ``month``, ``day``, ``time``
    and ``name_len`` are added (or overwritten). The raw ``AgeuponOutcome``
    column is left untouched. The module-level ``common_colors`` collection
    is read to build the ``common_colors`` flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``DateTime``, ``Breed``, ``Color``,
        ``AgeuponOutcome`` and ``Name``, plus whatever ``cols_x`` /
        ``cols_y`` name.
    cols_x : list of str
        Columns passed through ``pd.get_dummies`` to form the feature matrix.
    cols_y : list of str, optional
        Target columns. When given, they are returned next to the features.

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        ``X`` alone when ``cols_y`` is empty/None, otherwise ``(X, y)`` with
        rows in the same order as ``df``.

    Notes
    -----
    NOTE(review): ``pd.get_dummies`` derives its columns from the values
    present in ``df``, so two frames with different category values would
    produce matrices of different widths - confirm train/test agree.
    """
    df.drop_duplicates(inplace=True)

    # Calendar flags.
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    df['weekday'] = df['DateTime'].dt.weekday
    df['weekend'] = (df['weekday'] > 4) * 1

    # Breed text mentions "Mix" or lists two breeds separated by "/".
    df['viralata'] = (df['Breed'].str.contains('Mix') | df['Breed'].str.contains('/')) * 1

    # Membership is tested on the full Color string, not on its "/" parts.
    df['common_colors'] = df['Color'].isin(common_colors) * 1

    # Age such as "2 years" / "3 weeks" -> approximate days -> log10(days + 1).
    # Rows with a missing age keep multiplier 0 and end up with total_time 0.
    age = df['AgeuponOutcome']
    df['days_multiplyer'] = 0
    for unit, days in (('year', 365), ('month', 30), ('week', 7), ('day', 1)):
        df.loc[age.str.contains(unit, na=False), 'days_multiplyer'] = days
    age_count = age.str.split().str[0].astype('float64')
    total_days = (age_count * df['days_multiplyer']).fillna(0)
    df['total_time'] = np.log10(total_days + 1)

    df['mixed_color'] = df['Color'].str.contains('/').astype('int')

    # Date parts, and minutes since midnight on a log10 scale.
    df['year'] = df['DateTime'].dt.year
    df['month'] = df['DateTime'].dt.month
    df['day'] = df['DateTime'].dt.day
    minutes = df['DateTime'].dt.hour * 60 + df['DateTime'].dt.minute
    df['time'] = np.log10(minutes + 1)

    # Name length; missing names get the median length of the present ones.
    name_len = df['Name'].str.len()
    df['name_len'] = name_len.fillna(name_len.median())

    X = pd.get_dummies(df[cols_x]).values
    if cols_y:
        # Rows are deliberately not re-sorted here: X was already extracted,
        # so reordering df would break the row alignment between X and y.
        return X, df[cols_y].values
    return X
    

In [6]:
# df.index = [df.index,df.Color]

In [7]:
# df.ix[df.mixed_color.isin(df.mixed_color.value_counts()[df.mixed_color.value_counts() == True].index)]['OutcomeType'].value_counts().plot(kind='bar')

In [8]:
# Columns handed to get_vars(): features (one-hot encoded there) and the target.
cols_x = [
    'AnimalType', 'SexuponOutcome', 'viralata', 'total_time',
    'weekday', 'weekend', 'common_colors', 'mixed_color',
    'year', 'name_len', 'month', 'day', 'time',
]
cols_y = ['OutcomeType']

In [9]:
# Build the feature matrix and target from the training frame, then flatten
# the single-column target array to one dimension.
X, Y = get_vars(df, cols_x, cols_y)
Y = Y.reshape(len(Y))

In [10]:
# df.ix[df.total_time.isnull()]

In [11]:
# One third of the number of rows currently in `df`.
# NOTE(review): looks like a sizing aid for the commented-out slices in the fold cell - confirm or remove.
df.shape[0]/3

8909.666666666666

In [20]:
# Seeded, shuffled, stratified 2-fold split of the rows by outcome class.
kf = StratifiedKFold(df.OutcomeType, n_folds=2, shuffle=True, random_state=500)
folds = list(kf)
# `a` and `b` are the training index arrays of the first and second fold.
a = folds[0][0]
b = folds[1][0]
# c = list(list(kf)[2][0])[0:8909]
# d = list(list(kf)[3][0])[0:6682]


# X_knn = X[train]
# Y_knn = Y[train]

# X_gb = X[train]
# Y_gb = Y[train]

# X_forest = X[train]
# Y_forest = Y[train]

# X_test = X[test]
# Y_test = Y[test]

# df_knn = df.ix[train]
# df_gb = df.ix[train]
# df_forest = df.ix[train]

# df_train = df

In [13]:
# train_x = X_train
# train_y = Y_train
# test_x = X_test
# test_y = Y_test

In [81]:
def train_neuralnetwork(train_x, train_y, test_x=None, test_y=None):
    """Cross-validate an sknn multilayer perceptron and return it.

    One ``Classifier`` (Rectifier 32 -> Sigmoid 64 -> Softmax, batch size 3,
    random_state 500) is fitted on the training part of each of 3 shuffled
    stratified folds of ``train_x`` / ``train_y``. The log loss of every
    held-out part and the mean over the folds are printed; the fold shuffle
    itself is not seeded. When ``test_x`` is given, the log loss against
    ``test_y`` is printed too (under the label 'train'). The elapsed
    wall-clock time is printed last.

    Returns the classifier; its most recent ``fit`` call used the last
    fold's training rows (there is no final fit on all of ``train_x``).
    """
    nfolds = 3
    started = datetime.now()

    # learning_rate and dropout_rate are not passed, so the library defaults apply.
    network = Classifier(
        layers=[
            Layer('Rectifier', units=32),
            Layer('Sigmoid', units=64),
            Layer("Softmax"),
        ],
        random_state=500,
        batch_size=3,
    )

    splitter = StratifiedKFold(train_y, n_folds=nfolds, shuffle=True)
    fold_losses = []
    print('Training neural...')
    for fold_no, (fit_idx, val_idx) in enumerate(splitter, start=1):
        network.fit(train_x[fit_idx], train_y[fit_idx])
        fold_loss = log_loss(train_y[val_idx], network.predict_proba(train_x[val_idx]))
        fold_losses.append(fold_loss)
        print('fold:', fold_no, 'log loss:', fold_loss)

    print(sum(fold_losses) / nfolds)

    if test_x is not None:
        print('fold:', 'train', 'log_loss:', log_loss(test_y, network.predict_proba(test_x)))
    print(datetime.now() - started)
    return network

In [82]:
def train_knn(train_x, train_y, test_x=None, test_y=None):
    """Cross-validate a 15-nearest-neighbour classifier and return it.

    A single ``KNeighborsClassifier(15)`` is fitted on the training part of
    each of 10 shuffled stratified folds of ``train_x`` / ``train_y``. The
    log loss of every held-out part and the mean over the folds are printed;
    the fold shuffle is not seeded. When ``test_x`` is given, the log loss
    against ``test_y`` is printed too (under the label 'train'). The elapsed
    wall-clock time is printed last.

    Returns the classifier; its most recent ``fit`` call used the last
    fold's training rows (there is no final fit on all of ``train_x``).
    """
    nfolds = 10
    started = datetime.now()

    neighbors = KNeighborsClassifier(15)

    splitter = StratifiedKFold(train_y, n_folds=nfolds, shuffle=True)
    fold_losses = []
    print('Training kneighbors...')
    for fold_no, (fit_idx, val_idx) in enumerate(splitter, start=1):
        neighbors.fit(train_x[fit_idx], train_y[fit_idx])
        fold_loss = log_loss(train_y[val_idx], neighbors.predict_proba(train_x[val_idx]))
        fold_losses.append(fold_loss)
        print('fold:', fold_no, 'log loss:', fold_loss)

    print(sum(fold_losses) / nfolds)

    if test_x is not None:
        print('fold:', 'train', 'log_loss:', log_loss(test_y, neighbors.predict_proba(test_x)))
    print(datetime.now() - started)
    return neighbors

In [83]:
def train_gb(train_x, train_y, test_x=None, test_y=None):
    """Cross-validate a gradient-boosting classifier and return it.

    A single ``GradientBoostingClassifier(min_samples_split=1000,
    max_depth=6)`` is fitted on the training part of each of 5 shuffled
    stratified folds of ``train_x`` / ``train_y``. The log loss of every
    held-out part and the mean over the folds are printed; the fold shuffle
    is not seeded. When ``test_x`` is given, the log loss against ``test_y``
    is printed too (under the label 'train'). The elapsed wall-clock time is
    printed last.

    Returns the classifier; its most recent ``fit`` call used the last
    fold's training rows (there is no final fit on all of ``train_x``).
    """
    nfolds = 5
    started = datetime.now()

    booster = GradientBoostingClassifier(min_samples_split=1000, max_depth=6)

    splitter = StratifiedKFold(train_y, n_folds=nfolds, shuffle=True)
    fold_losses = []
    print('Training gb...')
    for fold_no, (fit_idx, val_idx) in enumerate(splitter, start=1):
        booster.fit(train_x[fit_idx], train_y[fit_idx])
        fold_loss = log_loss(train_y[val_idx], booster.predict_proba(train_x[val_idx]))
        fold_losses.append(fold_loss)
        print('fold:', fold_no, 'log loss:', fold_loss)

    print(sum(fold_losses) / nfolds)

    if test_x is not None:
        print('fold:', 'train', 'log_loss:', log_loss(test_y, booster.predict_proba(test_x)))
    print(datetime.now() - started)
    return booster

In [89]:
def train(x, y):
    """Fit the three-stage stack (neural net -> k-NN -> gradient boosting).

    For each of 10 stratified folds of ``y`` the stack is trained from
    scratch on that fold's training rows:

    1. the neural network on the raw features,
    2. k-NN on [network probabilities | raw features],
    3. gradient boosting on [k-NN probabilities | raw features].

    The held-out rows of each fold are not used, and models from earlier
    folds are overwritten, so the returned ``(neuralnetwork, knn, gb)``
    triple comes from the last fold only.
    """
    outer = StratifiedKFold(y, random_state=500, n_folds=10)
    for fit_idx, _held_out in outer:
        x_fit, y_fit = x[fit_idx], y[fit_idx]

        neuralnetwork = train_neuralnetwork(x_fit, y_fit)

        # Stage 2 sees the network's probabilities next to the raw features.
        knn_input = np.concatenate((neuralnetwork.predict_proba(x_fit), x_fit), axis=1)
        knn = train_knn(knn_input, y_fit)

        # Stage 3 sees the k-NN probabilities next to the raw features.
        gb_input = np.concatenate((knn.predict_proba(knn_input), x_fit), axis=1)
        gb = train_gb(gb_input, y_fit)

    return neuralnetwork, knn, gb

In [90]:
def pred(neuralnetwork, knn, gb, X):
    """Run ``X`` through the fitted stack and return the final probabilities.

    The network's ``predict_proba`` output is placed in front of the columns
    of ``X`` and given to ``knn``; the k-NN output is placed in front of
    ``X`` again and given to ``gb``, whose ``predict_proba`` result is
    returned.
    """
    stacked = neuralnetwork.predict_proba(X)
    for model in (knn, gb):
        stacked = model.predict_proba(np.concatenate((stacked, X), axis=1))
    return stacked

In [None]:
# Fit the neural-net -> k-NN -> gradient-boosting stack on the rows indexed by `a`.
neuralnetwork, knn, gb = train(X[a], Y[a])

Training neural...
[(4009, 5)]
fold: 1 log loss: 1.2860733283
[(4008, 5)]
fold: 2 log loss: 1.24412987715
[(4007, 5)]
fold: 3 log loss: 1.26679732824
1.26566684456
0:00:53.781681
[(12024, 5)]
Training kneighbors...
fold: 1 log loss: 1.81414152634
fold: 2 log loss: 2.04206229693
fold: 3 log loss: 2.03888938813
fold: 4 log loss: 1.98397844949
fold: 5 log loss: 2.3003715546
fold: 6 log loss: 2.02653876104
fold: 7 log loss: 2.06845633448
fold: 8 log loss: 1.98761092987
fold: 9 log loss: 1.75693460235
fold: 10 log loss: 1.94240325894
1.99613871022
0:00:01.068352
Training gb...
fold: 1 log loss: 0.722454616469
fold:

In [None]:
# Class probabilities from the fitted stack for the rows indexed by `b`.
prd = pred(neuralnetwork, knn, gb, X[b])

In [None]:
# Log loss of the stacked predictions against the true outcomes of the `b` rows.
log_loss(Y[b], prd)

In [None]:
def get_test():
    """Read 'test.csv.gz' and return its feature matrix.

    The frame is passed through ``get_vars`` with the module-level
    ``cols_x`` and no target columns, so only ``X`` comes back.
    NOTE(review): the width of this matrix depends on the category values
    present in the test file - confirm it matches the training matrix.
    """
    return get_vars(pd.read_csv('test.csv.gz'), cols_x)

In [None]:
# Feature matrix built from 'test.csv.gz' with the same get_vars() pipeline.
X_test = get_test()

In [None]:
# Column labels for the output: the distinct outcome classes in sorted order.
# NOTE(review): presumably this matches the class order of gb.predict_proba - confirm.
cats = np.sort(df.OutcomeType.unique())

# One probability column per outcome class for every test row.
final_result = pd.DataFrame(pred(neuralnetwork, knn, gb, X_test), columns=cats)

In [None]:
# Replace the positional index with an 'Id' index that is one higher than the row position.
final_result = final_result.assign(Id=final_result.index + 1).set_index('Id')

In [None]:
# Write the predictions to a gzip-compressed CSV whose name carries the
# current local time down to the minute.
today = datetime.today()
t = today.strftime('%Y%m%d%H%M')
submission_name = 'result_%s.csv.gz' % t
final_result.to_csv(submission_name, compression='gzip')