In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
%matplotlib inline

from datetime import datetime, time
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sknn.mlp import Classifier, Layer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import *
from sklearn.cross_validation import KFold, train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV

In [2]:
# Load the training set into the module-level frame `df` that the cells below
# read from and that get_vars() later extends in place.
df = pd.read_csv('train.csv.gz')

In [3]:
# df.total_time.map(lambda x: np.log10(x+1)).hist()

In [4]:
# Break every "first/second" colour string into its two components, count how
# often each component name occurs in either position, and keep the names seen
# more than 300 times.
df_color = pd.DataFrame({
    'color1': df.Color.str.split('/').str.get(0),
    'color2': df.Color.str.split('/').str.get(1),
})
vc_color = pd.concat([df_color['color1'], df_color['color2']]).value_counts()
common_colors = vc_color.loc[vc_color > 300].index

In [5]:
def get_vars(df, cols_x, cols_y=None):
    """Engineer features on ``df`` in place and return the model matrices.

    ``df`` is mutated: duplicate rows are dropped (the index is not reset),
    ``DateTime`` is parsed, and the derived columns ``weekday``, ``weekend``,
    ``viralata``, ``common_colors``, ``days_multiplyer``, ``total_time``,
    ``mixed_color``, ``year``, ``month``, ``day``, ``time`` and ``name_len``
    are added, so later cells can select them from the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the ``DateTime``, ``Breed``, ``Color``, ``AgeuponOutcome`` and
        ``Name`` columns, plus whatever ``cols_x`` / ``cols_y`` name.
    cols_x : list of str
        Feature columns; passed through ``pd.get_dummies`` so non-numeric
        columns are one-hot encoded.
    cols_y : list of str, optional
        Target columns. When given and non-empty, the targets are returned too.

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, numpy.ndarray)
        ``X`` alone, or ``(X, y)`` with ``y = df[cols_y].values`` (2-D).
    """
    df.drop_duplicates(inplace=True)

    # Calendar features.
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    when = df['DateTime']
    df['weekday'] = when.dt.weekday
    df['weekend'] = (df['weekday'] > 4) * 1

    # "viralata" flags mixed breeds: the name says "Mix" or lists two breeds.
    df['viralata'] = (df.Breed.str.contains('Mix') | df.Breed.str.contains('/')) * 1

    # `common_colors` is a module-level collection defined outside this function.
    # NOTE(review): the whole Color string is compared, so if common_colors
    # holds single colour names a "A/B" value can never match -- confirm intent.
    df['common_colors'] = df.Color.isin(common_colors) * 1

    # Age arrives as "<count> <unit>"; convert it to days. Later units overwrite
    # earlier ones, and rows with a missing or unrecognised unit keep 0.
    age = df['AgeuponOutcome']
    df['days_multiplyer'] = 0
    for unit, days in (('year', 365), ('month', 30), ('week', 7), ('day', 1)):
        df.loc[age.str.contains(unit, na=False), 'days_multiplyer'] = days
    age_count = age.str.split().str[0].astype('float64')
    total_days = (age_count * df['days_multiplyer']).fillna(0)  # missing age -> 0 days
    df['total_time'] = np.log10(total_days + 1)

    df['mixed_color'] = df.Color.str.contains('/').astype('int')

    df['year'] = when.dt.year
    df['month'] = when.dt.month
    df['day'] = when.dt.day
    # Minutes since midnight, log-compressed.
    df['time'] = np.log10(when.dt.hour * 60 + when.dt.minute + 1)

    # Unnamed animals get the median name length.
    name_len = df.Name.str.len()
    df['name_len'] = name_len.fillna(name_len.median())

    X = pd.get_dummies(df[cols_x]).values
    if cols_y:
        return X, df[cols_y].values
    return X
    

In [6]:
# df.index = [df.index,df.Color]

In [7]:
# df.ix[df.mixed_color.isin(df.mixed_color.value_counts()[df.mixed_color.value_counts() == True].index)]['OutcomeType'].value_counts().plot(kind='bar')

In [8]:
# Feature columns handed to get_vars(); the order here is the column order
# that pd.get_dummies receives.
cols_x = [
    'AnimalType',
    'SexuponOutcome',
    'viralata',
    'total_time',
    'weekday',
    'weekend',
    'common_colors',
    'mixed_color',
    'year',
    'name_len',
    'month',
    'day',
    'time',
]
# Target column.
cols_y = ['OutcomeType']

In [9]:
# Build the feature matrix and target; get_vars returns the target as an
# (n, 1) array, so flatten it to length n for the estimators.
X, Y = get_vars(df, cols_x, cols_y)
Y = Y.reshape(len(Y))

In [10]:
# df.ix[df.total_time.isnull()]

In [11]:
# One third of the number of rows in the training frame.
len(df) / 3

8909.666666666666

In [12]:
# Seeded two-way stratified split. `a` and `b` are the training-index arrays of
# the first and second fold respectively (positional row numbers).
kf = StratifiedKFold(df.OutcomeType, n_folds=2, shuffle=True, random_state=500)
a, b = (fit_idx for fit_idx, _ in kf)
# c = list(list(kf)[2][0])[0:8909]
# d = list(list(kf)[3][0])[0:6682]


# X_knn = X[train]
# Y_knn = Y[train]

# X_gb = X[train]
# Y_gb = Y[train]

# X_forest = X[train]
# Y_forest = Y[train]

# X_test = X[test]
# Y_test = Y[test]

# df_knn = df.ix[train]
# df_gb = df.ix[train]
# df_forest = df.ix[train]

# df_train = df

In [13]:
# train_x = X_train
# train_y = Y_train
# test_x = X_test
# test_y = Y_test

In [14]:
def train_linregression(train_x, train_y, test_x=None, test_y=None):
    """Cross-validate an ``ExtraTreesRegressor`` and return the estimator.

    A shuffled 5-fold ``StratifiedKFold`` is built over ``train_y``. For each
    fold the regressor is refit on the training split and the ``log_loss`` of
    its predictions on the held-out split is printed, followed by the elapsed
    time. The returned estimator is the one left after the last fold, i.e.
    fitted on that fold's training split only.

    ``test_x`` and ``test_y`` are accepted but not used.

    NOTE(review): despite the name this fits an extra-trees regressor, and
    ``log_loss`` receives ``predict`` output rather than class probabilities --
    confirm this is intended.
    """
    n_splits = 5
    started = datetime.now()

    model = ExtraTreesRegressor(n_jobs=4)
    folds = StratifiedKFold(train_y, n_folds=n_splits, shuffle=True)

    print('Training extratrees...')
    fold_no = 0
    for fit_idx, holdout_idx in folds:
        fold_no += 1
        model.fit(train_x[fit_idx], train_y[fit_idx])
        fold_loss = log_loss(train_y[holdout_idx], model.predict(train_x[holdout_idx]))
        print('fold:', fold_no, 'log loss:', fold_loss)

    print(datetime.now() - started)
    return model

In [35]:
def train_rf(train_x, train_y, test_x=None, test_y=None, forest=None):
    """Cross-validate a random forest and return the fitted estimator.

    A shuffled 5-fold ``StratifiedKFold`` is built over ``train_y``. For each
    fold the estimator is refit on the training split and the ``log_loss`` of
    its ``predict_proba`` output on the held-out split is printed, followed by
    the elapsed time. The returned estimator is the one left after the last
    fold, i.e. fitted on that fold's training split only.

    Parameters
    ----------
    train_x, train_y : array-like
        Features and labels; both are indexed with the fold index arrays.
    test_x, test_y : optional
        Accepted but not used.
    forest : estimator, optional
        Estimator to (re)fit. When omitted a
        ``RandomForestClassifier(n_estimators=60, max_depth=15)`` is created.
        Pass it by keyword: the third positional slot is ``test_x``.

    Returns
    -------
    The fitted estimator.
    """
    nfolds = 5
    ini = datetime.now()

    # Identity check: `== None` would be routed through the estimator's __eq__.
    if forest is None:
        forest = RandomForestClassifier(n_estimators=60, max_depth=15)
    kf = StratifiedKFold(train_y, n_folds=nfolds, shuffle=True)

    print('Training randomforest...')
    for k, (fit_idx, holdout_idx) in enumerate(kf, start=1):
        forest.fit(train_x[fit_idx], train_y[fit_idx])
        proba = forest.predict_proba(train_x[holdout_idx])
        print('fold:', k, 'log loss:', log_loss(train_y[holdout_idx], proba))

    fim = datetime.now()
    print(fim - ini)
    return forest

In [25]:
def train_extratrees(train_x, train_y, test_x=None, test_y=None):
    """Cross-validate an ``ExtraTreesClassifier`` and return the estimator.

    A shuffled 5-fold ``StratifiedKFold`` is built over ``train_y``. For each
    fold the classifier (15 trees, 4 jobs) is refit on the training split and
    the ``log_loss`` of its ``predict_proba`` output on the held-out split is
    printed, followed by the elapsed time. The returned estimator is the one
    left after the last fold, i.e. fitted on that fold's training split only.

    ``test_x`` and ``test_y`` are accepted but not used.
    """
    n_splits = 5
    started = datetime.now()

    model = ExtraTreesClassifier(n_estimators=15, n_jobs=4)
    folds = StratifiedKFold(train_y, n_folds=n_splits, shuffle=True)

    print('Training extratrees...')
    fold_no = 0
    for fit_idx, holdout_idx in folds:
        fold_no += 1
        model.fit(train_x[fit_idx], train_y[fit_idx])
        fold_loss = log_loss(train_y[holdout_idx], model.predict_proba(train_x[holdout_idx]))
        print('fold:', fold_no, 'log loss:', fold_loss)

    print(datetime.now() - started)
    return model

In [26]:
def train_knn(train_x, train_y, test_x=None, test_y=None):
    """Cross-validate a 15-neighbour ``KNeighborsClassifier`` and return it.

    A shuffled 4-fold ``StratifiedKFold`` is built over ``train_y``. For each
    fold the classifier is refit on the training split and the ``log_loss`` of
    its ``predict_proba`` output on the held-out split is printed, followed by
    the elapsed time. The returned estimator is the one left after the last
    fold, i.e. fitted on that fold's training split only.

    ``test_x`` and ``test_y`` are accepted but not used.
    """
    n_splits = 4
    started = datetime.now()

    model = KNeighborsClassifier(n_neighbors=15)
    folds = StratifiedKFold(train_y, n_folds=n_splits, shuffle=True)

    print('Training kneighbors...')
    fold_no = 0
    for fit_idx, holdout_idx in folds:
        fold_no += 1
        model.fit(train_x[fit_idx], train_y[fit_idx])
        fold_loss = log_loss(train_y[holdout_idx], model.predict_proba(train_x[holdout_idx]))
        print('fold:', fold_no, 'log loss:', fold_loss)

    print(datetime.now() - started)
    return model

In [27]:
def train_gb(train_x, train_y, test_x=None, test_y=None):
    """Cross-validate a ``GradientBoostingClassifier`` and return the estimator.

    A shuffled 5-fold ``StratifiedKFold`` is built over ``train_y``. For each
    fold the classifier (``min_samples_split=1000``, ``max_depth=6``) is refit
    on the training split and the ``log_loss`` of its ``predict_proba`` output
    on the held-out split is printed, followed by the elapsed time. The
    returned estimator is the one left after the last fold, i.e. fitted on
    that fold's training split only.

    ``test_x`` and ``test_y`` are accepted but not used.
    """
    n_splits = 5
    started = datetime.now()

    model = GradientBoostingClassifier(min_samples_split=1000, max_depth=6)
    folds = StratifiedKFold(train_y, n_folds=n_splits, shuffle=True)

    print('Training gb...')
    fold_no = 0
    for fit_idx, holdout_idx in folds:
        fold_no += 1
        model.fit(train_x[fit_idx], train_y[fit_idx])
        fold_loss = log_loss(train_y[holdout_idx], model.predict_proba(train_x[holdout_idx]))
        print('fold:', fold_no, 'log loss:', fold_loss)

    print(datetime.now() - started)
    return model

In [28]:
# Column groups for the stacked model: cols_1 feeds the first-stage random
# forest and cols_4 is appended to its probabilities for the second stage.
# cols_2 and cols_3 are empty placeholders.
cols_1 = [
    'common_colors',
    'SexuponOutcome',
    'mixed_color',
]
cols_2 = list()
cols_3 = list()
cols_4 = [
    'total_time',
    'time',
    'name_len',
    'year',
    'month',
    'AnimalType',
    'viralata',
    'weekend',
]

In [70]:
def train(x, y):
    """Fit the two-stage model: a random forest feeding a gradient booster.

    Stage one fits a random forest (via ``train_rf``) on the one-hot encoded
    ``cols_1`` columns, once per fold of a shuffled 2-fold ``StratifiedKFold``,
    reusing the same estimator object for the second fold. Stage two fits a
    gradient booster (via ``train_gb``) on the forest's class probabilities
    concatenated with the one-hot encoded ``cols_4`` columns.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the ``cols_1`` and ``cols_4`` columns.
    y : numpy.ndarray
        Labels, positionally aligned with the rows of ``x``.

    Returns
    -------
    (rf1, gb)
        The random forest as left after the last fold, and the gradient booster.
    """
    x1 = pd.get_dummies(x[cols_1]).values
    x4 = pd.get_dummies(x[cols_4]).values
    nfolds = 2
    kf = StratifiedKFold(y, n_folds=nfolds, shuffle=True)

    rf1 = None
    list_ens = []
    for fit_idx, _ in kf:
        # The forest must go in by keyword: train_rf's third positional slot is
        # test_x, so a positional argument never reached `forest`.
        rf1 = train_rf(x1[fit_idx], y[fit_idx], forest=rf1)
        list_ens.append(rf1.predict_proba(x1[fit_idx]))

    # With two folds the two training splits together contain len(x) rows,
    # which is what lets ens1 be stacked next to ens2 and x4 below.
    # NOTE(review): ens1 is ordered fold by fold, so its row i does not in
    # general describe row i of x4. pred() builds the same layout -- confirm
    # whether held-out predictions in original row order were intended.
    ens1 = np.concatenate(list_ens)
    ens2 = rf1.predict_proba(x1)

    gb = train_gb(np.concatenate((ens1, ens2, x4), axis=1), y)

    return rf1, gb

In [80]:
def pred(rf, gb, x):
    """Return class probabilities from the two-stage model for the rows of ``x``.

    Rebuilds the feature layout used by ``train``: forest probabilities for
    the training split of each fold of a shuffled 2-fold ``KFold``
    (concatenated), forest probabilities for all rows, and the one-hot encoded
    ``cols_4`` columns.

    Parameters
    ----------
    rf : fitted classifier
        First-stage model; only ``predict_proba`` is called.
    gb : fitted classifier
        Second-stage model; only ``predict_proba`` is called.
    x : pandas.DataFrame
        Must contain the ``cols_1`` and ``cols_4`` columns.

    Returns
    -------
    numpy.ndarray
        ``gb.predict_proba`` output, one row per row of ``x``.

    NOTE(review): ``pd.get_dummies`` is applied to ``x`` on its own, so the
    encoded column set depends on which categories occur in ``x`` -- verify it
    matches what the models were fitted on.
    """
    x1 = pd.get_dummies(x[cols_1]).values
    x4 = pd.get_dummies(x[cols_4]).values

    kf = KFold(x1.shape[0], n_folds=2, shuffle=True)

    # NOTE(review): as in train(), the rows of ens1 follow the shuffled fold
    # order rather than the row order of x4.
    ens1 = np.concatenate([rf.predict_proba(x1[fit_idx]) for fit_idx, _ in kf])
    ens2 = rf.predict_proba(x1)

    return gb.predict_proba(np.concatenate((ens1, ens2, x4), axis=1))

In [68]:
# pd.DataFrame(extra.feature_importances_).plot(kind='bar')

In [71]:
# `a` holds positional row numbers (it also indexes the array Y), so select the
# rows with .iloc: get_vars drops duplicates without resetting the index, and
# label-based .loc would then pick different rows or raise.
rf, gb = train(df.iloc[a], Y[a])

Training randomforest...
fold: 1 log loss: 1.14753451152
fold: 2 log loss: 1.10999827482
fold: 3 log loss: 1.05132460066
fold: 4 log loss: 1.05522401959
fold: 5 log loss: 1.06959495386
0:00:00.657046
Training randomforest...
fold: 1 log loss: 1.03050958984
fold: 2 log loss: 1.07783227276
fold: 3 log loss: 1.08130098108
fold: 4 log loss: 1.16270249276
fold: 5 log loss: 1.06906843488
0:00:00.640564
Training gb...
fold: 1 log loss: 0.795002887442
fold: 2 log loss: 0.810095209444
fold: 3 log loss: 0.795879489188
fold: 4 log loss: 0.801095273457
fold: 5 log loss: 0.810841373546
0:01:08.451742


In [81]:
# Score the rows selected by `b` with the two-stage model fitted above.
prd = pred(rf=rf, gb=gb, x=df.iloc[b])

In [82]:
# Log loss of the predicted probabilities against the labels of the `b` rows.
log_loss(y_true=Y[b], y_pred=prd)

0.81174944999387144

In [None]:
# 0.80637134429378865

In [None]:
def get_test():
    """Read ``test.csv.gz`` and return its feature matrix.

    The frame is run through ``get_vars`` with the ``cols_x`` feature list and
    no target columns, so only the feature array is returned.
    """
    return get_vars(pd.read_csv('test.csv.gz'), cols_x)

In [None]:
# Feature array for the test file, as returned by get_vars() without targets.
X_test = get_test()

In [None]:
# Sorted class labels, used as the column headers of the probability table.
# NOTE(review): assumes this matches the column order of gb.predict_proba --
# confirm against gb.classes_.
cats = np.sort(df.OutcomeType.unique())

# pred() takes (rf, gb, frame) and selects columns by name, so it needs the
# engineered test DataFrame rather than a feature array. The previous call
# passed four arguments, two of them names that are never defined.
df_test = pd.read_csv('test.csv.gz')
get_vars(df_test, cols_x)  # called for its in-place feature engineering
final_result = pred(rf, gb, df_test)
final_result = pd.DataFrame(final_result, columns=cats)

In [None]:
# Replace the index with a column named Id holding the current index plus one.
final_result = final_result.assign(Id=final_result.index + 1).set_index('Id')

In [None]:
# Write the result as a gzip-compressed CSV whose name carries the current
# timestamp (year, month, day, hour, minute).
today = datetime.today()
t = today.strftime('%Y%m%d%H%M')
final_result.to_csv(path_or_buf='result_%s.csv.gz' % t, compression='gzip')