In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.path as path
%matplotlib inline

from datetime import datetime, time
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sknn.mlp import Classifier, Layer
from sklearn.linear_model import LinearRegression, RidgeClassifierCV, LogisticRegression
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import *
from sklearn.cross_validation import KFold, train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import log_loss
from sklearn.grid_search import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE
from sklearn import cross_validation

In [97]:
# def get_data():
df = pd.read_csv('train.csv.gz')

In [98]:
# df.total_time.map(lambda x: np.log10(x+1)).hist()

In [99]:
def get_vars(df, cols_x, cols_y=None):
    minmax = MinMaxScaler()
    
    df.drop_duplicates(inplace=True)
    df.DateTime = pd.to_datetime(df.DateTime)
    df['weekday'] = df.DateTime.dt.weekday
    df['weekend'] = (df.weekday > 4) * 1
    df['viralata'] = (df.Breed.str.contains('Mix') | df.Breed.str.contains('/')) *1
    df['named'] = ~df.Name.isnull() * 1
    df['sex'] = df.SexuponOutcome.str.split(' ').str[1]
    df['castrado'] = (df.SexuponOutcome.str.split(' ').str[0].str.contains('Neutered') | df.SexuponOutcome.str.split(' ').str[0].str.contains('Spayed')) * 1
    df['first_color'] = df.Color.str.split('/').str[0]
    df['second_color'] = df.Color.str.split('/').str[1]
    df['first_breed'] = df.Breed.str.replace('Mix','').str.split('/').str[0].str.strip()
    df['second_breed'] = df.Breed.str.replace('Mix','').str.split('/').str[1].str.strip()
    #     df.ix[df.AgeuponOutcome.isnull(),'AgeuponOutcome'] = 0
    filt = df.AgeuponOutcome.str.contains('year')
    df['days_multiplyer'] = 0
    df.ix[df.AgeuponOutcome.str.contains('year')==True,'days_multiplyer'] = 365
    df.ix[df.AgeuponOutcome.str.contains('month')==True,'days_multiplyer'] = 30
    df.ix[df.AgeuponOutcome.str.contains('week')==True,'days_multiplyer'] = 7
    df.ix[df.AgeuponOutcome.str.contains('day')==True,'days_multiplyer'] = 1
    df.totaltime = df.AgeuponOutcome.str.split().str[0]
    df.ix[df.totaltime.isnull(),'AgeuponOutcome'] = 0
    df['total_time'] = df.totaltime.astype('float64') * df.days_multiplyer
    
    df.ix[df.total_time.isnull(), 'total_time'] = 0
#     df.total_time = df.total_time.map(lambda x: np.log10(x+1))
    df['mixed_color'] = df.Color.str.contains('/').astype('int')
    
    df['year'] = df.DateTime.dt.year
    df['month'] = df.DateTime.dt.month
    df['yearmonth'] = df.DateTime.dt.month + df.DateTime.dt.year
    df['day'] = df.DateTime.dt.day
    df['time'] = df.DateTime.dt.hour * 60 + df.DateTime.dt.minute
#     df.time = df.time.map(lambda x: np.log10(x+1))
    
    df['name_len'] = df.Name.str.len()
    df.ix[df.name_len.isnull(), 'name_len'] = df.name_len.median()
    
    X = pd.get_dummies(df[cols_x]).values
    if cols_y:            
        df.ix[df.OutcomeType == 'Return_to_owner', 'IndY'] = 1
        df.ix[df.OutcomeType == 'Euthanasia', 'IndY'] = 2
        df.ix[df.OutcomeType == 'Adoption', 'IndY'] = 3
        df.ix[df.OutcomeType == 'Transfer', 'IndY'] = 4
        df.ix[df.OutcomeType == 'Died', 'IndY'] = 5
        df.sort_values(by='OutcomeType')        
        return X, df[cols_y].values
    else:
        return df
    

In [107]:
cols_x = [
    'AnimalType','SexuponOutcome','viralata',
    'total_time','weekday','weekend','mixed_color',
    'year', 'name_len', 'month', 
    'day', 'time','named',
    'sex','castrado','first_color','second_color','first_breed','second_breed']
cols_y = ['OutcomeType']

In [108]:
cols_1 = [
    'AnimalType',
    'viralata','weekend','total_time','time','name_len','year','month',
    'named','sex','castrado','first_color','second_color','first_breed','second_breed'
]
# cols_1 = ['common_colors','SexuponOutcome','mixed_color','AnimalType','viralata','weekend','year']
cols_rf = ['total_time', 'SexuponOutcome', 'AnimalType']
cols_extra = ['total_time','time', 'SexuponOutcome', 'AnimalType']
cols_gb = ['total_time','time','name_len', 'month', 'AnimalType', 'SexuponOutcome']

In [125]:
X, Y = get_vars(df, cols_x, cols_y)
X = pd.get_dummies(df[cols_1])
gb_features = X.columns
Y = Y.reshape(Y.shape[0])

In [126]:
kf = StratifiedKFold(Y, n_folds=2, shuffle=True, random_state=500)
a = list(kf)[0][0]

In [129]:
X = X.loc[a]
Y = Y[a]

In [130]:
def train_regression(forest, train_x, train_y, test_x=None, test_y=None):
    nfolds = 5
    ini = datetime.now()

    kf = StratifiedKFold(train_y, n_folds=nfolds, shuffle=True, random_state=500)
    log_acc = 0
    print('Training linear...')
    for k, (train, test) in enumerate(kf, start=1):
        forest.fit(train_x[train], train_y[train])
        pred = forest.predict(train_x[test]) 
        del pred
        
#     log_acc = 0
#     print('Training ridge...')
#     forest.fit(train_x, train_y)
#     pred = forest.predict(train_x) 
#     del pred
    
    fim = datetime.now()
    print(fim - ini)
    return forest

In [131]:
def train_classifier(forest, train_x, train_y, nfolds, test_x=None, test_y=None):
    ini = datetime.now()

    kf = StratifiedKFold(train_y, n_folds=nfolds, shuffle=True)
    
    print('Training classifier...')
    for k, (train, test) in enumerate(kf, start=1):
        forest.fit(train_x[train], train_y[train])
        print(k, ' ', end="")
        
    scores = cross_validation.cross_val_score(forest, train_x, train_y, cv=5, scoring='log_loss')
    print("Log loss: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    
    fim = datetime.now()
    print(fim - ini)
    print('')
    return forest

In [200]:
rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=50,
        max_leaf_nodes=500,
        n_jobs=4,
        min_weight_fraction_leaf=.1,
        max_features=1.0,
        random_state=500,
        criterion='gini')  

In [None]:
rf.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=50, max_features=1.0, max_leaf_nodes=500,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.1, n_estimators=300, n_jobs=4,
            oob_score=False, random_state=500, verbose=0, warm_start=False)

In [None]:
scores = cross_validation.cross_val_score(rf, X, Y, cv=5, scoring='log_loss')
print("Log loss: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

In [177]:
def train(x, y, y_ind):    
    x1 = x.values
    x1_log2 = np.log2(x1+1)
    x1_log10 = np.log10(x1+1)
    

    extra = ExtraTreesClassifier(
        n_estimators=400,
        max_depth=50,
        max_leaf_nodes=200,
        n_jobs=4, 
        random_state=500)    
    rf_gini = RandomForestClassifier(
        n_estimators=400,
        max_depth=75,
        max_leaf_nodes=40,
        n_jobs=4,
        random_state=500,
        criterion='gini')        
    knn = KNeighborsClassifier(500)
    gb = GradientBoostingClassifier(min_samples_split=1000, max_depth=6, random_state=500)

    ada = AdaBoostClassifier(n_estimators=75)

    extra = train_classifier(extra, x1, y, 5)
    rf_gini = train_classifier(rf_gini, x1, y, 10)
    knn = train_classifier(knn, x1, y, 4)
    gb = train_classifier(gb, x1, y, 5)    
    ada = train_classifier(ada, x1, y, 2)
    
    
    print('\n')

    return extra, rf_gini, knn, gb, ada

In [78]:
# %%time
extra, rf, knn, gb, ada = train(
    X,
    Y,
    df.IndY.values
)


Training classifier...
1  2  3  4  5  Log loss: -1.01 (+/- 0.01)
0:00:07.953688

Training classifier...
1  2  3  4  5  6  7  8  9  10  Log loss: -0.99 (+/- 0.01)
0:00:12.173348

Training classifier...
1  2  3  4  Log loss: -1.18 (+/- 0.04)
0:00:00.887559

Training classifier...
1  2  3  4  5  Log loss: -0.88 (+/- 0.03)
0:00:56.924457

Training classifier...
1  2  Log loss: -1.44 (+/- 0.03)
0:00:02.352765





In [79]:
# 1.46

In [80]:
def pred(extra, rf, knn, gb, x):
    x1 = x.values[a]
    
    one = extra.predict_proba(xextra)
    two = rf.predict_proba(xrf)
    three = knn.predict_proba(x1_log10)
    four = gb.predict_proba(xgb)
    newx = [one, two, three, four]
    return newx


In [88]:
def get_test():
    df_test = pd.read_csv('test.csv.gz')    
    X_test = get_vars(df_test, cols_x)

    return X_test

In [89]:
X_test = get_test()
X_test = pd.get_dummies(X_test)

In [125]:
missing_cols = list(gb_features - X_test.columns)

  if __name__ == '__main__':


In [127]:
for col in missing_cols:
    X_test[col] = 0

In [129]:
cats = df.OutcomeType.unique()
cats.sort()
final_result = gb.predict_proba(X_test[gb_features].values)
final_result = pd.DataFrame(final_result, columns=cats)

In [None]:
cats = df.OutcomeType.unique()
cats.sort()

first, second, thirdth, fourth  = pred(extra, rf, knn, gb, X_test)
final_result = first*one + second*two +  fourth*four # +thirdth*three

# final_result = pred(extra, gb, X_test)
final_result = pd.DataFrame(final_result, columns=cats)

In [131]:
final_result['Id'] = final_result.index+1
final_result.set_index('Id', inplace=True)

In [132]:
today = datetime.today()
t = today.strftime('%Y%m%d%H%M')
final_result.to_csv('result_%s.csv.gz'%t, compression='gzip')