In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
train = pd.read_csv('input/train.csv', parse_dates=['DateTime'])
test = pd.read_csv('input/test.csv',  parse_dates=['DateTime'])

In [3]:
train.shape

(26729, 10)

In [4]:
train.columns

Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')

In [5]:
import re
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
sc = preprocessing.StandardScaler()
def convertToYears(age):
    if age == 999.0:
        return age
    num = int(re.search('\d+', age).group())
    unit = re.search('[A-Za-z]+', age).group()
    
    if 'year' in unit:
        return num
    if 'week' in unit:
        return num/52
    if 'month' in unit:
        return num/12
    if 'day' in unit:
        return num/365
    return num


def prepData(train, test):
    
    #Encode OutcomeType
    le.fit(train.OutcomeType.values)
    train.OutcomeType = le.transform(train.OutcomeType.values)
    
    allData = pd.concat([train, test])
    
    #Drop some columns for simplicity
    allData = allData.drop(['AnimalID', 'Name', 'DateTime', 'OutcomeSubtype'],axis=1)
    
    #Convert age to age in years
    allData['AgeuponOutcome'] = allData.AgeuponOutcome.fillna(999.0)
    allData['ageInYears'] = allData.AgeuponOutcome.apply(convertToYears)
    ageMean = allData[allData.ageInYears < 999.0].ageInYears.mean()
    allData.loc[allData['ageInYears'] == 999.0, 'ageInYears'] = ageMean
    allData['ageInYears'] = sc.fit_transform(allData.ageInYears.reshape(1,-1)).T
    allData = allData.drop('AgeuponOutcome', axis=1)
    
    #Get dummies
    dummies = pd.get_dummies(allData[['Breed', 'AnimalType', 'Color', 'SexuponOutcome']])
    allData = pd.concat([allData, dummies], axis=1)
    allData = allData.drop(['Breed', 'AnimalType', 'SexuponOutcome', 'Color'], axis=1)
       
    return (allData, le)

In [6]:
allData, le = prepData(train, test)

In [7]:
train = allData.iloc[:train.shape[0],:].drop('ID', axis=1)

In [8]:
test =  allData.iloc[train.shape[0]:,:].drop('OutcomeType', axis=1)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

params = {
    'C':[1]
}
lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
gs = GridSearchCV(lr, params, scoring='log_loss')

gs.fit(train.drop('OutcomeType', axis=1).values, train['OutcomeType'].values)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1, param_grid={'C': [1]},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [10]:
gs.best_score_ 

-0.98893376007745615

In [26]:
gs.best_params_

{'C': 1}

In [27]:
bestLr = gs.best_estimator_
bestLr.fit(train.drop('OutcomeType', axis=1).values, train['OutcomeType'].values)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [28]:
testIds = test.ID

In [29]:
probs = bestLr.predict_proba(test.drop('ID', axis=1))

In [36]:
probsDf = pd.DataFrame(data=probs, columns=le.inverse_transform([0,1,2,3,4]))

In [48]:
sol = pd.concat([testIds, probsDf], axis=1) 

In [53]:
sol.ID = sol.ID.astype(int)

In [57]:
sol.to_csv('lr5.csv',index =False)

0.98278	