# Load data

In [1]:
import pandas as pd

# Name of the sub-folder of data/ that holds this problem's CSV files.
problem = 'Animals'
data_dir = 'data/' + problem

# Both splits live side by side in the same folder.
train = pd.read_csv(data_dir + '/train.csv')
test = pd.read_csv(data_dir + '/test.csv')

#train.head()

In [2]:
#test.head()

# Preprocessing

### Drop useless features

In [3]:
# Remove columns that are not used as model inputs.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution no longer raises KeyError for
# columns that were already dropped.
train = train.drop(columns=['OutcomeSubtype', 'AnimalID'], errors='ignore')
train.head(2)

Unnamed: 0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,Emily,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby


### Generate new features from existing ones

__Add binary feature *'HasName'*__ showing whether the animal has a name

In [4]:
def process_name(df):
    """Add a binary 'HasName' column to ``df`` in place.

    The new column is 0 where 'Name' is missing and 1 everywhere else.
    """
    named = df['Name'].notnull()
    df['HasName'] = named.astype('int64')

__Transform *'AgeuponOutcome'*__ so that it shows the age of the animal __in days__

In [5]:
def process_age(df):
    """Convert the textual 'AgeuponOutcome' column of ``df`` to days, in place.

    String values are read as ``'<count> <unit>'`` (e.g. ``'2 weeks'``), where
    the unit is day/week/month/year in singular or plural form. A month counts
    as 30 days and a year as 365. Non-string entries become NaN.
    """
    base_units = (('day', 1), ('week', 7), ('month', 30), ('year', 365))

    # Register every unit under both its singular and plural spelling.
    unit_to_days = {}
    for unit, length in base_units:
        unit_to_days[unit] = length
        unit_to_days[unit + 's'] = length

    def to_days(text):
        if isinstance(text, str):
            tokens = text.split(' ')
            return int(tokens[0]) * unit_to_days[tokens[1]]
        return float('nan')

    df['AgeuponOutcome'] = df['AgeuponOutcome'].map(to_days)

Impute mean/median values instead of NaNs

In [6]:
def impute_age(src, dst):
    """Fill missing 'AgeuponOutcome' values in ``dst`` in place, using ages from ``src``.

    Rows of ``dst`` with a missing age are grouped by ('Breed', 'Mix',
    'SexuponOutcome'). For each group the matching rows of ``src`` supply the
    replacement: their median age when at least 30 known ages exist, otherwise
    their mean.

    Any age that is still missing afterwards (no matching rows in ``src``, or a
    missing value in one of the grouping columns) is filled with the overall
    median age of ``src``, so that no NaN is handed to the estimators.

    Parameters
    ----------
    src : DataFrame providing the reference ages (may be the same object as ``dst``).
    dst : DataFrame whose 'AgeuponOutcome' column is updated in place.
    """
    group_cols = ['Breed', 'Mix', 'SexuponOutcome']

    # Taken before any imputation so it only reflects originally known ages,
    # even when src and dst are the same frame.
    overall_median = src['AgeuponOutcome'].median()

    missing_rows = dst.loc[dst['AgeuponOutcome'].isnull()]
    for (breed, mix, sex), _ in missing_rows.groupby(group_cols):
        known = src.loc[
            (src['Breed'] == breed) &
            (src['Mix'] == mix) &
            (src['SexuponOutcome'] == sex),
            'AgeuponOutcome']
        # Median for well-populated groups, mean for small ones.
        value = known.median() if known.count() >= 30 else known.mean()
        if pd.isnull(value):
            # Nothing to learn from src for this group; handled by the fallback.
            continue
        dst.loc[
            dst['AgeuponOutcome'].isnull() &
            (dst['Breed'] == breed) &
            (dst['Mix'] == mix) &
            (dst['SexuponOutcome'] == sex),
            'AgeuponOutcome'] = value

    # Fallback for groups that could not be imputed above.
    if pd.notnull(overall_median):
        dst.loc[dst['AgeuponOutcome'].isnull(), 'AgeuponOutcome'] = overall_median

__Add *'Sterilized'* feature__ showing if animal was neutered or spayed

In [7]:
import re

def process_sex(df):
    """Derive 'Sterilized' and 'Sex' columns from 'SexuponOutcome', in place.

    * Missing 'SexuponOutcome' values are replaced by the string 'Unknown'.
    * 'Sterilized' is 1 when the text mentions neutered/spayed (case-insensitive),
      0 when it does not, and 2 when the value is 'Unknown'.
    * 'Sex' is 0 for male, 1 for female and 2 for anything else.

    All writes go through ``df[...]`` / ``df.loc[...]`` directly. Assigning via
    an intermediate Series (``df.col.fillna(inplace=True)``,
    ``df.col.loc[...] = ...``) triggers SettingWithCopyWarning and may leave
    ``df`` unchanged.
    """
    df['SexuponOutcome'] = df['SexuponOutcome'].fillna('Unknown')
    sex_text = df['SexuponOutcome']

    df['Sterilized'] = sex_text.str.contains(
        'neutered|spayed', case=False, regex=True).astype(int)
    df.loc[sex_text == 'Unknown', 'Sterilized'] = 2

    def shorten_sex(sex):
        if isinstance(sex, str):
            # 'Female' does not contain the capitalised substring 'Male',
            # so the order of these checks is safe.
            if 'Male' in sex:
                return 0
            elif 'Female' in sex:
                return 1
        return 2  # the Unknown

    df['Sex'] = sex_text.map(shorten_sex).astype('int')

__Subtract *'Mix'*__ from *'Breed'*

In [8]:
def process_breed(df):
    """Split the mixed-breed marker out of 'Breed', in place.

    * 'Mix' is 1 when the breed text contains '/' or 'Mix', else 0
      (a missing breed counts as 0).
    * A trailing ' Mix' suffix is removed from 'Breed'.

    The suffix is removed with an anchored regex rather than
    ``str.rstrip(' Mix')``: rstrip treats its argument as a set of characters,
    so it would also eat the last letters of names such as 'Corgi' or 'Manx'.
    """
    breed = df['Breed']
    df['Mix'] = breed.str.contains('/|Mix', regex=True, na=False).astype('int')
    df['Breed'] = breed.str.replace(r' Mix$', '', regex=True)

In [9]:
#breed_cnts = train.Breed.value_counts()
#sz = train.count()
#train['BreedFrequency'] = train.Breed.apply(lambda x: breed_cnts[x]/sz)

#### All preprocessing steps

In [10]:
from sklearn.preprocessing import LabelEncoder

def preprocess_df(df):
    """Run every preprocessing step on ``df`` in place.

    'AnimalType' is label-encoded with an encoder fitted on ``df`` itself, the
    name/age/sex/breed feature steps are applied, and finally missing ages are
    imputed using the module-level ``train`` frame as the reference.
    """
    df.AnimalType = LabelEncoder().fit_transform(df.AnimalType)

    # Order matters: impute_age groups on 'Mix', which process_breed creates,
    # and on the numeric ages produced by process_age.
    for step in (process_name, process_age, process_sex, process_breed):
        step(df)

    impute_age(train, df)

In [11]:
# Turn the outcome labels into integer class ids; the encoder is kept so the
# mapping can be inspected later.
outcome_encoder = LabelEncoder()
train['OutcomeType'] = outcome_encoder.fit_transform(train['OutcomeType'])

# train goes first: preprocess_df imputes ages from the (processed) train frame.
for frame in (train, test):
    preprocess_df(frame)

#display(train.head(2))
#display(test.head(2))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [59]:
# Columns fed to the models. The misspelled name `fetures` is kept because
# later cells reference it.
fetures = ['AnimalType', 'Sex', 'AgeuponOutcome', 'Sterilized', 'Mix']
X = train[fetures] #, 'BreedFrequency'
# Integer-encoded outcome labels (encoded in the preprocessing cell above).
y = train['OutcomeType']
X.head(2)

Unnamed: 0,AnimalType,Sex,AgeuponOutcome,Sterilized,Mix
0,1,0,365.0,1,1
1,0,1,365.0,1,1


# Visualization

In [60]:
#import seaborn as sns
#sns.set()
#sns.pairplot(train[fetures + ['OutcomeType']], size=2, hue='OutcomeType')

# Training

In [61]:
# fix random seed for reproducibility
import numpy as np
# One seed shared by NumPy's global RNG and the random_state arguments used
# by the split / cross-validation cells.
seed = 7
np.random.seed(seed)

In [62]:
from sklearn.model_selection import train_test_split
# Keep 85% of the rows for training; the remainder is the validation set.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.85,
    random_state=seed,
)

## Logistic Regression

In [63]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import model_selection

# Degree of the polynomial feature expansion (1 keeps the features linear).
degree = 1

# Scale, expand, then classify.
logfit = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree)),
    ('clf', LogisticRegression()),
])

# The scorer returns negative log loss, so the sign is flipped for display.
scores = model_selection.cross_val_score(
    estimator=logfit,
    X=X_train,
    y=y_train,
    scoring='neg_log_loss',
)
print('score: %s' % -scores.mean())

score: 1.04268114925


## KNN

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid for the search (currently a single candidate).
params = {
    'n_neighbors': list(range(30, 31)),
    #'weights': ['uniform', 'distance'],
    #'p': [1,2],
}

knn = model_selection.GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    scoring='neg_log_loss',
)
knn.fit(X_train, y_train);

# best_score_ is a negative log loss; flip the sign for display.
print('Best params for knn: ', knn.best_params_)
print('Best score: ', -knn.best_score_)

Best params for knn:  {'n_neighbors': 30}
Best score:  1.27714738564


## NN

In [32]:
import keras
from keras.models import Sequential
from keras.layers import Dense

def baseline_model(input_dim=None):
    """Build and compile the baseline feed-forward classifier.

    Architecture: one hidden Dense layer of 8 ReLU units followed by a
    5-unit softmax output, trained with categorical cross-entropy and Adam.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. Defaults to ``len(fetures)`` so the network
        always matches the feature matrix; the previous hard-coded value of 6
        did not match the 5 selected feature columns.

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    if input_dim is None:
        input_dim = len(fetures)

    # create model
    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, activation='relu'))
    model.add(Dense(5, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [33]:
from keras.wrappers.scikit_learn import KerasClassifier
# Wrap the Keras model builder so it can be used as a scikit-learn estimator.
estimator = KerasClassifier(build_fn=baseline_model, epochs=3, batch_size=5, verbose=1)

In [34]:
from sklearn.model_selection import KFold, cross_val_score
# 10 shuffled folds, seeded so the fold assignment is repeatable.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

In [35]:
from keras.utils import np_utils
# encode class values as integers
# NOTE(review): `y` was already label-encoded in the preprocessing cell, so
# this second encoding looks redundant — confirm before removing.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)
# Rebinds the global X to its standardized version: every cell executed after
# this one sees the scaled values instead of the original feature DataFrame.
X = StandardScaler().fit_transform(X)

In [22]:
# Cross-validate the Keras classifier on X (as left by the previous cell) and
# the one-hot targets, then report the mean and standard deviation of the
# per-fold score as percentages.
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 2/10

KeyboardInterrupt: 

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest behind the same scaling step as the other models.
# StandardScaler is used through the name imported in the logistic-regression
# cell; the former `preprocessing.StandardScaler()` raised NameError because
# no `preprocessing` module is imported in this notebook.
rf = Pipeline([
    ('scaler', StandardScaler()),
    #('poly_features', PolynomialFeatures(degree)),
    ('clf', RandomForestClassifier())])

# The scorer returns negative log loss, so the sign is flipped for display.
scores = model_selection.cross_val_score(rf, X, y, scoring='neg_log_loss')
print('score: %s' % -scores.mean())

# Submit to Kaggle

In [23]:
# Test-set counterpart of the training feature matrix (same columns, same order).
X_test = test[fetures]

In [40]:
from sklearn.preprocessing import label_binarize

def submit(clf, name):
    """Fit ``clf`` on the full training set and write a Kaggle submission file.

    The features are taken from the module-level ``train`` / ``test`` frames
    (columns ``fetures``) and standardized with a scaler fitted on the training
    features only, so the classifier sees identically scaled inputs when
    fitting and predicting. Previously the model was fitted on the global
    ``X``, which an earlier cell overwrites with standardized values, while
    predictions were made on the unscaled test features.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict_proba``.
    name : str
        Output file stem; the submission is written to ``name + '.csv'``.
    """
    train_features = train[fetures]
    test_features = test[fetures]
    scaler = StandardScaler().fit(train_features)

    clf.fit(scaler.transform(train_features), y)
    predicts = clf.predict_proba(scaler.transform(test_features))

    # Assumes the probability columns follow the alphabetical order of the
    # outcome labels, which is how the label encoder numbered the classes.
    answers = pd.DataFrame(
        predicts,
        columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
    # Attach IDs by position rather than by index alignment, so a non-default
    # index on `test` cannot misalign or drop rows.
    answers.insert(0, 'ID', test['ID'].values)
    answers.to_csv(name + '.csv', index=False)

In [41]:
#submit(logfit, 'logfit_poly_degree_' + str(degree))

In [42]:
# Fit the Keras classifier and write its predicted probabilities to nn.csv.
submit(estimator, 'nn')

