## Shelter Animal Outcomes
### MIDS W207 Final Project
### Clay Miller, Roseanna Hopper, Yubo Zhang

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
from bokeh.charts import Bar, output_file, show, output_notebook
output_notebook()

%matplotlib inline

In [60]:
# Load the data
data = pd.read_csv('../data/train.csv')
breeds = pd.read_csv('../data/breeds.csv')
breeds['Breed'] = breeds['Breed'].str.strip()
top_breed_list = []
for b in breeds['Breed']:
    top_breed_list.append(b.strip())
data['OutcomeSubtype'] = data['OutcomeSubtype'].fillna('')
data['Female'] = 'Female' in data['SexuponOutcome']
data['AgeuponOutcome'].fillna('', inplace = True)

def ageConvert(age):
    regexyear = '(\d+) year'
    regexmnth = '(\d+) month'
    regexwk = '(\d+) week'
    regexday = '(\d+) day'
    if re.match(regexyear, age):
        const = int(re.match(regexyear, age).groups()[0])
        return const*52
    elif re.match(regexmnth, age):
        const = int(re.match(regexmnth, age).groups()[0])
        return const*4.5 # a month is roughly 4.5 weeks
    elif re.match(regexwk, age):
        return int(re.match(regexwk, age).groups()[0])
    elif re.match(regexday, age):
        const = int(re.match(regexday, age).groups()[0])
        return const/7 #7 days in a week
    else:
        return None
    
data['ConvertedAge']=data['AgeuponOutcome'].apply(ageConvert)


def female(i):
    i = str(i)
    if i.find('Female') >= 0: return 'Female'
    if i.find('Unknown') >= 0: return 'Unknown'
    return 'Male'
data['Female'] = data.SexuponOutcome.apply(female)

def intact(i):
    i = str(i)
    if i.find('Intact') >= 0: return 'Intact'
    if i.find('Unknown') >= 0: return 'Unknown'
    return 'Spayed/Neutered'
data['Intact'] = data.SexuponOutcome.apply(intact)

def mixed_breed(i):
    i = str(i)
    if i.find('Mix') >= 0: return 'Mixed Breed'
    if i.find('/') >= 0: return 'Known Breed Combo'
    return 'Nonmixed'
data['MixedBreed'] = data.Breed.apply(mixed_breed)

def top_breed(i):
    i = str(i)
    if any(word in i for word in top_breed_list):
        return int(1)
    else:
        return int(0)
data['TopBreed'] = data.Breed.apply(top_breed)

def breed_rank(i):
    i = str(i)
    ranks = []
    for word in top_breed_list:
        if word in i:
            ranks.append(int(breeds.loc[breeds['Breed'] == word]['2007']))
    if len(ranks) > 0:
        return np.mean(ranks)
    else:
        return 51.0
data['BreedRank'] = data.Breed.apply(breed_rank)

def pit_bull(i):
    i = str(i)
    if i.find("Pit Bull") >=0: return int(1)
    else: return int(0)
data['PitBull'] = data.Breed.apply(pit_bull)

def black_cat(i):
    i = str(i)
    if i == "Black": return int(1)
    else: return int(0)
data['BlackCat'] = data.Color.apply(black_cat)

def naming(i):
    if pd.isnull(i): return 'Unnamed'
    return 'Named'
data['Named'] = data.Name.apply(naming)

#Change all breed and color strings so that they are ordered consistently
#E.G. all "brown/black" and "black/brown" should become "black, brown"
def reorder(i):
    i = str(i)
    if i.find(" ") >= 0: i = i.replace(" ", "-")
    if i.find("/") >= 0: i = i.replace("/", " ")
    i = i.split()
    i = sorted(i)
    i = ' '.join(i)
    return i

data['OrderedColor'] = data.Color.apply(reorder)
data['OrderedBreed'] = data.Breed.apply(reorder)


In [61]:
data.head(10)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,...,ConvertedAge,Intact,MixedBreed,TopBreed,BreedRank,PitBull,BlackCat,Named,OrderedColor,OrderedBreed
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,...,52.0,Spayed/Neutered,Mixed Breed,1,20.0,0,0,Named,Brown White,Shetland-Sheepdog-Mix
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,...,52.0,Spayed/Neutered,Mixed Breed,0,51.0,0,0,Named,Cream-Tabby,Domestic-Shorthair-Mix
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,...,104.0,Spayed/Neutered,Mixed Breed,0,51.0,1,0,Named,Blue White,Pit-Bull-Mix
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,...,3.0,Intact,Mixed Breed,0,51.0,0,0,Unnamed,Blue-Cream,Domestic-Shorthair-Mix
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,...,104.0,Spayed/Neutered,Known Breed Combo,1,28.5,0,0,Unnamed,Tan,Lhasa-Apso Miniature-Poodle
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,...,4.5,Intact,Known Breed Combo,1,30.0,0,0,Named,Black Tan,Cairn-Terrier Chihuahua-Shorthair
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,...,3.0,Intact,Mixed Breed,0,51.0,0,0,Named,Blue-Tabby,Domestic-Shorthair-Mix
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby,...,3.0,Unknown,Mixed Breed,0,51.0,0,0,Unnamed,Brown-Tabby,Domestic-Shorthair-Mix
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White,...,22.5,Spayed/Neutered,Mixed Breed,0,51.0,1,0,Named,Red White,American-Pit-Bull-Terrier-Mix
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White,...,52.0,Spayed/Neutered,Nonmixed,1,48.0,0,0,Unnamed,White,Cairn-Terrier


In [64]:
#Remove unnecessary variables
#del data['AnimalID']
#del data['Name']
#del data['DateTime']
#del data['Breed']
#del data['OutcomeSubtype']
#del data['SexuponOutcome']
#del data['AgeuponOutcome']
#del data['Color']
#del data['OrderedBreed']
#del data['OrderedColor']

#BreedRank, TopBreed, PitBull
continuous = {'dog':['ConvertedAge', 'BreedRank'], 'cat':['ConvertedAge']}
discrete = {'dog':[
    'AnimalType',
    'Female',
    'Intact',
    'MixedBreed',
    'Named',
    'TopBreed',
    'PitBull'
], 'cat': [
    'AnimalType',
    'Female',
    'Intact',
    'MixedBreed',
    'Named',
    'BlackCat'
]}


predictors = {'dog': continuous['dog'] + discrete['dog'], 'cat':continuous['cat']+discrete['cat']}
target = 'OutcomeType'

print data.ConvertedAge.unique()

#For those missing an age, fill with the median age by animal type
data["ConvertedAge"] = data.groupby("AnimalType").transform(lambda x: x.fillna(x.median()))
data[continuous['dog']].describe().T


[  5.20000000e+01   1.04000000e+02   3.00000000e+00   4.50000000e+00
   2.25000000e+01   2.08000000e+02   1.35000000e+01   2.00000000e+00
   9.00000000e+00   4.50000000e+01   2.70000000e+01   2.60000000e+02
   3.64000000e+02   1.56000000e+02   1.80000000e+01   6.24000000e+02
   4.68000000e+02   3.12000000e+02   1.00000000e+00   5.72000000e+02
   4.00000000e+00   3.15000000e+01   4.16000000e+02   4.95000000e+01
   0.00000000e+00   4.05000000e+01   3.60000000e+01   7.80000000e+02
   5.20000000e+02   7.28000000e+02   5.00000000e+00   8.32000000e+02
   6.76000000e+02   8.84000000e+02   9.36000000e+02   9.88000000e+02
   1.04000000e+03]


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ConvertedAge,26729.0,113.370253,154.064766,0.0,9.0,52.0,156.0,1040.0
BreedRank,26729.0,35.975981,20.246237,1.0,12.0,51.0,51.0,51.0


In [65]:
predictors['dog']

['ConvertedAge',
 'BreedRank',
 'AnimalType',
 'Female',
 'Intact',
 'MixedBreed',
 'Named',
 'TopBreed',
 'PitBull']

In [66]:
# Train/test split
data_dog = data[data['AnimalType'] == 'Dog']
data_cat = data[data['AnimalType'] == 'Cat']
X_dog = data_dog[predictors['dog']]
X_cat = data_cat[predictors['cat']]
y_dog = data_dog[[target]]
y_cat = data_cat[[target]]
X_dog_train, X_dog_dev, y_dog_train, y_dog_dev = train_test_split(X_dog, y_dog, random_state=2)
X_cat_train, X_cat_dev, y_cat_train, y_cat_dev = train_test_split(X_cat, y_cat, random_state=2)

#ss = StandardScaler()
#ss.fit(X_train.loc[:,continuous])
#X_train.loc[:,continuous] = ss.transform(X_train.loc[:,continuous])
#X_dev.loc[:,continuous] = ss.transform(X_dev.loc[:,continuous])

#X_train[continuous].describe().T

In [67]:
class MyVectorizer(BaseEstimator, TransformerMixin):

    def __init__(self, cols, hashing=None):
        """
        args:
            cols: a list of column names of the categorical variables
            hashing: 
                If None, then vectorization is a simple one-hot-encoding.
                If an integer, then hashing is the number of features in the output.
        """
        self.cols = cols
        self.hashing = hashing
        
    def fit(self, X, y=None):

        data = X[self.cols]
        
        # Choose a vectorizer
        if self.hashing is None:
            self.myvec = DictVectorizer(sparse=False)
        else:
            self.myvec = FeatureHasher(n_features = self.hashing)
    
        self.myvec.fit(X[self.cols].to_dict(orient='records'))
        return self
            
    def transform(self, X):
            
        # Vectorize Input
        if self.hashing is None:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')),
                columns = self.myvec.feature_names_
            )
        else:
            return pd.DataFrame(
                self.myvec.transform(X[self.cols].to_dict(orient='records')).toarray()
            )

In [68]:
#Discrete test

discrete_pipe = Pipeline(steps=[('Vectorizer', MyVectorizer(cols=discrete, hashing=None))])

#### Continous Variables

In [69]:
class MyScaler():
    
    def __init__(self, cols):
        self.cols = cols
    
    def fit(self, X, y=None):
        self.ss = StandardScaler()
        self.ss.fit(X[self.cols])
        return self
    
    def transform(self, X):
        return self.ss.transform(X[self.cols])

In [70]:
#Continuous test
#ss = MyScaler(continuous)
#transformed_train = ss.fit(X_train)
#transformed_dev = ss.transform(X_dev)

In [71]:
#continuous_pipe = Pipeline(steps=[('Scale', MyScaler(continuous))])

#transformed_train = continuous_pipe.fit(X_train)
#transformed_dev = continuous_pipe.transform(X_dev)

#### Feature Union

In [72]:
discrete_pipe_dog = Pipeline(steps=[('Vectorizer', MyVectorizer(cols=discrete['dog'], hashing=None))])
discrete_pipe_cat = Pipeline(steps=[('Vectorizer', MyVectorizer(cols=discrete['cat'], hashing=None))])

continuous_pipe_cat = Pipeline(steps=[('Scale', MyScaler(continuous['cat']))])
continuous_pipe_dog = Pipeline(steps=[('Scale', MyScaler(continuous['dog']))])

union_dog = FeatureUnion([('Discrete', discrete_pipe_dog), ('Continuous', continuous_pipe_dog)])
union_cat = FeatureUnion([('Discrete', discrete_pipe_cat), ('Continuous', continuous_pipe_cat)])
transformed_train = union_dog.fit_transform(X_dog_train)
transformed_dev = union_dog.transform(X_dog_dev)

In [73]:
pipeline_dog = Pipeline([('Preprocess', FeatureUnion([('Discrete', discrete_pipe_dog), ('Continuous', continuous_pipe_dog)])),
                      ('Predict', LogisticRegression(multi_class='multinomial', solver='newton-cg'))])
pipeline_cat = Pipeline([('Preprocess', FeatureUnion([('Discrete', discrete_pipe_cat), ('Continuous', continuous_pipe_cat)])),
                      ('Predict', LogisticRegression(multi_class='multinomial', solver='newton-cg'))])

test_dog_lr = pipeline_dog.fit(X_dog_train, y_dog_train)
test_cat_lr = pipeline_cat.fit(X_cat_train, y_cat_train)

In [74]:
print "Multinomial Logistic Regression Train Accuracy :: ", metrics.accuracy_score(y_cat_train, test_cat_lr.predict(X_cat_train))
print "Multinomial Logistic Regression Dev Accuracy :: ", metrics.accuracy_score(y_cat_dev, test_cat_lr.predict(X_cat_dev))
print
print "Multinomial Logistic Regression Train Accuracy :: ", metrics.accuracy_score(y_dog_train, test_dog_lr.predict(X_dog_train))
print "Multinomial Logistic Regression Dev Accuracy :: ", metrics.accuracy_score(y_dog_dev, test_dog_lr.predict(X_dog_dev))
 

Multinomial Logistic Regression Train Accuracy ::  0.73748502994
Multinomial Logistic Regression Dev Accuracy ::  0.739583333333

Multinomial Logistic Regression Train Accuracy ::  0.562841997264
Multinomial Logistic Regression Dev Accuracy ::  0.578096947935


In [83]:
cfmatrix = confusion_matrix(y_cat_dev, test_cat_lr.predict(X_cat_dev), labels = ['Return_to_owner', 'Euthanasia', 'Adoption', 'Transfer', 'Died'])
print cfmatrix
print data.OutcomeType.unique()

[[   9    5  112   10    0]
 [   6   25   15  137    0]
 [  12    3  943   87    0]
 [  10    2  292 1082    0]
 [   0    0    4   30    0]]
['Return_to_owner' 'Euthanasia' 'Adoption' 'Transfer' 'Died']


In [84]:
cfmatrix = confusion_matrix(y_dog_dev, test_dog_lr.predict(X_dog_dev), labels = ['Return_to_owner', 'Euthanasia', 'Adoption', 'Transfer', 'Died'])
print cfmatrix
print data.OutcomeType.unique()

[[ 408    1  568  109    0]
 [  61   13   50   63    0]
 [ 167    1 1433   28    0]
 [ 123    8  457  400    0]
 [   1    0    2    6    0]]
['Return_to_owner' 'Euthanasia' 'Adoption' 'Transfer' 'Died']
