# Load data

In [1]:
import pandas as pd

# Name of the dataset sub-directory under data/.
problem = 'Animals'

# Read both splits from data/<problem>/ in one pass; the generator yields the
# training frame first and the test frame second.
train, test = (
    pd.read_csv('data/' + problem + split_file)
    for split_file in ('/train.csv', '/test.csv')
)

train.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [2]:
# Preview the first rows of the test frame to compare its columns with train.
test.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


# Preprocessing

### Generate new features from existing ones

__Add binary feature *'HasName'*__ showing whether the animal has a name

In [3]:
def process_name(df):
    """Add an integer 'HasName' column to ``df`` in place.

    The flag is 1 where 'Name' holds a value and 0 where it is null.
    Nothing is returned; ``df`` itself is modified.
    """
    name_present = df['Name'].notnull()
    df['HasName'] = name_present.astype('int64')

__Transform *'AgeuponOutcome'*__ so that it shows the age of the animal __in days__

In [4]:
def process_age(df):
    """Convert 'AgeuponOutcome' in ``df`` from text to an age in days, in place.

    Text values have the form '<count> <unit>' (for example '3 weeks'); the
    count is multiplied by the number of days in the unit. A month is
    approximated as 30 days and a year as 365 days.

    Values that are already numeric are kept as they are, so running this
    function again on a converted frame (for instance when a notebook cell is
    re-executed) does not wipe the column. Any other non-text value becomes NaN.

    Raises:
        KeyError: if the unit is not one of day(s)/week(s)/month(s)/year(s).
        ValueError: if the count is not an integer literal.
    """
    import numbers  # local import keeps this cell self-contained

    days_in_unit = {
        'day': 1,
        'days': 1,
        'week': 7,
        'weeks': 7,
        'month': 30,
        'months': 30,
        'years': 365,
        'year': 365
    }

    def strage_to_days(age):
        if isinstance(age, str):
            # split() without arguments tolerates repeated or surrounding spaces.
            parts = age.split()
            return int(parts[0]) * days_in_unit[parts[1]]
        if isinstance(age, numbers.Real):
            # Already converted (or NaN): pass through unchanged.
            return float(age)
        return float('nan')

    df['AgeuponOutcome'] = df['AgeuponOutcome'].map(strage_to_days)

__Add *'Sterilized'* feature__ showing whether the animal was neutered or spayed

In [5]:
# Show the rows whose sex is missing outright, then count the non-null values
# per column among rows whose sex is recorded as the literal string 'Unknown'.
display(train.loc[train['SexuponOutcome'].isnull()])
train.loc[train['SexuponOutcome'] == 'Unknown'].count()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
3174,A667395,Diego,2013-11-27 16:11:00,Return_to_owner,,Dog,,7 years,Dachshund,Brown Merle


AnimalID          1093
Name                37
DateTime          1093
OutcomeType       1093
OutcomeSubtype    1076
AnimalType        1093
SexuponOutcome    1093
AgeuponOutcome    1089
Breed             1093
Color             1093
dtype: int64

In [6]:
import re

def process_sex(df):
    """Derive 'Sterilized' and integer-encode 'SexuponOutcome' in place.

    'Sterilized' is 1 when the original text mentions "neutered" or "spayed"
    (case-insensitive) and 0 otherwise; missing values count as 0.
    'SexuponOutcome' is then overwritten with 0 for male, 1 for female and
    2 for anything else (including missing values).
    """
    def sex_code(value):
        # Anything that is not text (e.g. NaN) is treated as unknown.
        if not isinstance(value, str):
            return 2
        if 'Male' in value:
            return 0
        return 1 if 'Female' in value else 2

    # 'Sterilized' must be computed first: it reads the original text that
    # the final assignment replaces.
    sex_text = df.SexuponOutcome.fillna('Unknown')
    mentions_sterilization = sex_text.str.match(
        '.*(neutered|spayed).*', flags=re.IGNORECASE)
    df['Sterilized'] = mentions_sterilization.astype(int)

    encoded_sex = df['SexuponOutcome'].map(sex_code)
    df['SexuponOutcome'] = encoded_sex.astype('int')

In [7]:
def process_breed(df):
    """Add a binary 'Mix' column and drop the trailing ' Mix' marker from 'Breed'.

    Both changes are made to ``df`` in place. 'Mix' is 1 when the breed text
    contains a '/' (a cross of two breeds) or the word 'Mix', and 0 otherwise;
    a missing breed is flagged 0. 'Breed' keeps its text except for a literal
    ' Mix' suffix, which is removed.
    """
    breed = df['Breed']
    df['Mix'] = breed.str.contains('/|Mix', regex=True, na=False).astype('int')
    # Remove only a literal trailing ' Mix'. str.rstrip(' Mix') would instead
    # strip every trailing ' ', 'M', 'i' or 'x' character, corrupting names
    # such as 'Basenji Mix' -> 'Basenj' or 'Manx' -> 'Man'.
    df['Breed'] = breed.str.replace(r'\s+Mix$', '', regex=True)

In [8]:
from sklearn import preprocessing

def preprocess_df(df):
    """Run every feature-engineering step on ``df`` in place.

    'AnimalType' is replaced by integer codes from a LabelEncoder fitted on
    this frame alone, then the name, age, sex and breed steps run in order.

    NOTE(review): a fresh encoder is fitted on every call, so the codes of
    two frames only agree when both contain the same set of 'AnimalType'
    values -- confirm this holds for train and test.
    """
    df['AnimalType'] = preprocessing.LabelEncoder().fit_transform(df['AnimalType'])
    for step in (process_name, process_age, process_sex, process_breed):
        step(df)

In [9]:
# Encode the target labels as integers; the fitted encoder stays available as
# `outcome_encoder`.
outcome_encoder = preprocessing.LabelEncoder()
train['OutcomeType'] = outcome_encoder.fit_transform(train['OutcomeType'])

# Apply the shared feature engineering to both splits (each is modified in place).
preprocess_df(train)
preprocess_df(test)

display(train.head(), test.head())

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,Sterilized,Mix
0,A671945,Hambone,2014-02-12 18:22:00,3,,1,0,365.0,Shetland Sheepdog,Brown/White,1,1,1
1,A656520,Emily,2013-10-13 12:44:00,2,Suffering,0,1,365.0,Domestic Shorthair,Cream Tabby,1,1,1
2,A686464,Pearce,2015-01-31 12:28:00,0,Foster,1,0,730.0,Pit Bull,Blue/White,1,1,1
3,A683430,,2014-07-11 19:09:00,4,Partner,0,0,21.0,Domestic Shorthair,Blue Cream,0,0,1
4,A667013,,2013-11-15 12:52:00,4,Partner,1,0,730.0,Lhasa Apso/Miniature Poodle,Tan,0,1,1


Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,Sterilized,Mix
0,1,Summer,2015-10-12 12:15:00,1,1,300.0,Labrador Retriever,Red/White,1,0,1
1,2,Cheyenne,2014-07-26 17:59:00,1,1,730.0,German Shepherd/Siberian Husky,Black/Tan,1,1,1
2,3,Gus,2016-01-13 12:20:00,0,0,365.0,Domestic Shorthair,Brown Tabby,1,1,1
3,4,Pongo,2013-12-28 18:12:00,1,0,120.0,Collie Smooth,Tricolor,1,0,1
4,5,Skooter,2015-09-24 17:59:00,1,0,730.0,Miniature Poodle,White,1,1,1


In [10]:
# Show the first three rows of the preprocessed test frame.
test.head(3)

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,Sterilized,Mix
0,1,Summer,2015-10-12 12:15:00,1,1,300.0,Labrador Retriever,Red/White,1,0,1
1,2,Cheyenne,2014-07-26 17:59:00,1,1,730.0,German Shepherd/Siberian Husky,Black/Tan,1,1,1
2,3,Gus,2016-01-13 12:20:00,0,0,365.0,Domestic Shorthair,Brown Tabby,1,1,1


In [11]:
# Per-column count of missing values, training frame first and test frame second.
display(train.isnull().sum(), test.isnull().sum())

AnimalID              0
Name               7691
DateTime              0
OutcomeType           0
OutcomeSubtype    13612
AnimalType            0
SexuponOutcome        0
AgeuponOutcome       18
Breed                 0
Color                 0
HasName               0
Sterilized            0
Mix                   0
dtype: int64

ID                   0
Name              3225
DateTime             0
AnimalType           0
SexuponOutcome       0
AgeuponOutcome       6
Breed                0
Color                0
HasName              0
Sterilized           0
Mix                  0
dtype: int64

In [12]:
def impute_age(src, dst):
    """Fill missing 'AgeuponOutcome' values in ``dst`` using ages from ``src``.

    Rows of ``dst`` with a missing age are grouped by
    ('Breed', 'Mix', 'SexuponOutcome'). For each group the ages of the
    matching rows in ``src`` give the fill value:

    * 30 or more known ages -> their median;
    * 1 to 29 known ages    -> their mean;
    * no known ages         -> the median of all known ages in ``src``
      (without this fallback the group would be "imputed" with NaN).

    ``dst`` is modified in place and one summary line is printed per group.
    ``src`` and ``dst`` may be the same frame. Nothing is returned.
    """
    group_keys = ['Breed', 'Mix', 'SexuponOutcome']
    rows_missing_age = dst.loc[dst['AgeuponOutcome'].isnull()]

    for (breed, mix, sex), _ in rows_missing_age.groupby(group_keys):
        reference_ages = src.loc[
            (src['Breed'] == breed) &
            (src['Mix'] == mix) &
            (src['SexuponOutcome'] == sex),
            'AgeuponOutcome']
        cnt = reference_ages.count()  # non-null ages only

        if cnt == 0:
            # No comparable animals at all: fall back to the overall median.
            med = src['AgeuponOutcome'].median()
        elif cnt < 30:
            med = reference_ages.mean()
        else:
            med = reference_ages.median()

        dst.loc[dst['AgeuponOutcome'].isnull() &
                (dst['Breed'] == breed) &
                (dst['Mix'] == mix) &
                (dst['SexuponOutcome'] == sex),
                'AgeuponOutcome'] = med

        # Sex code 2 is the "unknown" category.
        print('Imputed age %s from %s samples of %s %s %s' % (
              med, cnt,
              breed,
              'Mix' if mix else '',
              'Male' if sex == 0 else ('Female' if sex == 1 else 'Unknown')))

In [13]:
# Fill missing ages in the training frame, using the training frame itself as reference.
impute_age(train, train)

Imputed age 365.0 from 231 samples of Domestic Longhair Mix Male
Imputed age 90.0 from 3927 samples of Domestic Shorthair Mix Male
Imputed age 120.0 from 3967 samples of Domestic Shorthair Mix Female
Imputed age 21.0 from 900 samples of Domestic Shorthair Mix Bisexual
Imputed age 1467.82608696 from 23 samples of Toy Poodle Mix Male


In [14]:
# Fill missing ages in the test frame, using ages from the training frame as reference.
impute_age(train, test)

Imputed age 365.0 from 232 samples of Domestic Longhair Mix Male
Imputed age 90.0 from 3931 samples of Domestic Shorthair Mix Male
Imputed age 120.0 from 3975 samples of Domestic Shorthair Mix Female
Imputed age 21.0 from 904 samples of Domestic Shorthair Mix Bisexual


In [22]:
# Count training rows per distinct 'Breed' value, most frequent first.
train.Breed.value_counts()

Domestic Shorthair                       8953
Pit Bull                                 1972
Chihuahua Shorthair                      1851
Labrador Retriever                       1432
Domestic Medium Hair                      881
German Shepherd                           652
Domestic Longhair                         543
Siamese                                   413
Australian Cattle Dog                     392
Dachshund                                 364
Boxer                                     275
Miniature Poodle                          254
Border Collie                             246
Australian Shepherd                       177
Rat Terrier                               170
Siberian Husky                            168
Yorkshire Terrier                         167
Catahoula                                 163
Jack Russell Terrier                      162
Miniature Schnauzer                       158
Shih Tzu                                  153
Rottweiler                        

### Drop useless features

In [16]:
# Show the first rows of the training frame with all current columns.
train.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,Sterilized,Mix
0,A671945,Hambone,2014-02-12 18:22:00,3,,1,0,365.0,Shetland Sheepdog,Brown/White,1,1,1
1,A656520,Emily,2013-10-13 12:44:00,2,Suffering,0,1,365.0,Domestic Shorthair,Cream Tabby,1,1,1
2,A686464,Pearce,2015-01-31 12:28:00,0,Foster,1,0,730.0,Pit Bull,Blue/White,1,1,1
3,A683430,,2014-07-11 19:09:00,4,Partner,0,0,21.0,Domestic Shorthair,Blue Cream,0,0,1
4,A667013,,2013-11-15 12:52:00,4,Partner,1,0,730.0,Lhasa Apso/Miniature Poodle,Tan,0,1,1


In [15]:
#train.drop(['Name', 'DateTime', 'OutcomeSubtype', 'AnimalID'], axis=1, inplace=True)
#test.drop(['Name', 'ID'], axis=1, inplace=True)
#train.head(2)

# Visualization