# Load data

In [1]:
import pandas as pd

# Name of the dataset folder under data/ that holds train.csv and test.csv.
problem = 'Animals'
train = pd.read_csv('data/{}/train.csv'.format(problem))
test = pd.read_csv('data/{}/test.csv'.format(problem))

# Preview the first rows of the training set.
train.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [2]:
# Preview the first rows of the test set loaded above.
test.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


# Preprocessing

### Generate new features from existing ones

__Add a binary feature *'HasName'*__ showing whether the animal has a name, and __remove *'Name'*__ from the features

In [3]:
def process_name(df):
    """Replace the 'Name' column with a binary 'HasName' indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Name' column; a missing name is a null value.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Name' and with an int64 'HasName' column
        (1 when a name is present, 0 when it is missing). The frame passed
        in is not modified.
    """
    # Build the indicator with assign() rather than writing into `df`, so the
    # caller's frame does not silently gain a column as a side effect.
    has_name = df['Name'].notnull().astype('int64')
    return df.assign(HasName=has_name).drop('Name', axis=1)

__Transform *'AgeuponOutcome'*__ so that it shows the animal's age __in days__

In [4]:
def process_age(df):
    """Convert 'AgeuponOutcome' from text such as '2 years' into days.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'AgeuponOutcome' column holding '<count> <unit>'
        strings, where unit is day(s), week(s), month(s) or year(s).

    Returns
    -------
    pandas.DataFrame
        A new frame whose 'AgeuponOutcome' column holds the age in days.
        Missing or blank ages become 0. The frame passed in is not modified.

    Raises
    ------
    KeyError
        If an age uses a unit that is not in the conversion table.
    """
    # Approximate conversion factors: a month counts as 30 days, a year as 365.
    days_in_unit = {
        'day': 1,
        'days': 1,
        'week': 7,
        'weeks': 7,
        'month': 30,
        'months': 30,
        'year': 365,
        'years': 365,
    }

    def strage_to_days(age):
        # Null values arrive as non-strings; they and blank strings carry no
        # age information and are encoded as 0 days.
        if not isinstance(age, str) or not age.strip():
            return 0
        # split() with no argument ignores leading/trailing/repeated spaces;
        # lower() makes the unit lookup case-insensitive.
        parts = age.split()
        return int(parts[0]) * days_in_unit[parts[1].lower()]

    # assign() returns a new frame, so the caller's column is not overwritten.
    return df.assign(AgeuponOutcome=df['AgeuponOutcome'].map(strage_to_days))

__Add a *'Sterilized'* feature__ showing whether the animal was neutered or spayed

In [5]:
import re

def process_steril(df):
    """Add a binary 'Sterilized' column derived from 'SexuponOutcome'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'SexuponOutcome' column (e.g. 'Neutered Male',
        'Spayed Female', 'Intact Male', 'Unknown', or a null value).

    Returns
    -------
    pandas.DataFrame
        A new frame with an integer 'Sterilized' column: 1 when the value
        mentions 'neutered' or 'spayed' (case-insensitive), otherwise 0.
        Null values count as not sterilized. The frame passed in is not
        modified.
    """
    # Nulls are replaced first so the string accessor always sees text.
    sex = df['SexuponOutcome'].fillna('Unknown')
    is_sterilized = sex.str.contains('neutered|spayed', case=False, regex=True)
    # assign() returns a new frame instead of adding a column to the caller's.
    return df.assign(Sterilized=is_sterilized.astype(int))

In [6]:
# Display the training frame as loaded; the cells above only define helper functions.
train

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
7,A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White
9,A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White


In [7]:
from sklearn import preprocessing

def preprocess_df(df):
    """Run the feature-engineering steps on a raw train or test frame.

    Steps, in order: label-encode 'AnimalType', replace 'Name' with
    'HasName' (process_name), convert 'AgeuponOutcome' to days
    (process_age) and add 'Sterilized' (process_steril).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'AnimalType', 'Name', 'AgeuponOutcome' and
        'SexuponOutcome' columns.

    Returns
    -------
    pandas.DataFrame
        The processed frame. The 'AnimalType' column of the frame passed in
        is not overwritten; the encoded values go into a new frame.
    """
    # NOTE(review): a fresh LabelEncoder is fit on every call, so the integer
    # codes agree between train and test only if both frames contain the same
    # set of animal types -- confirm, or share one fitted encoder.
    encoder = preprocessing.LabelEncoder()
    encoded = df.assign(AnimalType=encoder.fit_transform(df['AnimalType']))

    return process_steril(process_age(process_name(encoded)))

In [8]:
# Give missing subtypes an explicit 'Unknown' label. Assigning the whole
# column (instead of chained indexing on train.OutcomeSubtype[...]) avoids
# SettingWithCopyWarning and still works when pandas Copy-on-Write is enabled.
train['OutcomeSubtype'] = train['OutcomeSubtype'].fillna('Unknown')

# One encoder per column, so each keeps its own classes_ for decoding later.
# `outcome_encoder` ends up fit on OutcomeType, as before.
subtype_encoder = preprocessing.LabelEncoder()
train['OutcomeSubtype'] = subtype_encoder.fit_transform(train['OutcomeSubtype'])

outcome_encoder = preprocessing.LabelEncoder()
train['OutcomeType'] = outcome_encoder.fit_transform(train['OutcomeType'])

# Identifier columns are dropped before feature engineering.
train = train.drop('AnimalID', axis=1)
test = test.drop('ID', axis=1)

X_train = preprocess_df(train)
X_test = preprocess_df(test)

X_train

Unnamed: 0,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,Sterilized
0,2014-02-12 18:22:00,3,16,1,Neutered Male,365,Shetland Sheepdog Mix,Brown/White,1,1
1,2013-10-13 12:44:00,2,15,0,Spayed Female,365,Domestic Shorthair Mix,Cream Tabby,1,1
2,2015-01-31 12:28:00,0,6,1,Neutered Male,730,Pit Bull Mix,Blue/White,1,1
3,2014-07-11 19:09:00,4,12,0,Intact Male,21,Domestic Shorthair Mix,Blue Cream,0,0
4,2013-11-15 12:52:00,4,12,1,Neutered Male,730,Lhasa Apso/Miniature Poodle,Tan,0,1
5,2014-04-25 13:04:00,4,12,1,Intact Female,30,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,0
6,2015-03-28 13:11:00,4,12,0,Intact Male,21,Domestic Shorthair Mix,Blue Tabby,1,0
7,2015-04-30 17:02:00,4,12,0,Unknown,21,Domestic Shorthair Mix,Brown Tabby,0,0
8,2014-02-04 17:17:00,0,16,1,Spayed Female,150,American Pit Bull Terrier Mix,Red/White,1,1
9,2014-05-03 07:48:00,0,11,1,Spayed Female,365,Cairn Terrier,White,0,1


In [9]:
# Display the preprocessed test features returned by preprocess_df(test).
X_test

Unnamed: 0,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,Sterilized
0,2015-10-12 12:15:00,1,Intact Female,300,Labrador Retriever Mix,Red/White,1,0
1,2014-07-26 17:59:00,1,Spayed Female,730,German Shepherd/Siberian Husky,Black/Tan,1,1
2,2016-01-13 12:20:00,0,Neutered Male,365,Domestic Shorthair Mix,Brown Tabby,1,1
3,2013-12-28 18:12:00,1,Intact Male,120,Collie Smooth Mix,Tricolor,1,0
4,2015-09-24 17:59:00,1,Neutered Male,730,Miniature Poodle Mix,White,1,1
5,2015-06-23 11:17:00,1,Neutered Male,1095,Beagle Mix,Brown/White,1,1
6,2014-03-12 09:45:00,0,Neutered Male,4745,Domestic Medium Hair Mix,Brown Tabby/White,1,1
7,2014-06-25 08:27:00,0,Spayed Female,180,Domestic Shorthair Mix,Brown Tabby,1,1
8,2014-11-12 18:05:00,1,Spayed Female,90,Cairn Terrier,Black/Cream,1,1
9,2014-04-07 17:41:00,1,Spayed Female,365,Pit Bull Mix,Brown/White,1,1
