# Preprocessing and feature engineering

In [23]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

DATAFOLDER = '../data/'

In [2]:
df = pd.read_csv(DATAFOLDER + 'train.csv', parse_dates=['DateTime'])
df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
# Divide training set in input and output
# X = input, y = output to predict
col = df.columns.tolist()
col.remove('OutcomeType')
col.remove('OutcomeSubtype')
X = df[col]
y = df['OutcomeType']

## Dataset analysis

In [4]:
print('Repartition of the classes')
print('--------------------------')
for outcome, rate in y.value_counts(normalize=True).items():
    print('{:15} {:>7.2f} %'.format(outcome, 100 * rate))

Repartition of the classes
--------------------------
Adoption          40.29 %
Transfer          35.25 %
Return_to_owner   17.91 %
Euthanasia         5.82 %
Died               0.74 %


In [5]:
print('Rate of unknown values by column')
print('--------------------------------')
for c in df.columns:
    if df[df[c].isnull()].shape[0] == 0:
        continue
    print('{:15} {:>7.3f}%'.format(c, 100 * df[df[c].isnull()].shape[0] / df.shape[0]))

Rate of unknown values by column
--------------------------------
Name             28.774%
OutcomeSubtype   50.926%
SexuponOutcome    0.004%
AgeuponOutcome    0.067%


In [6]:
# Missing values for the Sex
df[df['SexuponOutcome'].isnull()]

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
3174,A667395,Diego,2013-11-27 16:11:00,Return_to_owner,,Dog,,7 years,Dachshund,Brown Merle


There is only one missing value for the sex. As the outcomeSubtype is also missing, it might be simpler to drop this input.

In [7]:
print('Number of different values by columns')
print('-------------------------------------')
for c in df.columns:
    print('{:15} {:>7}'.format(c, df[c].value_counts().shape[0]))

Number of different values by columns
-------------------------------------
AnimalID          26729
Name               6374
DateTime          22918
OutcomeType           5
OutcomeSubtype       16
AnimalType            2
SexuponOutcome        5
AgeuponOutcome       44
Breed              1380
Color               366


In [8]:
print('Repartition Dog/Cat')
print('-------------------')
for k, v in df['AnimalType'].value_counts().items():
    print('{:5}{:>5}  {:.3f}%'.format(k, v, 100 * v / df.shape[0]))

Repartition Dog/Cat
-------------------
Dog  15595  58.345%
Cat  11134  41.655%


## First remarks

Names: some unknown values

To change:
- DateTime $\rightarrow$ from string to year, month, day, hour, weekday, holiday
- SexuponOutcome $\rightarrow$ make multiple categories
- AgeuponOutcome $\rightarrow$ from string to integer (in number of weeks)
- Breed $\rightarrow$ reduce the number of breed by using categories
- Color $\rightarrow$ change to a smaller number of categories

Ideas:
- Names<br>
    - Try to see name origin

### Datetime

In [9]:
def process_date(X):
    # Add new columns
    X['Hour']    = X['DateTime'].dt.hour
    X['Day']     = X['DateTime'].dt.day
    X['Weekday'] = X['DateTime'].dt.weekday
    X['Month']   = X['DateTime'].dt.month
    X['Year']    = X['DateTime'].dt.year
    # Delete the DateTime column
    X = X.drop(labels=['DateTime'], axis=1)

### Age

In [10]:
def process_age(X):
    def age_in_weeks(age):
        """Change from age in weeks and years to age in week only."""
        if isinstance(age, float):
            return -100000
        nb, period = age.split()
        return 52 * int(nb) if period.startswith('year') else int(nb)

    X.loc[:, 'AgeuponOutcome'] = X['AgeuponOutcome'].apply(age_in_weeks)

### Sex

In [11]:
def process_sex(X):
    # Dummify all possible values
    # spayed and neutered are considered the same here
    Unknown = np.zeros(X.shape[0])
    Male = np.zeros(X.shape[0])
    Female = np.zeros(X.shape[0])
    Neutered = np.zeros(X.shape[0])
    NotNeutered = np.zeros(X.shape[0])

    for i in range(X.shape[0]):
        sex = X.loc[i, 'SexuponOutcome']
        if isinstance(sex, float):
            continue
        if 'Unknown' in sex:
            Unknown[i] = 1
            continue
        if 'Neutered' in sex or 'Spayed' in sex:
            Neutered[i] = 1
        else:
            NotNeutered[i] = 1
        if 'Male' in sex:
            Male[i] = 1
        else:
            Female[i] = 1

    X['Unknown'] = Unknown
    X['Male'] = Male
    X['Female'] = Female
    X['Neutered'] = Neutered
    X['NotNeutered'] = NotNeutered
    X = X.drop(labels=['SexuponOutcome'], axis=1)

### Type

In [12]:
def process_type(X):
    # Dummify the type of animal
    def animal_type_dummy(animal_type):
        return 0 if animal_type == 'Dog' else 1

    X.loc[:, 'AnimalType'] = X['AnimalType'].apply(animal_type_dummy)

### Color

Let's dummify by grouping according to the base colors and patterns ([More details](http://messybeast.com/colour-charts.htm)). I grouped them in nine colors and five patterns.

In [13]:
colors = ['White', 'Black', 'Brown', 'Blue', 'Orange', 'Red', 'Tricolor', 'Cream', 'Gray']
corresponding_colors = {
    'Chocolate': 'Brown' ,
    'Smoke'    : 'Black' ,
    'Lilac'    : 'Gray'  ,
    'Fawn'     : 'Cream' ,
    'Sable'    : 'Cream' ,
    'Yellow'   : 'Orange',
    'Lynx'     : 'Gray'  ,
    'Flame'    : 'Orange',
    'Apricot'  : 'Orange',
    'Pink'     : 'Orange',
    'Silver'   : 'Gray'  ,
    'Gold'     : 'Orange',
    'Liver'    : 'Gray'  ,
    'Ruddy'    : 'Orange'
}
patterns = ['Tabby', 'Point', 'Tortie', 'Tan', 'Merle']
corresponding_patterns = {
    'Calico' : 'Tortie',
    'Torbie' : 'Tabby' ,
    'Brindle': 'Tortie',
    'Buff'   : 'Tabby' ,
    'Seal'   : 'Point' ,
    'Tick'   : 'Merle' ,
    'Tiger'  : 'Tabby' ,
    'Agouti' : 'Tabby'
}

In [14]:
def process_color(X, colors, corresponding_colors,
                     patterns, corresponding_patterns):
    df_color = pd.DataFrame(np.zeros((X.shape[0], len(colors))),
                            columns=colors)
    df_pattern = pd.DataFrame(np.zeros((X.shape[0], len(patterns))),
                              columns=patterns)
    for i, color in enumerate(X['Color']):
        for c in colors:
            if c in color:
                df_color.loc[i, c] = 1
        for c in corresponding_colors.keys():
            if c in color:
                df_color.loc[i, corresponding_colors[c]] = 1
        for p in patterns:
            if p in color:
                df_pattern.loc[i, p] = 1
        for p in corresponding_patterns.keys():
            if p in color:
                df_pattern.loc[i, corresponding_patterns[p]] = 1

### Breed

I use the [FCI Nomenclature](http://www.fci.be/en/Nomenclature/) to group the breed of dogs in ten groups.
I use the 

In [15]:
groups = ['Sheepdogs_Cattledogs', 'Pinscher_Schnauzer', 'Terriers', 
          'Dachshunds', 'Spitz', 'Scent', 'Pointing', 'Retrievers',
          'Companion', 'Sighthounds']
corresponding_groups = {
    'Cattle'    : 'Sheepdogs_Cattledogs',
    'Kelpie'    : 'Sheepdogs_Cattledogs',
    'Sheepdog'  : 'Sheepdogs_Cattledogs',
    'Shepherd'  : 'Sheepdogs_Cattledogs',
    'Dachshund' : 'Sheepdogs_Cattledogs',
    'Collie'    : 'Sheepdogs_Cattledogs',
    'Bull '     : 'Terriers'            ,
    'Terrier'   : 'Terriers'            ,
    'Staff'     : 'Terriers'            ,
    'Retriever' : 'Retrievers'          ,
    'Rottweiler': 'Pinscher_Schnauzer'  ,
    'Boxer'     : 'Pinscher_Schnauzer'  ,
    'Bulldog'   : 'Pinscher_Schnauzer'  ,
    'Pinsch'    : 'Pinscher_Schnauzer'  ,
    'Point'     : 'Pointing'            ,
    'Chihuahua' : 'Companion'           ,
    'Poodle'    : 'Companion'           ,
    'Husky'     : 'Spitz'               ,
    'Beagle'    : 'Scent'
}
sizes = ['Big', 'Small', 'Medium']
hair_lengths = ['Short', 'Medium', 'Long']

In [16]:
df_groups
df_sizes
df_hair_lengths
df_mix

NameError: name 'df_groups' is not defined

In [22]:
breeds = pd.Series(X[X['AnimalType'] == 'Dog']['Breed'].value_counts().keys())
breeds.to_csv('../data/', index=False)

TypeError: to_csv() missing 1 required positional argument: 'path'

In [None]:
i = 0
for k, v in X[X['AnimalType'] == 'Dog']['Breed'].value_counts().items():
    if not any(b in k for b in groups + list(corresponding_groups.keys())):
        print('{:45}{:>4}'.format(k, v))
        i += 1
print(i)

For the cats, I use the size (small, medium, big), the hair length (short, medium, long) and whether they are a mix or not to classify them in smaller categories.

In [None]:
X[X['AnimalType'] == 'Cat']['Breed'].value_counts()

In [None]:
n_mixes = df[df['Breed'].str.contains('Mix|/')].shape[0]
n = df.shape[0]
print('Number of mixed animals: {} ({:4.2f}%)'.format(n_mixes, 100 * n_mixes / n))
print('Total number of animals:', n)