In [1]:
# imports
import pandas as pd
from datetime import *
import numpy as np
import math

In [2]:
# Load the raw outcomes export into a DataFrame.
raw_outcomes_path = '../sourceData/outcomes.csv'
outcomes_df = pd.read_csv(raw_outcomes_path)

In [3]:
# Preview the first five rows of the freshly loaded data.
outcomes_df.head(n=5)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,05/08/2019 06:20:00 PM,05/08/2019 06:20:00 PM,05/02/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,07/18/2018 04:02:00 PM,07/18/2018 04:02:00 PM,07/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,08/16/2020 11:38:00 AM,08/16/2020 11:38:00 AM,08/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,02/13/2016 05:59:00 PM,02/13/2016 05:59:00 PM,10/08/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,03/18/2014 11:47:00 AM,03/18/2014 11:47:00 AM,03/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


In [4]:
# Parse the 12-hour 'MM/DD/YYYY HH:MM:SS AM/PM' strings straight into datetime64 using an
# explicit format, instead of a per-row strptime -> strftime -> re-parse round trip.
# Flooring to the minute keeps the values identical to the earlier '%H:%M' re-formatting,
# which discarded the seconds.
outcomes_df['DateTime'] = pd.to_datetime(
    outcomes_df['DateTime'], format='%m/%d/%Y %I:%M:%S %p'
).dt.floor('min')
# Dates of birth are 'MM/DD/YYYY' in the preview above; stating the format avoids
# per-value inference and any day/month ambiguity.
outcomes_df['Date of Birth'] = pd.to_datetime(
    outcomes_df['Date of Birth'], format='%m/%d/%Y'
)
# 'MonthYear' holds the same values as 'DateTime' in the preview above, so drop it.
outcomes_df = outcomes_df.drop(columns=['MonthYear'])
outcomes_df.head()

Unnamed: 0,Animal ID,Name,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,2019-05-08 18:20:00,2017-05-02,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,2018-07-18 16:02:00,2017-07-12,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,2020-08-16 11:38:00,2019-08-16,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,2016-02-13 17:59:00,2015-10-08,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,2014-03-18 11:47:00,2014-03-12,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


In [5]:
# Inspect every column's dtype; 'DateTime' and 'Date of Birth' should now be
# datetime64 after the conversion in the previous cell.
outcomes_df.dtypes

Animal ID                   object
Name                        object
DateTime            datetime64[ns]
Date of Birth       datetime64[ns]
Outcome Type                object
Outcome Subtype             object
Animal Type                 object
Sex upon Outcome            object
Age upon Outcome            object
Breed                       object
Color                       object
dtype: object

In [6]:
# Count the missing values in each column.
outcomes_df.isna().sum()

Animal ID               0
Name                39025
DateTime                0
Date of Birth           0
Outcome Type           20
Outcome Subtype     67183
Animal Type             0
Sex upon Outcome        1
Age upon Outcome        5
Breed                   0
Color                   0
dtype: int64

In [7]:
# Replace missing values in 'Sex upon Outcome' and 'Name' with the placeholder 'Unknown',
# then print any rows that are still null in each column (both printouts should be empty).
for column in ('Sex upon Outcome', 'Name'):
    outcomes_df[column] = outcomes_df[column].fillna('Unknown')
for column in ('Sex upon Outcome', 'Name'):
    print(outcomes_df[outcomes_df[column].isnull()])

Empty DataFrame
Columns: [Animal ID, Name, DateTime, Date of Birth, Outcome Type, Outcome Subtype, Animal Type, Sex upon Outcome, Age upon Outcome, Breed, Color]
Index: []
Empty DataFrame
Columns: [Animal ID, Name, DateTime, Date of Birth, Outcome Type, Outcome Subtype, Animal Type, Sex upon Outcome, Age upon Outcome, Breed, Color]
Index: []


In [8]:
# Missing outcome subtypes get the explicit label 'Unspecified'; the trailing
# expression displays any rows that are still null (expected: none).
outcomes_df['Outcome Subtype'] = outcomes_df['Outcome Subtype'].fillna('Unspecified')
outcomes_df.loc[outcomes_df['Outcome Subtype'].isna()]

Unnamed: 0,Animal ID,Name,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color


In [9]:
# Discard rows that have no outcome type, then display any rows that are
# still null in that column (expected: none).
outcome_type_col = 'Outcome Type'
outcomes_df = outcomes_df.dropna(axis='index', subset=[outcome_type_col])
outcomes_df.loc[outcomes_df[outcome_type_col].isna()]

Unnamed: 0,Animal ID,Name,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color


In [10]:
# Age at outcome in whole years: the number of days between 'Date of Birth' and the
# outcome 'DateTime', floor-divided by 365 (leap days are ignored, so this is approximate).
# `.dt.days` reads the whole-day count directly from the timedelta instead of converting
# it to a string and parsing the leading number back out.
days_alive = (outcomes_df['DateTime'] - outcomes_df['Date of Birth']).dt.days
# astype(int) fails fast if either date is missing (NaT) rather than producing NaN ages.
# The result is kept as a plain list in row order so it can be indexed by row position.
subtractList = (days_alive // 365).astype(int).tolist()
subtractList[0:10]

[2, 1, 1, 0, 0, 7, 2, 0, 0, 2]

In [11]:
# Fill missing 'Age upon Outcome' values from the whole-year ages in subtractList.
# The column is addressed by name instead of by position (previously iloc[..., 8]), and
# only the rows that are actually null are touched rather than looping over every row.
age_is_missing = outcomes_df['Age upon Outcome'].isnull()
# subtractList is in row order, so attach the frame's index to line it up with the mask;
# this also raises if the list and the frame have different lengths.
years_by_row = pd.Series(subtractList, index=outcomes_df.index)
# Singular unit only for exactly one year, so zero is written as '0 years', not '0 year'.
age_labels = years_by_row[age_is_missing].map(
    lambda years: str(years) + ' ' + ('year' if years == 1 else 'years'))
# Echo each filled-in value, as before.
for label in age_labels:
    print(label)
outcomes_df.loc[age_is_missing, 'Age upon Outcome'] = age_labels

2 years
1 year
2 years
2 years
1 year


In [12]:
# Display any rows whose age is still missing (expected: none).
outcomes_df.loc[outcomes_df['Age upon Outcome'].isna()]

Unnamed: 0,Animal ID,Name,DateTime,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color


In [13]:
# Re-count missing values per column after cleaning; every count should now be zero.
outcomes_df.isna().sum()

Animal ID           0
Name                0
DateTime            0
Date of Birth       0
Outcome Type        0
Outcome Subtype     0
Animal Type         0
Sex upon Outcome    0
Age upon Outcome    0
Breed               0
Color               0
dtype: int64

In [14]:
# Rename columns by name rather than by position, so a change in column order or count
# upstream cannot silently attach the wrong label to a column.
column_renames = {
    'Animal ID': 'id',
    'Name': 'name_outcome',
    'DateTime': 'dateTime_outcome',
    'Date of Birth': 'dob',
    'Outcome Type': 'outcome_type',
    'Outcome Subtype': 'outcome_subtype',
    'Animal Type': 'animal_type',
    'Sex upon Outcome': 'sex_outcome',
    'Age upon Outcome': 'age_outcome',
    'Breed': 'breed',
    'Color': 'colour',
}
outcomes_df = outcomes_df.rename(columns=column_renames)
# rename() skips keys it cannot find, so check explicitly that every target name exists.
not_renamed = set(column_renames.values()) - set(outcomes_df.columns)
assert not not_renamed, 'columns missing after rename: ' + str(sorted(not_renamed))

In [15]:
# Preview the first five rows of the cleaned, renamed table.
outcomes_df.head(n=5)

Unnamed: 0,id,name_outcome,dateTime_outcome,dob,outcome_type,outcome_subtype,animal_type,sex_outcome,age_outcome,breed,colour
0,A794011,Chunk,2019-05-08 18:20:00,2017-05-02,Rto-Adopt,Unspecified,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,2018-07-18 16:02:00,2017-07-12,Adoption,Unspecified,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,Unknown,2020-08-16 11:38:00,2019-08-16,Euthanasia,Unspecified,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,2016-02-13 17:59:00,2015-10-08,Adoption,Unspecified,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,Unknown,2014-03-18 11:47:00,2014-03-12,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


In [16]:
# Write the cleaned outcomes table to the cleanedSources folder.
# to_csv is called with its defaults, so the row index is written as the first column.
cleaned_csv_path = '../cleanedSources/outcomesCleaned.csv'
outcomes_df.to_csv(cleaned_csv_path)