In [1]:
import pandas as pd

In [2]:
animals = pd.read_csv("Austin_Animal_Center_Outcomes.csv")

Considering cleaning and then splitting into 3 dfs for Dogs, Cats, and Other (and modeling on each Animal Type)

***Animal ID***: likely will drop column (acts as an index)

***Name***: dropping column: over 25000 null while the highest actual name is repeated only 405 times

***DateTime***: time of outcome: object type, will convert to DateTime and likely split out and dummy date/month/year variables

***MonthYear***: dropping column (exactly the same as DateTime)

***Date of Birth***: converting to DateTime

***Outcome Type***: target, will drop 12 null values, will utilize to_categorical since multi-class classification

***Outcome Subtype***: approximately half null: will keep column for EDA purposes and to further examine Outcome Type but may not include in models

***Animal Type***: 5 types (mainly Dog and Cat) but also Bird, Livestock, and Other (Other contains 99 different species). Will likely just use Dog and Cat to model but will use all Animal Type for EDA

***Sex upon Outcome***: Neutered Male, Spayed Female, Intact Male, Intact Female, Unknown (mainly Animal Type Other)

***Age upon Outcome***: creating a new column subtracting Date of Birth and Time of Outcome to get a time in days (then will drop Age Outcome since this column contains weeks, months, years)

***Breed***: TBD

***Color***: TBD

In [3]:
animals.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A741715,*Pebbles,01/11/2017 06:17:00 PM,01/11/2017 06:17:00 PM,03/07/2016,Adoption,,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico
1,A658751,Benji,11/13/2016 01:38:00 PM,11/13/2016 01:38:00 PM,07/14/2011,Return to Owner,,Dog,Neutered Male,5 years,Border Terrier Mix,Tan
2,A721285,,02/24/2016 02:42:00 PM,02/24/2016 02:42:00 PM,02/24/2014,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray
3,A746650,Rose,04/07/2017 11:58:00 AM,04/07/2017 11:58:00 AM,04/06/2016,Return to Owner,,Dog,Intact Female,1 year,Labrador Retriever/Jack Russell Terrier,Yellow
4,A750122,Happy Camper,05/24/2017 06:36:00 PM,05/24/2017 06:36:00 PM,04/08/2017,Transfer,Partner,Dog,Intact Male,1 month,Labrador Retriever Mix,Black


In [4]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84246 entries, 0 to 84245
Data columns (total 12 columns):
Animal ID           84246 non-null object
Name                58329 non-null object
DateTime            84246 non-null object
MonthYear           84246 non-null object
Date of Birth       84246 non-null object
Outcome Type        84234 non-null object
Outcome Subtype     38552 non-null object
Animal Type         84246 non-null object
Sex upon Outcome    84243 non-null object
Age upon Outcome    84239 non-null object
Breed               84246 non-null object
Color               84246 non-null object
dtypes: object(12)
memory usage: 7.7+ MB


In [5]:
animals.isnull().sum()

Animal ID               0
Name                25917
DateTime                0
MonthYear               0
Date of Birth           0
Outcome Type           12
Outcome Subtype     45694
Animal Type             0
Sex upon Outcome        3
Age upon Outcome        7
Breed                   0
Color                   0
dtype: int64

***Animal ID***

In [6]:
# Column acts as an index: dropping column

In [7]:
animals.drop(columns=["Animal ID"], axis = 1, inplace=True)

***Name***

In [8]:
animals["Name"].isnull().sum()

25917

In [9]:
# Noticed many names contain an "*": replacing "*" values from Name to see if top name values increase

animals["Name"] = animals["Name"].str.replace("*", "")

In [10]:
animals["Name"].value_counts()[0:10]

Max         405
Bella       401
Charlie     315
Luna        312
Daisy       306
Rocky       277
Princess    261
Lucy        258
Buddy       238
Coco        219
Name: Name, dtype: int64

In [11]:
# 25917 null values, next highest name has only 405 entries: dropping column

In [12]:
animals.drop(columns=["Name"], axis = 1, inplace=True)

***DateTime***

In [13]:
# Converting to DateTime object

animals["DateTime"] = pd.to_datetime(animals["DateTime"]) # , format = "%d/%m/%Y")
# animals.info()

# Renaming:

animals = animals.rename(columns={"DateTime": "Time of Outcome"})

***MonthYear***

In [14]:
# Dropping column: exactly the same as DateTime

animals.drop(columns = "MonthYear", axis = 1, inplace = True)
animals.head()

Unnamed: 0,Time of Outcome,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,2017-01-11 18:17:00,03/07/2016,Adoption,,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico
1,2016-11-13 13:38:00,07/14/2011,Return to Owner,,Dog,Neutered Male,5 years,Border Terrier Mix,Tan
2,2016-02-24 14:42:00,02/24/2014,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray
3,2017-04-07 11:58:00,04/06/2016,Return to Owner,,Dog,Intact Female,1 year,Labrador Retriever/Jack Russell Terrier,Yellow
4,2017-05-24 18:36:00,04/08/2017,Transfer,Partner,Dog,Intact Male,1 month,Labrador Retriever Mix,Black


***Date of Birth***

In [15]:
# Converting from object to DateTime

animals["Date of Birth"] = pd.to_datetime(animals["Date of Birth"])

***Outcome Type***

In [16]:
# Outcome Type: target for modeling (what we are attempting to predict)

animals["Outcome Type"].unique()

array(['Adoption', 'Return to Owner', 'Euthanasia', 'Transfer',
       'Rto-Adopt', 'Died', 'Disposal', 'Missing', 'Relocate', nan],
      dtype=object)

In [17]:
animals["Outcome Type"].isnull().sum()

12

In [18]:
animals.shape

(84246, 9)

In [19]:
# 12 null values: dropping those rows

animals.dropna(subset = ["Outcome Type"], inplace = True)

In [20]:
animals.shape

(84234, 9)

In [21]:
animals.head()

Unnamed: 0,Time of Outcome,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,2017-01-11 18:17:00,2016-03-07,Adoption,,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico
1,2016-11-13 13:38:00,2011-07-14,Return to Owner,,Dog,Neutered Male,5 years,Border Terrier Mix,Tan
2,2016-02-24 14:42:00,2014-02-24,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray
3,2017-04-07 11:58:00,2016-04-06,Return to Owner,,Dog,Intact Female,1 year,Labrador Retriever/Jack Russell Terrier,Yellow
4,2017-05-24 18:36:00,2017-04-08,Transfer,Partner,Dog,Intact Male,1 month,Labrador Retriever Mix,Black


In [22]:
animals["Outcome Type"].value_counts()

Adoption           35694
Transfer           25205
Return to Owner    15494
Euthanasia          6507
Died                 740
Disposal             317
Rto-Adopt            210
Missing               50
Relocate              17
Name: Outcome Type, dtype: int64

***Outcome Subtype***

***Animal Type***

In [23]:
# Need to add "Bird" and "Livestock" to "Other" to simplify

animals["Animal Type"].unique()

array(['Cat', 'Dog', 'Other', 'Bird', 'Livestock'], dtype=object)

In [24]:
animals["Animal Type"].value_counts()

Dog          47905
Cat          31282
Other         4671
Bird           366
Livestock       10
Name: Animal Type, dtype: int64

In [25]:
animals.replace(to_replace = "Bird", value = "Other", inplace = True)
animals.replace(to_replace = "Livestock", value = "Other", inplace = True)

In [26]:
animals["Animal Type"].value_counts()

Dog      47905
Cat      31282
Other     5047
Name: Animal Type, dtype: int64

***Sex upon Outcome***

In [27]:
# One null value, many Unknown values

animals["Sex upon Outcome"].isnull().sum()

1

In [28]:
animals["Sex upon Outcome"].unique()

array(['Spayed Female', 'Neutered Male', 'Unknown', 'Intact Female',
       'Intact Male', nan], dtype=object)

In [29]:
# Dropping 1 null value

animals.dropna(subset = ["Sex upon Outcome"], inplace = True)

# animals["Sex upon Outcome"].isnull().sum()

In [30]:
animals["Sex upon Outcome"].value_counts()

Neutered Male    29836
Spayed Female    27001
Intact Male      10292
Intact Female     9897
Unknown           7207
Name: Sex upon Outcome, dtype: int64

In [31]:
# mask = (animals["Sex upon Outcome"] == "Unknown") & (animals["Animal Type"] == "Other")
# animals[mask]

***Age Upon Outcome***

In [32]:
# Creating a new column subtracting Date of Birth and Time of Outcome to get a time in days 
# (then will drop Age Outcome)

animals["Age Outcome"] = animals["Date of Birth"] - animals["Time of Outcome"]

animals.drop(columns = "Age upon Outcome", axis = 1, inplace = True)

In [33]:
animals.head()

Unnamed: 0,Time of Outcome,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Breed,Color,Age Outcome
0,2017-01-11 18:17:00,2016-03-07,Adoption,,Cat,Spayed Female,Domestic Shorthair Mix,Calico,-311 days +05:43:00
1,2016-11-13 13:38:00,2011-07-14,Return to Owner,,Dog,Neutered Male,Border Terrier Mix,Tan,-1950 days +10:22:00
2,2016-02-24 14:42:00,2014-02-24,Euthanasia,Suffering,Other,Unknown,Raccoon Mix,Black/Gray,-731 days +09:18:00
3,2017-04-07 11:58:00,2016-04-06,Return to Owner,,Dog,Intact Female,Labrador Retriever/Jack Russell Terrier,Yellow,-367 days +12:02:00
4,2017-05-24 18:36:00,2017-04-08,Transfer,Partner,Dog,Intact Male,Labrador Retriever Mix,Black,-47 days +05:24:00


***Breed***

In [34]:
animals["Breed"].nunique()

2223

In [35]:
animals["Breed"].value_counts();

***Color***

In [36]:
animals["Color"].nunique();

In [37]:
animals["Color"].unique();

In [38]:
# mask = animals["Animal Type"] == "Dog"

# animals[mask]["Color"].value_counts()