In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from utils import *

In [2]:
# Raw shelter export, read with pandas' default parsing (no dtype or date hints).
RAW_CSV = 'Animal_Shelter_Intake_and_Outcome_20240517.csv'
df = pd.read_csv(RAW_CSV)

In [3]:
# load_df is pulled in by the star import from utils; the log it prints is shown below.
# NOTE(review): df_clean is not referenced by the cells visible in this notebook —
# the exploration below works on the raw `df`. Confirm whether that is intended.
df_clean = load_df()

Date Of Birth is NOT A STRING
Intake Date is NOT A STRING
Outcome Date is NOT A STRING
Days in Shelter is NOT A STRING
Count is NOT A STRING
replace null values in Name with 'Unknown'
replace null values in Size with 'Unknown'
replace null values in Outcome_Type with 'Unknown'
replace null values in Outcome_Subtype with 'Unknown'
replace null values in Outcome_Condition with 'Unknown'
replace null values in Outcome_Jurisdiction with 'Unknown'
replace null values in Outcome_Zip_Code with 'Unknown'
replace null values in Location with 'Unknown'


In [None]:
# First ten rows of the intake/outcome date columns, days in shelter and intake type.
# These labels still contain spaces, so this cell has to run before the rename further down.
preview_cols = ['Intake Date', 'Outcome Date', 'Days in Shelter', 'Intake Type']
df[preview_cols].head(10)

In [None]:
# List the column labels of the raw frame.
df.columns

In [None]:
# Rows whose Animal ID already appeared in an earlier row
# (duplicated() leaves the first occurrence of each ID unmarked).
repeat_id = df['Animal ID'].duplicated()
df.loc[repeat_id]

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Column labels again, ahead of the rename below.
df.columns

In [None]:
# dtype pandas inferred for the Count column.
print(df['Count'].dtype)

## Let's remove spaces from the column names to make coding easier

In [None]:
# Swap every space in a column label for an underscore so the columns can be
# reached with attribute access (e.g. df.Days_in_Shelter) in the cells that follow.
col_list = [label.replace(' ', '_') for label in df.columns]
df.columns = col_list
df.columns

## What should we do with missing data?

In [None]:
# Count missing values per column in one pass, report only the columns that
# have any, then print the grand total.
null_counts = df.isna().sum()
null_counts = null_counts[null_counts != 0]
for col, null_count in null_counts.items():
    print(f'{col} has null rows: {null_count}')
total_null = int(null_counts.sum())
print(f'\nthere are {total_null} missing values')

## Days in shelter is right-skewed

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

In [None]:
# Histogram of the number of days each animal spent in the shelter.
df['Days_in_Shelter'].plot(kind='hist')

## Animal Gender

In [None]:
# Distinct values recorded in the Sex column.
df['Sex'].unique()

## Animal Breed

In [None]:
# Number of distinct breed labels overall and per animal type.
# nunique(dropna=False) counts a missing breed as one value, exactly like len(unique()).
breed = df['Breed']
print(f'there are {breed.nunique(dropna=False)} unique breeds')
cat_breed_count = breed[df['Type'] == 'CAT'].nunique(dropna=False)
print(f'cat breeds: {cat_breed_count}')
dog_breed_count = breed[df['Type'] == 'DOG'].nunique(dropna=False)
print(f'dog breeds: {dog_breed_count}')
Other_breed_count = breed[df['Type'] == 'OTHER'].nunique(dropna=False)
print(f'other breeds: {Other_breed_count}')

In [None]:
# How many records fall under each animal type, most common first.
df['Type'].value_counts()

In [None]:
# Bar chart of the eight most frequent cat breeds.
top_cat_breeds = df.loc[df['Type'] == 'CAT', 'Breed'].value_counts().head(8)
ax = top_cat_breeds.plot.bar()
ax.set_title('CAT Breeds')

In [None]:
# Bar chart of the fifteen most frequent dog breeds.
top_dog_breeds = df.loc[df['Type'] == 'DOG', 'Breed'].value_counts().head(15)
ax = top_dog_breeds.plot.bar()
ax.set_title('DOG Breeds')

## Animal Type

In [None]:
# Distinct animal types present in the data.
df['Type'].unique()

### What is the other type?

In [None]:
# Breed labels that appear on records typed as OTHER.
df.loc[df['Type'] == 'OTHER', 'Breed'].unique()

### I think we should ignore OTHER: all these different animal types add a lot of noise to the data

## Animal Size

In [None]:
# Bar chart of how many cats fall into each size category.
cat_sizes = df.loc[df['Type'] == 'CAT', 'Size'].value_counts()
ax = cat_sizes.plot.bar()
ax.set_title('CAT Sizes')

In [None]:
# Bar chart of how many dogs fall into each size category.
dog_sizes = df.loc[df['Type'] == 'DOG', 'Size'].value_counts()
ax = dog_sizes.plot.bar()
ax.set_title('DOG Sizes')

## What colors do we have?

In [None]:
# Number of distinct color labels overall and per animal type, followed by the
# full frequency table. nunique(dropna=False) treats a missing color as one
# value, the same as len(unique()).
color = df['Color']
print(f'there are {color.nunique(dropna=False)} unique colors')
# NOTE: despite its name, `breed_color` holds a color count; it is reassigned per type.
for label, animal_type in (('cat', 'CAT'), ('dog', 'DOG'), ('other', 'OTHER')):
    breed_color = color[df['Type'] == animal_type].nunique(dropna=False)
    print(f'{label} colors: {breed_color}')
print()
print(color.value_counts())

In [None]:
# Bar chart of the twenty most frequent dog colors.
top_dog_colors = df.loc[df['Type'] == 'DOG', 'Color'].value_counts().head(20)
ax = top_dog_colors.plot.bar()
ax.set_title('DOG colors')

In [None]:
# Bar chart of the twenty most frequent cat colors.
top_cat_colors = df.loc[df['Type'] == 'CAT', 'Color'].value_counts().head(20)
ax = top_cat_colors.plot.bar()
ax.set_title('cat colors')

## Intake Type is skewed

In [None]:
# Bar chart of record counts per intake type, most common first.
intake_type_counts = df['Intake_Type'].value_counts()
ax = intake_type_counts.plot.bar()
ax.set_title('intake type')

In [None]:
# Bar chart of record counts per intake subtype, most common first.
intake_subtype_counts = df['Intake_Subtype'].value_counts()
ax = intake_subtype_counts.plot.bar()
ax.set_title('intake subtype')

## Most animals are not euthanized, but there are still a lot of animals in this category

In [None]:
# Bar chart of record counts per outcome type, most common first.
outcome_type_counts = df['Outcome_Type'].value_counts()
ax = outcome_type_counts.plot.bar()
ax.set_title('Outcome type')

## Outcome subtype has many options. It could be interesting to use NLP here

In [None]:
# Number of distinct outcome subtypes actually recorded.
# nunique() skips NaN, whereas len(unique()) would count missing values as one
# extra "option" and overstate the number of subtypes by one.
df.Outcome_Subtype.nunique()

In [None]:
# Bar chart of the twenty most frequent outcome subtypes.
top_outcome_subtypes = df['Outcome_Subtype'].value_counts().head(20)
ax = top_outcome_subtypes.plot.bar()
ax.set_title('Outcome subtype')

## Most animals come in healthy. When would the condition be unknown?

In [None]:
# Bar chart of the (up to twenty) most frequent intake conditions.
top_intake_conditions = df['Intake_Condition'].value_counts().head(20)
ax = top_intake_conditions.plot.bar()
ax.set_title('Intake_Condition')

## It appears that going to a shelter usually improves health

What does "pending" mean?

In [None]:
# Bar chart of the (up to twenty) most frequent outcome conditions.
top_outcome_conditions = df['Outcome_Condition'].value_counts().head(20)
ax = top_outcome_conditions.plot.bar()
ax.set_title('Outcome_Condition')

In [None]:
# Show the column labels again (underscore style once the rename cell above has run).
df.columns


In [None]:
# Distinct jurisdictions recorded at intake.
df['Intake_Jurisdiction'].unique()

In [None]:
# Distinct jurisdictions recorded at outcome.
df['Outcome_Jurisdiction'].unique()

In [None]:
# Rows whose jurisdiction actually differs between intake and outcome.
# In pandas NaN != anything evaluates to True, so a plain `!=` would also flag
# every record that has no jurisdiction recorded on one side; require both
# values to be present before comparing them.
both_known = df.Outcome_Jurisdiction.notna() & df.Intake_Jurisdiction.notna()
df[both_known & (df.Outcome_Jurisdiction != df.Intake_Jurisdiction)]

## Thoughts
There are many categories for breed, color, and outcome subtype, perhaps too many. We may need to feature-engineer these columns to group similar values (for example, similar colors).