In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# The three competition files are read with identical options:
# gzip-compressed, comma-separated, double-quoted, header on the first row
_csv_options = dict(compression='gzip', header=0, sep=',', quotechar='"')
dsample = pd.read_csv('/kaggle/input/shelter-animal-outcomes/sample_submission.csv.gz', **_csv_options)
dtest = pd.read_csv('/kaggle/input/shelter-animal-outcomes/test.csv.gz', **_csv_options)
dtrain = pd.read_csv('/kaggle/input/shelter-animal-outcomes/train.csv.gz', **_csv_options)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.impute import SimpleImputer
import re

In [None]:
# Explore the dataset: show the training frame as this cell's output
dtrain

We have 11 columns initially, with OutcomeType as our target. We will explore the remaining 10 features and decide which of them to use for the analysis. Below is the list of column names and their respective descriptions.

AgeuponOutcome: Age when outcome happened
AnimalID: Animal ID number
AnimalType: cat or dog
Breed: Breed of the animal
Color: Color of the animal
DateTime: Date and time of the outcome.
ID: id of data
Name: Name given to animal
OutcomeSubtype: more details about outcome
OutcomeType: Outcome (5 targets)
SexuponOutcome: Sex of the animal at the time of the outcome

In [None]:
# Summary statistics of the training frame
dtrain.describe()

In [None]:
# Column names, non-null counts and dtypes of the training frame
dtrain.info()

In [None]:
# Flag, per column, whether the training set contains at least one missing value
dtrain.isna().any()

Check the number of null values in the columns that contain them. We will omit 'Name', as it is not likely to affect the outcome and it will be dropped later in our analysis.

In [None]:
# Number of missing values in OutcomeSubtype
dtrain['OutcomeSubtype'].isna().sum()

The OutcomeSubtype column will be dropped later, as there are too many null values for an accurate analysis.

In [None]:
# Number of missing values in SexuponOutcome
dtrain['SexuponOutcome'].isna().sum()

In [None]:
# Number of missing values in AgeuponOutcome
dtrain['AgeuponOutcome'].isna().sum()

In [None]:
# Fill the gaps in "SexuponOutcome" and "AgeuponOutcome" with each column's most
# frequent value: only a few entries are missing, and the mode is the likeliest guess.

cols_for_na = ["SexuponOutcome", "AgeuponOutcome"]
imputer_na = SimpleImputer(strategy="most_frequent")

# Fit on the two columns, then write the completed values back in place
imputed_values = imputer_na.fit_transform(dtrain.loc[:, cols_for_na])
dtrain.loc[:, cols_for_na] = imputed_values


In [None]:
# Confirm that the imputed columns no longer contain missing values

dtrain[cols_for_na].isna().sum()

In [None]:
# Frequency of each distinct value, starting with the target column

dtrain.OutcomeType.value_counts()

In [None]:
# Frequency of each distinct breed
dtrain.Breed.value_counts()

In [None]:
# Frequency of each distinct colour
dtrain.Color.value_counts()

In [None]:
# Convert Age in 'AgeuponOutcome' to number of years

def AgetoYears(input):
    """Convert an age string such as ``"2 years"`` or ``"3 weeks"`` to years.

    Parameters
    ----------
    input : str
        Text of the form ``"<integer> <unit>"``. A unit starting with
        ``year``, ``month`` or ``week`` is recognised; any other unit is
        counted as days.

    Returns
    -------
    float
        The age in years, or ``nan`` when ``input`` is not a string (for
        example a missing value) or lacks either the number or the unit.

    Raises
    ------
    ValueError
        If the first token is not an integer.
    """
    # Missing values and blank strings cannot be parsed; report NaN instead of crashing
    if not isinstance(input, str):
        return float('nan')
    key = input.split()
    if len(key) < 2:
        return float('nan')

    # How many of each unit make up one year, keyed by the unit's prefix
    units_per_year = (('year', 1), ('month', 12), ('week', 52))
    years = 365  # fallback: every other unit is treated as days
    for prefix, per_year in units_per_year:
        if key[1].startswith(prefix):
            years = per_year
            break

    return int(key[0])/years


In [None]:
# Replace the age strings with numeric ages in years.
# Not idempotent: AgetoYears expects the original strings, so run this cell once per fresh load.
dtrain["AgeuponOutcome"] = dtrain["AgeuponOutcome"].apply(AgetoYears)

In [None]:
# Compare how many dogs and how many cats appear in the shelter data
sns.countplot(data=dtrain, x="AnimalType")

In [None]:
# Outcomes split by animal type (left) and animal types split by outcome (right)
fig, (left_ax, right_ax) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(x='AnimalType', hue='OutcomeType', data=dtrain, ax=left_ax)
sns.countplot(x='OutcomeType', hue='AnimalType', data=dtrain, ax=right_ax)

From the above output, it seems that more dogs are adopted and returned to their owners, while cats are more likely to be transferred or euthanised. We also see that more dogs are put into adoption homes, but we cannot draw a conclusion at this point: there may simply be significantly more dog owners, resulting in more dogs in shelters. At this point, we still cannot tell whether dogs are more desirable than cats.

In [None]:
# Upon inspection, let's assume that the age of the animals have the greatest effect to the outcome apart from
# the other features. We will see if that is the case from the results afterwards.

# Let's define a function to explore the age group and the outcome effects it has.

def age_group(age):
    """Bucket an age in years: below 3 is 'young', below 6 is 'adult', anything else is 'old'."""
    if age < 3:
        return 'young'
    if age < 6:
        return 'adult'
    return 'old'

# Label every animal with its age bucket (expects AgeuponOutcome to already be numeric years)
dtrain['AgeGroup'] = dtrain['AgeuponOutcome'].apply(age_group)

In [None]:
# Outcomes split by age group (left) and age groups split by outcome (right)

fig, (left_ax, right_ax) = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
sns.countplot(x='AgeGroup', hue='OutcomeType', data=dtrain, ax=left_ax)
sns.countplot(x='OutcomeType', hue='AgeGroup', data=dtrain, ax=right_ax)

As expected, young animals are more likely to be adopted than animals in the other age groups.

In [None]:
# Breed and Color are the two columns with the most distinct values. Count how often
# each value occurs to judge whether they need further processing before being turned
# into dummy indicator variables.

dtrain_unique_breed, dtrain_unique_color = (
    Counter(dtrain[column]) for column in ('Breed', 'Color')
)

print(f"Number of unique breeds in given train set: {len(dtrain_unique_breed)}")
print(f"Number of unique colours in given train set: {len(dtrain_unique_color)}")

In [None]:
# Show the breed -> occurrence-count mapping (a collections.Counter) to get a feel
# for how many breeds there are and how often each one occurs in the training data

dtrain_unique_breed

In [None]:
# Show the colour -> occurrence-count mapping (a collections.Counter) to get a feel
# for how many colours there are and how often each one occurs in the training data

dtrain_unique_color

For the models we use, the deciding features have to be converted into categorical indicator variables, which yields a very high number of columns; we will likely have to deal with this when predicting results for the given test set.

Given the significantly high number of breeds and colours, especially the exotic ones, it is virtually impossible to generalize them into a small enough set of groups without compromising accuracy.

One approach for 'Breed' is to segregate them into simply pure or mixed breed. In this case, we end up with just two columns of 'Pure' or 'Mix' after doing some manipulation in the column values. Likewise, we will do the same for 'Color' column as well.

Our approach here is to delete as many as possible of the breeds and colours with the LEAST number of occurrences in the dataframe, as they do not contribute much to an accurate analysis. The remaining animals in the dataframe will generate an equal number of columns when they are converted into dummy/indicator variables.

In [None]:
# Count the distinct breeds and colours in the given test set; these determine how many
# dummy/indicator columns the test frame produces at prediction time.

dtest_unique_breed, dtest_unique_color = (
    Counter(dtest[column]) for column in ('Breed', 'Color')
)

print(f"Number of unique breeds in given test set: {len(dtest_unique_breed)}")
print(f"Number of unique colours in given test set: {len(dtest_unique_color)}")

In [None]:
# Select the rarest breeds of the training set so their rows can be dropped afterwards.
# On paper 1380 - 913 = 467 breeds would have to go to match the test set's breed count;
# 464 is the value that was settled on empirically.
N_BREEDS_TO_DROP = 464

# List of the least frequent breeds, rarest first.
# sorted() is stable, so breeds with the same count keep the Counter's insertion order,
# which is exactly the order a count-by-count scan (1, 2, 3, ...) would visit them in.
# Slicing simply returns fewer items when the Counter holds fewer than
# N_BREEDS_TO_DROP breeds, so this cannot loop forever.
lesser_breed = sorted(dtrain_unique_breed, key=dtrain_unique_breed.get)[:N_BREEDS_TO_DROP]

# Number of breeds selected
count = len(lesser_breed)

print(f"Count is {count}")

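By calculation, the loop above should select 1380 - 913 = 467 breeds, but with that value the resulting training set contained only 911 unique breeds. I could not find the cause, so the threshold was changed to 464 by trial and error, which yields 913 unique breeds and matches the number of breeds in the given test set. (A possible explanation, not verified here: dropping the rows with rare colours afterwards also removes the last remaining rows of a few breeds.)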

In [None]:
# Select the rarest colours of the training set so their rows can be dropped afterwards.
# 366 - 277 = 89 colours are removed to match the number of colours in the given test set.
N_COLORS_TO_DROP = 89

# List of the least frequent colours, rarest first.
# sorted() is stable, so colours with the same count keep the Counter's insertion order,
# which is exactly the order a count-by-count scan (1, 2, 3, ...) would visit them in.
# Slicing simply returns fewer items when the Counter holds fewer than
# N_COLORS_TO_DROP colours, so this cannot loop forever.
lesser_color = sorted(dtrain_unique_color, key=dtrain_unique_color.get)[:N_COLORS_TO_DROP]

# Number of colours selected
count = len(lesser_color)

print(f"Count is {count}")

In [None]:
# Drop every animal whose breed is listed in `lesser_breed` (the rarest breeds).
# One boolean mask keeps the same rows, order and index labels as dropping breed by breed,
# without scanning and copying the whole frame once per breed.

dtrain = dtrain[~dtrain.Breed.isin(lesser_breed)]

dtrain

In [None]:
# Drop every animal whose colour is listed in `lesser_color` (the rarest colours).
# One boolean mask keeps the same rows, order and index labels as dropping colour by colour,
# without scanning and copying the whole frame once per colour.

dtrain = dtrain[~dtrain.Color.isin(lesser_color)]

dtrain

In [None]:
# Target vector: the outcome label of every row still present in dtrain.
# Taken before the index is reset, so it keeps the pre-reset index labels.
y = dtrain['OutcomeType']

In [None]:
# Renumber the rows from 0; the previous labels are kept in a new "index" column

dtrain = dtrain.reset_index(drop=False)
dtrain

In [None]:
# Classify a Breed or Color entry as mixed or pure.
def find_mix(x):
    """Return 'Mix' when ``x`` contains 'Mix' or a '/' separator, otherwise 'Pure'."""
    is_mixed = 'Mix' in x or '/' in x
    return 'Mix' if is_mixed else 'Pure'


In [None]:
# Collapse the Breed and Color columns to the two labels 'Pure' and 'Mix'
for column in ("Breed", "Color"):
    dtrain[column] = dtrain[column].apply(find_mix)


In [None]:
# Remove the columns that are not used as model features:
# - 'index' is the old row label added when the index was reset.
# - 'AnimalID' is an identifier.
# - 'OutcomeType' is the target and is already stored in `y`.
# - 'OutcomeSubtype' has too many null values to be suitable for analysis.
# - 'Name' is assumed not to affect the outcome.
# - 'DateTime' records when the outcome happened and is assumed not to affect it.
# - 'AgeGroup' is superseded by the numeric AgeuponOutcome column.
unused_columns = ["index", "AnimalID", "OutcomeType", "OutcomeSubtype", "Name", "DateTime", "AgeGroup"]
dtrain = dtrain.drop(columns=unused_columns)
dtrain

In [None]:
# One-hot encode the categorical columns of the training frame

categorical_columns = ['AnimalType', 'SexuponOutcome', 'Breed', 'Color']
dtrain = pd.get_dummies(dtrain, columns=categorical_columns)
dtrain

In [None]:
# Split the prepared data: 70% for fitting, 30% held out for evaluation

from sklearn.model_selection import train_test_split

# A fixed seed makes the split, and every score computed from it, reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(dtrain, y, test_size = 0.3, random_state = 42)

In [None]:
# Training set overview: feature rows of the fitting split
X_train

In [None]:
# y training set overview: target labels of the fitting split

y_train

In [None]:
# Test set overview: feature rows of the held-out split
X_test

In [None]:
# y testing set: target labels of the held-out split

y_test

Check matching number of rows and columns for train and test sets 

In [None]:
# Both splits must report the same number of feature columns
print(f"Number of Rows, Features in Training Dataset: {X_train.shape}")
print(f"Number of Rows, Features in Test Dataset: {X_test.shape}")

In [None]:
# Row counts of the targets; each should equal the row count of its feature split
print(f"Number of Rows in Training Response: {y_train.shape}")
print(f"Number of Rows in Test Response: {y_test.shape}")

In [None]:
# Importing testing models

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [None]:
# Use RandomForestClassifier as testing model

# Fit on the training split only. The held-out split must never be used for fitting:
# a model fitted on X_test and scored on X_test only reports another training score.
rfc_train = RandomForestClassifier(n_estimators=100)
rfc_train.fit(X_train , y_train)

# The same fitted model predicts both splits; the test score measures generalisation
y_pred_train = rfc_train.predict(X_train)
y_pred_test = rfc_train.predict(X_test)

print(f"RandomForestClassifier Accuracy train Score: {accuracy_score(y_train, y_pred_train)}")
print(f"RandomForestClassifier Accuracy test Score: {accuracy_score(y_test, y_pred_test)}")

In [None]:
# Use Logistic Regression for testing model

# fit() returns the estimator itself, so construction and fitting can be chained
lr = LogisticRegression().fit(X_train, y_train)

# Class labels in the order the model uses them, then the accuracy on the training split
print(lr.classes_)
print(f"LogisticRegression Accuracy train Score: {lr.score(X_train, y_train)}")

# Accuracy on the held-out split: share of predictions that equal the true label
test_predict = lr.predict(X_test)
print(f"LogisticRegression Accuracy test Score: {np.mean(test_predict == y_test)}")


In [None]:
# Use DecisionTreeClassifier as testing model

# Fit on the training split only. The held-out split must never be used for fitting:
# a model fitted on X_test and scored on X_test only reports another training score.
dtc_train = DecisionTreeClassifier()
dtc_train.fit(X_train, y_train)

# The same fitted model predicts both splits; the test score measures generalisation
y_pred_train = dtc_train.predict(X_train)
y_pred_test = dtc_train.predict(X_test)

print(f"DecisionTreeClassifier Accuracy train Score: {accuracy_score(y_train, y_pred_train)}")
print(f"DecisionTreeClassifier Accuracy test Score: {accuracy_score(y_test, y_pred_test)}")

From the above, RandomForestClassifier and DecisionTreeClassifier produce similar accuracy scores, and both score much higher than LogisticRegression, which is expected. We will use DecisionTreeClassifier.

# Process given test set for yielding results.

In [None]:
# Check given testing set: show the competition test frame as this cell's output
dtest

In [None]:
# Keep the ID column aside; it is put back in front of the predictions for the submission
temp_ID = dtest.ID

In [None]:
# Show the stored test-set IDs
temp_ID

In [None]:
# Column names, non-null counts and dtypes of the test frame
dtest.info()

In [None]:
# Flag, per column, whether the test set contains at least one missing value
dtest.isna().any()

In [None]:
# Fill missing ages in the test set with the most frequent age learned from the TRAINING set.
# imputer_na was fitted on `cols_for_na`, and statistics_ holds one fill value per fitted column.
# Re-fitting it here would overwrite the fitted imputer and derive the fill value from test rows.
train_age_mode = imputer_na.statistics_[cols_for_na.index("AgeuponOutcome")]
dtest["AgeuponOutcome"] = dtest["AgeuponOutcome"].fillna(train_age_mode)

In [None]:
# Apply the same column transformations to the test set as to the training set

# Age strings -> numeric ages in years
dtest["AgeuponOutcome"] = dtest["AgeuponOutcome"].apply(AgetoYears)

# Breed and Color -> 'Pure' or 'Mix'
for column in ("Breed", "Color"):
    dtest[column] = dtest[column].apply(find_mix)

In [None]:
# Remove the columns that are not used as model features
unused_columns = ["ID", "Name", "DateTime"]
dtest = dtest.drop(columns=unused_columns)
dtest

In [None]:
# Create indicator variables on dataset.
dtest = pd.get_dummies(dtest, columns=['AnimalType', 'SexuponOutcome', 'Breed', 'Color'])
# Align with the feature columns the models were fitted on: same names, same order.
# A category that never occurs in the test set gets an all-zero column, and a column
# unknown to the model is discarded, so predict() always receives the expected layout.
dtest = dtest.reindex(columns=X_train.columns, fill_value=0)
dtest

In [None]:
# Get the prediction and convert it into categorical output.

predictions = dtc_train.predict(dtest)

# One indicator column per outcome class.
# - reindex on the model's classes so that a class which is never predicted still gets
#   an (all-zero) column instead of silently disappearing from the submission;
# - cast to int so that the CSV contains 0/1 even when get_dummies returns booleans.
predictions = (
    pd.get_dummies(predictions)
    .reindex(columns=dtc_train.classes_, fill_value=0)
    .astype(int)
)

predictions

In [None]:
# Count missing values per column of the prediction frame

predictions.isnull().sum()

In [None]:
# Compare with given sample results to hand in: show the sample submission frame
dsample

In [None]:
# Put the stored IDs back as the first column of the prediction frame.
# Modifies `predictions` in place; inserting a second 'ID' column raises ValueError,
# so run this cell only once.
predictions.insert(0, 'ID', temp_ID)

In [None]:
# Match final results with sample results above: show the frame that will be submitted
predictions

In [None]:
# Write the submission file to the current working directory, without the row index
predictions.to_csv('submission.csv', index = False )