# IND6212 Project
---

In [1]:
# import libraries
import csv
import numpy as np

In [2]:
# if runned on colaboratory
GOOGLE = False
if GOOGLE:
    # Load the Drive helper and mount
    from google.colab import drive
    drive.mount('/content/drive/')
    path = "/content/drive/My Drive/EPM/PhD/IND6212/Projet/"
else:
    path = ""

In [3]:
# read intakes
with open('{}data/aac_intakes.csv'.format(path), 'r') as file:
    reader = csv.reader(file)
    header_intake = next(reader)
    intakes = [[value for value in row] for row in reader]

In [4]:
# read outcomes
with open('{}data/aac_outcomes.csv'.format(path), 'r') as file:
    reader = csv.reader(file)
    header_outcome = next(reader)
    outcomes = [[value for value in row] for row in reader]

In [5]:
# convert numpy array
header_intake = np.asarray(header_intake)
header_outcome = np.asarray(header_outcome)

intakes = np.asarray(intakes)
outcomes = np.asarray(outcomes)

In [6]:
print("Intakes")
for i, c in enumerate(header_intake):
    print("{} : {}".format(i, c))

Intakes
0 : age_upon_intake
1 : animal_id
2 : animal_type
3 : breed
4 : color
5 : datetime
6 : datetime2
7 : found_location
8 : intake_condition
9 : intake_type
10 : name
11 : sex_upon_intake


In [7]:
print("Outcomes")
for i, c in enumerate(header_outcome):
    print("{} : {}".format(i, c))

Outcomes
0 : age_upon_outcome
1 : animal_id
2 : animal_type
3 : breed
4 : color
5 : date_of_birth
6 : datetime
7 : monthyear
8 : name
9 : outcome_subtype
10 : outcome_type
11 : sex_upon_outcome


In [8]:
# get the last outcome (it can be transfered multiple times, we want the last update)
# if no outcome, then we remove the example (line)
counter = 0
data = []
for i, x in enumerate(intakes, 1):
    index = np.where(outcomes[:, 1] == x[1])
    if len(index[0]) > 0 and outcomes[index[0][-1], 10]:
        data.append(np.concatenate((x, [outcomes[index[0][-1], 10]])))
    else:
        counter += 1
    print("\r{} | {}".format(i - counter, counter), end=" ")

79776 | 411                                                                                                                

In [9]:
# add the outcome to data
data = np.asarray(data)
header = np.concatenate((header_intake, ["outcome"]))

The author of the dataset explained that Rto-Adopt = Return to Owner and Disposal = Died (see [link](https://www.kaggle.com/aaronschlegel/austin-animal-center-shelter-intakes-and-outcomes/discussion/56707#latest-329163))

In [10]:
for row in data:
    if row[-1] == "Rto-Adopt":
        row[-1]="Return to Owner"
    if row[-1] == "Disposal":
        row[-1]="Died"

In [11]:
# remove the animal id
data = np.delete(data, 1, axis=1)
header = np.delete(header, 1, axis=0)

In [12]:
# convert intake date into a real value (year)
for x in data:
    s = x[0].split()
    if "year" in s[1]:
        x[0] = float(s[0])
    if "month" in s[1]:
        x[0] = float(s[0]) / 12.
    if "week" in s[1]:
        x[0] = float(s[0]) / 52.

In [13]:
# convertir animal breed to integer
mapping_breed, breed_int = np.unique(data[:, 2], return_inverse=True)
data = np.delete(data, [1, 2], axis=1)
header = np.delete(header, [1, 2], axis=0)
data = np.concatenate((data, breed_int.reshape(-1, 1)), axis=1)
header = np.concatenate((header, ["animal_breed"]))

In [14]:
# convert color to integer
mapping_color, color_int = np.unique(data[:, 1], return_inverse=True)
data = np.delete(data, 1, axis=1)
header = np.delete(header, 1, axis=0)
data = np.concatenate((data, color_int.reshape(-1, 1)), axis=1)
header = np.concatenate((header, ["color"]))

In [15]:
# remove location
data = np.delete(data, 3, axis=1)
header = np.delete(header, 3, axis=0)

In [16]:
# check if datetime and datetime2 are identical
if np.array_equal(data[:, 1], data[:, 2]):
    print("Datetime duplicated")
else:
    print("Datetime not duplicated")

Datetime duplicated


In [17]:
# remove datetime2
data = np.delete(data, 2, axis=1)
header = np.delete(header, 2, axis=0)

In [18]:
# convert condition to integer
mapping_condition, condition_int = np.unique(data[:, 2], return_inverse=True)
data = np.delete(data, 2, axis=1)
header = np.delete(header, 2, axis=0)
data = np.concatenate((data, condition_int.reshape(-1,1)), axis=1)
header = np.concatenate((header, ["condition"]))

In [19]:
# convert type to integer
mapping_type, type_int = np.unique(data[:, 2], return_inverse=True)
data = np.delete(data, 2, axis=1)
header = np.delete(header, 2, axis=0)
data = np.concatenate((data, type_int.reshape(-1,1)), axis=1)
header = np.concatenate((header, ["type"]))

In [20]:
# check if name as any impact
out = list(set(data[:,4]))
out_name = [0 for _ in out]
out_noname = [0 for _ in out]
name, noname = 0, 0

for d in data:
    if d[2]:
        name += 1
        out_name[out.index(d[4])] += 1
    else:
        noname += 1
        out_noname[out.index(d[4])] += 1
out_noname = [o/noname for o in out_noname]
out_name = [o/name for o in out_name]

print("{:20s} | {:^15s} | {:^15s}".format("Outcome", "with name", "without name"))
print("-"*56)
for a, b, c in zip(out, out_name, out_noname):
    print("{:20s} | {:^15.2%} | {:^15.2%}".format(a, b, c))

Outcome              |    with name    |  without name  
--------------------------------------------------------
Return to Owner      |     25.85%      |      1.91%     
Adoption             |     50.43%      |     23.37%     
Transfer             |     20.55%      |     51.64%     
Euthanasia           |      2.66%      |     19.92%     
Died                 |      0.45%      |      3.05%     
Missing              |      0.06%      |      0.04%     
Relocate             |      0.00%      |      0.06%     


In [21]:
# the presence of a name seems to have an impact on the outcome
# its value should not have any effect (difficult to evaluate)
# replace absence of a name by 0 and presence of a name by 1
for row in data:
    row[2] = 1 if row[2] else 0

In [22]:
# sex analysis
sex, count = np.unique(data[:, 3], return_counts=True)
for s, c in zip(sex, count):
    print("{:20s} : {:6d} ({:6.1%})".format(s, c, c / data.shape[0]))

Intact Female        :  23739 ( 29.8%)
Intact Male          :  25355 ( 31.8%)
NULL                 :      1 (  0.0%)
Neutered Male        :  12728 ( 16.0%)
Spayed Female        :  11168 ( 14.0%)
Unknown              :   6785 (  8.5%)


In [23]:
# check if name as any impact
out = list(set(data[:, 4]))
sex = list(set(data[:, 3]))
table = [[0 for _ in out] for _ in sex]
for d in data:
    table[sex.index(d[3])][out.index(d[4])] += 1

print(" " * 14, end="")
for o in out:
    print("{:^14s}".format(o), end="")
print("")
for i, s in zip(table, sex):
    print("{:^14s}".format(s), end="")
    for j in i:
        print("{:^14d}".format(j), end="")
    print("")

              Return to Owner   Adoption      Transfer     Euthanasia       Died        Missing       Relocate   
Neutered Male      4958          4699          2429          597            42            3             0       
Spayed Female      3950          4690          2099          377            48            4             0       
   Unknown          84            70           2747          3392          476            2             14      
 Intact Male       3523         12178          8282          1099          255            18            0       
Intact Female      2246         11972          8447          882           172            19            1       
     NULL           1             0             0             0             0             0             0       


In [24]:
# we cannot remove unknown because it seems to have an impact on the outcome (extremly low adoption rate)
# there is only one NULL, which is suspect, we'll remove this
id = np.where(data[:,3] == "NULL")[0][0]
data = np.delete(data, id, axis=0)

In [25]:
# convert sex to integer
mapping_sex, sex_int = np.unique(data[:, 3], return_inverse=True)
data = np.delete(data, 3, axis=1)
header = np.delete(header, 3, axis=0)
data = np.concatenate((data, sex_int.reshape(-1,1)), axis=1)
header = np.concatenate((header, ["sex"]))

In [26]:
for i, row in enumerate(data):
    for h, value in zip(header, row):
        print("{:20s} : {}".format(h, value))
    print("-----------------------")
    if i > 2:
        break

age_upon_intake      : 8.0
datetime             : 2015-07-05T12:59:00.000
name                 : 1
outcome              : Return to Owner
animal_breed         : 1007
color                : 498
condition            : 3
type                 : 3
sex                  : 3
-----------------------
age_upon_intake      : 0.9166666666666666
datetime             : 2016-04-14T18:43:00.000
name                 : 1
outcome              : Return to Owner
animal_breed         : 231
color                : 382
condition            : 3
type                 : 3
sex                  : 1
-----------------------
age_upon_intake      : 0.07692307692307693
datetime             : 2013-10-21T07:59:00.000
name                 : 0
outcome              : Transfer
animal_breed         : 952
color                : 194
condition            : 7
type                 : 3
sex                  : 0
-----------------------
age_upon_intake      : 4.0
datetime             : 2014-06-29T10:38:00.000
name                 : 1
out