# IND6212 Project
---

# Preprocessing

In [None]:
# import libraries
import csv
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from MulticoreTSNE import MulticoreTSNE as TSNE

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

from keras import metrics
from keras import callbacks
from keras.layers import BatchNormalization
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import Input
from keras.layers import Dense
from keras.models import Model

In [None]:
# Switch for running on Google Colaboratory: when enabled, Google Drive is
# mounted and the data is read from the project folder stored there.
# Otherwise paths are relative to the current working directory.
GOOGLE = False
path = ""
if GOOGLE:
    # Load the Drive helper and mount
    from google.colab import drive

    drive.mount("/content/drive/")
    path = "/content/drive/My Drive/EPM/PhD/IND6212/Projet/"

In [None]:
# Read the intake records: the first row is the header, the rest is the data.
# The csv module asks for files to be opened with newline="" so that line
# breaks embedded inside quoted fields are parsed correctly.
with open("{}data/aac_intakes.csv".format(path), "r", newline="") as file:
    reader = csv.reader(file)
    header_intake = next(reader)
    # csv.reader already yields every row as a list of strings
    intakes = list(reader)

In [None]:
# Read the outcome records: the first row is the header, the rest is the data.
# The csv module asks for files to be opened with newline="" so that line
# breaks embedded inside quoted fields are parsed correctly.
with open("{}data/aac_outcomes.csv".format(path), "r", newline="") as file:
    reader = csv.reader(file)
    header_outcome = next(reader)
    # csv.reader already yields every row as a list of strings
    outcomes = list(reader)

In [None]:
# Turn the headers and the record lists into numpy arrays so that columns can
# be sliced, compared and deleted in the following cells.
header_intake, header_outcome = map(np.asarray,
                                    (header_intake, header_outcome))
intakes, outcomes = map(np.asarray, (intakes, outcomes))

In [None]:
# List the intake columns with their position.
print("Intakes")
for entry in enumerate(header_intake):
    print("{} : {}".format(*entry))

In [None]:
# List the outcome columns with their position.
print("Outcomes")
for entry in enumerate(header_outcome):
    print("{} : {}".format(*entry))

In [None]:
# Attach to every intake the last recorded outcome of the same animal (an
# animal can be transferred multiple times, we want the last update).
# Intakes without any non-empty outcome are dropped.

# Index the outcomes once by the key in column 1: later rows overwrite
# earlier ones, so each key ends up mapped to the value of column 10 of its
# last row. This replaces a scan of the whole outcome table per intake.
last_outcome = {}
for row in outcomes:
    last_outcome[row[1]] = row[10]

counter = 0  # number of intakes dropped so far
data = []
for i, x in enumerate(intakes, 1):
    outcome = last_outcome.get(x[1])
    if outcome:
        data.append(np.concatenate((x, [outcome])))
    else:
        counter += 1
    # progress: kept | dropped
    print("\r{} | {}".format(i - counter, counter), end=" ")

In [None]:
# The merged rows carry one extra trailing column: name it in the header and
# turn the list of rows into a 2-D array.
header = np.append(header_intake, ["outcome"])
data = np.asarray(data)

The author of the dataset explained that the outcome `Rto-Adopt` means "Return to Owner" and that `Disposal` means "Died" (see [link](https://www.kaggle.com/aaronschlegel/austin-animal-center-shelter-intakes-and-outcomes/discussion/56707#latest-329163)), so both labels are merged into those classes below.

In [None]:
# Merge the two aliased outcome labels (last column) into their real class.
renamed = {"Rto-Adopt": "Return to Owner", "Disposal": "Died"}
for row in data:
    # labels that are not aliases are written back unchanged
    row[-1] = renamed.get(row[-1], row[-1])

In [None]:
# Plot the class distribution (last column of data) with seaborn and save it.
plt.figure(figsize=(10, 4))
sns.set(style="darkgrid")
name, count = np.unique(data[:, -1], return_counts=True)
# Pass the values through the explicit `x` keyword: recent seaborn releases
# no longer accept the plotted vector as first positional argument, while the
# keyword form works on old and new versions alike.
graph = sns.countplot(x=data[:, -1], order=name)
# write the exact count above each bar
for i, c in enumerate(count):
    graph.text(i, c + 500, c, color="black", ha="center")
plt.tight_layout()
plt.savefig("figures/class_distribution.png", dpi=300, transparent=True)
plt.show()
plt.close()

In [None]:
# Drop the animal id (column 1) from the data and from the header: it was
# only needed to match intakes with outcomes.
data, header = np.delete(data, 1, axis=1), np.delete(header, 1, axis=0)

In [None]:
# Convert column 0, a duration written as "<number> <unit>", into a real
# value expressed in years. The unit is matched by substring so that plural
# forms ("years", "months", ...) are handled too.
for x in data:
    tokens = x[0].split()
    for unit, per_year in (("year", 1.), ("month", 12.), ("week", 52.),
                           ("day", 365.)):
        if unit in tokens[1]:
            x[0] = float(tokens[0]) / per_year

In [None]:
# Convert the animal breed (column 2) to an integer code; mapping_breed[code]
# gives back the original string. Column 1 is dropped together with the raw
# breed column, and the encoded breed is appended as the last column.
mapping_breed, breed_int = np.unique(data[:, 2], return_inverse=True)
data, header = np.delete(data, [1, 2], axis=1), np.delete(header, [1, 2], axis=0)
data = np.hstack((data, breed_int[:, np.newaxis]))
header = np.append(header, ["animal_breed"])

In [None]:
# Show the sorted unique breed strings; the position of each entry is its integer code.
print(mapping_breed)

In [None]:
# Convert the color (column 1) to an integer code; mapping_color[code] gives
# back the original string. The raw column is replaced by an encoded column
# appended at the end.
mapping_color, color_int = np.unique(data[:, 1], return_inverse=True)
data, header = np.delete(data, 1, axis=1), np.delete(header, 1, axis=0)
data = np.hstack((data, color_int[:, np.newaxis]))
header = np.append(header, ["color"])

In [None]:
# Drop the location (column 3) from the data and from the header.
data, header = np.delete(data, 3, axis=1), np.delete(header, 3, axis=0)

In [None]:
# Compare the two datetime columns (1 and 2) to see whether one of them is
# redundant.
duplicated = np.array_equal(data[:, 1], data[:, 2])
print("Datetime duplicated" if duplicated else "Datetime not duplicated")

In [None]:
# Drop the second datetime column (column 2) from the data and the header.
data, header = np.delete(data, 2, axis=1), np.delete(header, 2, axis=0)

In [None]:
# Convert the condition (column 2) to an integer code; mapping_condition[code]
# gives back the original string. The raw column is replaced by an encoded
# column appended at the end.
mapping_condition, condition_int = np.unique(data[:, 2], return_inverse=True)
data, header = np.delete(data, 2, axis=1), np.delete(header, 2, axis=0)
data = np.hstack((data, condition_int[:, np.newaxis]))
header = np.append(header, ["condition"])

In [None]:
# Convert the type (column 2) to an integer code; mapping_type[code] gives
# back the original string. The raw column is replaced by an encoded column
# appended at the end.
mapping_type, type_int = np.unique(data[:, 2], return_inverse=True)
data, header = np.delete(data, 2, axis=1), np.delete(header, 2, axis=0)
data = np.hstack((data, type_int[:, np.newaxis]))
header = np.append(header, ["type"])

In [None]:
# Show the sorted unique type strings; the position of each entry is its integer code.
print(mapping_type)

In [None]:
# Check whether having a name (column 2 non-empty) has any impact on the
# outcome (column 4): print, for each outcome, its share among named and
# among unnamed animals.
out = list(set(data[:, 4]))
slot = {label: k for k, label in enumerate(out)}
out_name = [0] * len(out)
out_noname = [0] * len(out)
name = noname = 0

for d in data:
    k = slot[d[4]]
    if d[2]:
        name += 1
        out_name[k] += 1
    else:
        noname += 1
        out_noname[k] += 1

# turn the counts into proportions within each group
out_noname = [o / noname for o in out_noname]
out_name = [o / name for o in out_name]

row_format = "{:20s} | {:^15.2%} | {:^15.2%}"
print("{:20s} | {:^15s} | {:^15s}".format("Outcome", "with name",
                                          "without name"))
print("-" * 56)
for a, b, c in zip(out, out_name, out_noname):
    print(row_format.format(a, b, c))

In [None]:
# The presence of a name seems to have an impact on the outcome, whereas its
# actual value should not matter (and would be difficult to evaluate).
# Column 2 therefore becomes a flag: 1 when a name is present, 0 otherwise.
for row in data:
    row[2] = int(bool(row[2]))

In [None]:
# Sex analysis: count and proportion of every value found in column 3.
total = data.shape[0]
sex, count = np.unique(data[:, 3], return_counts=True)
for s, c in zip(sex, count):
    print("{:20s} : {:6d} ({:6.1%})".format(s, c, c / total))

In [None]:
# Check whether the sex (column 3) has any impact on the outcome (column 4):
# print a contingency table with one row per sex and one column per outcome.
out = list(set(data[:, 4]))
sex = list(set(data[:, 3]))
out_slot = {label: k for k, label in enumerate(out)}
sex_slot = {label: k for k, label in enumerate(sex)}

table = [[0] * len(out) for _ in sex]
for d in data:
    table[sex_slot[d[3]]][out_slot[d[4]]] += 1

# header line: blank corner followed by the outcome names
print(" " * 14, end="")
for o in out:
    print("{:^14s}".format(o), end="")
print("")
# one line per sex: its name followed by the counts
for s, counts in zip(sex, table):
    print("{:^14s}".format(s), end="")
    for j in counts:
        print("{:^14d}".format(j), end="")
    print("")

In [None]:
# We cannot remove the "Unknown" sex because it seems to have an impact on the
# outcome (extremely low adoption rate). A sex of "NULL", on the other hand,
# is suspect (a single occurrence was observed), so those rows are removed.
# A boolean mask is used instead of deleting the first match by index: it
# does not fail when no "NULL" row is present, removes every such row, and
# avoids storing the index in a variable named `id`, which would shadow the
# builtin for the rest of the notebook.
data = data[data[:, 3] != "NULL"]

In [None]:
# Convert the sex (column 3) to an integer code; mapping_sex[code] gives back
# the original string. The raw column is replaced by an encoded column
# appended at the end.
mapping_sex, sex_int = np.unique(data[:, 3], return_inverse=True)
data, header = np.delete(data, 3, axis=1), np.delete(header, 3, axis=0)
data = np.hstack((data, sex_int[:, np.newaxis]))
header = np.append(header, ["sex"])

In [None]:
# Keep only the month of the datetime (column 1): the value is split on "-"
# and its second field is stored as an integer.
# NOTE(review): presumably the datetime starts with "YYYY-MM-..." - confirm.
# (A leftover np.unique call used to sit here; it overwrote mapping_sex and
# sex_int with the values of column 3, which is no longer the sex column at
# this point, so it has been removed.)
for d in data:
    d[1] = int(d[1].split("-")[1])

In [None]:
# Separate the labels from the attributes: the outcome (column 3) is encoded
# as an integer class id (mapping_outcome[id] gives the name) and removed
# from both the attribute matrix x and the header.
mapping_outcome, labels = np.unique(data[:, 3], return_inverse=True)
x, header = np.delete(data, 3, axis=1), np.delete(header, 3, axis=0)

In [None]:
# Show the sorted unique outcome names; the position of each entry is its class id.
print(mapping_outcome)

In [None]:
# Every attribute is numeric by now: parse the string matrix into floats and
# make sure the class ids are plain integers.
x = x.astype(np.float32)
labels = labels.astype(int)

In [None]:
# Print the first four examples (attribute name : value, then the class id)
# as a sanity check of the preprocessing.
for row, label in zip(x[:4], labels[:4]):
    for h, value in zip(header, row):
        print("{:20s} : {}".format(h, value))
    print("\tOutcome: {}".format(label))
    print("-----------------------")

In [None]:
# Convert the outcomes to one-hot vectors: row k of the identity matrix is
# the encoding of class k.
n_outcomes = len(set(labels))
y = np.eye(n_outcomes)[labels]

In [None]:
# Summary of the problem size: number of input attributes and of classes.
n_attributes = x.shape[1]
n_distinct_outcomes = len(set(labels))
print("Nb attributes: {}".format(n_attributes))
print("Nb different outcomes: {}".format(n_distinct_outcomes))

# Visualization

In [None]:
# Compute a t-SNE embedding of the attributes for visual inspection.
# NOTE(review): the embedding is computed on x as-is; the standardized copy
# (x_scaled) is only created later in the notebook - confirm this is intended.
data_embedded = TSNE(n_jobs=-1).fit_transform(x)

In [None]:
# Scatter plot of the t-SNE embedding, one color per outcome class.
embedded_x, embedded_y = data_embedded[:, 0], data_embedded[:, 1]
plt.figure(figsize=(6, 4))
plt.scatter(embedded_x, embedded_y, s=2, c=labels)
plt.title("Visualization with t-SNE")
plt.tight_layout()
plt.savefig("figures/visualization_multiclasse.png", dpi=300, transparent=True)
plt.show()
plt.close()

In [None]:
# Scatter plot of the t-SNE embedding with two colors only: examples of
# class 0 (first one-hot component equal to 1) against all the others.
embedded_x, embedded_y = data_embedded[:, 0], data_embedded[:, 1]
is_first_class = (y[:, 0] == 1).astype(int)
plt.figure(figsize=(6, 4))
plt.scatter(embedded_x, embedded_y, s=2, c=is_first_class)
plt.title("Visualization with t-SNE")
plt.tight_layout()
plt.savefig("figures/visualization_binary.png", dpi=300, transparent=True)
plt.show()
plt.close()

In [None]:
# function to compute and plot the confusion matrix
def cfm(true, pred, mapping, name):
    """Compute the confusion matrix, draw it as a table and save the figure.

    Parameters
    ----------
    true, pred : numpy arrays
        Targets and predictions. A 1-D or single-column array is turned into
        booleans with ``value < 0.5``; any other array is reduced to class
        indices with ``argmax`` along axis 1.
    mapping : sequence of str
        Row and column labels of the table, in the order of the matrix.
    name : str
        Suffix of the output file ``figures/cfm_<name>.png``.
    """
    cond_true = (len(true.shape) == 1) or (true.shape[1] == 1)
    true = true < 0.5 if cond_true else np.argmax(true, axis=1)

    cond_pred = (len(pred.shape) == 1) or (pred.shape[1] == 1)
    pred = pred < 0.5 if cond_pred else np.argmax(pred, axis=1)

    # Fix the label set explicitly. Without it the matrix only covers the
    # values present in true/pred, so a class missing from both would make the
    # matrix smaller than `mapping` and break the table below.
    labels = [False, True] if cond_true else list(range(len(mapping)))
    matrix = confusion_matrix(np.ravel(true), np.ravel(pred), labels=labels)

    plt.figure()
    plt.gca().set_axis_off()
    plt.table(
        cellText=matrix,
        rowLabels=mapping,
        colLabels=mapping,
        loc="center")

    plt.tight_layout()
    plt.savefig("figures/cfm_{}.png".format(name), dpi=300, transparent=True)
    plt.show()
    plt.close()

In [None]:
# function to plot the learning graphs
def plot_history(history, name):
    """Plot the training/validation accuracy and loss curves and save them.

    Parameters
    ----------
    history : object with a ``history`` dict
        Training record as returned by ``model.fit``; it must contain the
        loss and accuracy series for the training and validation sets.
    name : str
        Suffix of the output file ``figures/history_<name>.png``.
    """
    logs = history.history
    # Older Keras releases log the accuracy as "acc"/"val_acc", newer ones as
    # "accuracy"/"val_accuracy": use whichever key is present.
    acc_key = "acc" if "acc" in logs else "accuracy"

    fig, axes = plt.subplots(1, 2, figsize=(8, 4))

    # summarize history for accuracy
    axes[0].plot(logs[acc_key])
    axes[0].plot(logs["val_" + acc_key])
    axes[0].set_ylabel("Accuracy")
    axes[0].set_xlabel("Epoch")
    axes[0].legend(["Train", "Validation"], title="Set")

    # summarize history for loss
    axes[1].plot(logs["loss"])
    axes[1].plot(logs["val_loss"])
    axes[1].set_ylabel("Loss")
    axes[1].set_xlabel("Epoch")
    axes[1].legend(["Train", "Validation"], title="Set")

    plt.tight_layout()
    plt.savefig("figures/history_{}.png".format(name), dpi=300, transparent=True)
    plt.show()
    plt.close()

In [None]:
# Standardize every attribute of x; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

In [None]:
# Split the dataset into training (70%), validation (10%) and test (20%)
# sets, keeping the class proportions in each of them.
# Step 1: hold out 20% of the examples as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, stratify=y, test_size=0.2)
# Step 2: take the validation set out of the remaining 80%
# (12.5% of 80% is the same as 10% of 100%).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, stratify=y_train, test_size=0.125)

In [None]:
# Shapes of the training, validation and test inputs (one per line).
print(x_train.shape, x_val.shape, x_test.shape, sep="\n")

In [None]:
# Multiclass classifier: batch-normalized input followed by three hidden
# blocks (Dense -> BatchNormalization -> Dropout -> ReLU) and a softmax
# output with one unit per outcome.
# The input and output sizes are read from the training arrays instead of
# being hard-coded, so the model keeps matching the data if the
# preprocessing changes the number of attributes or of classes.
n_features = x_train.shape[1]
n_classes = y_train.shape[1]

input = Input(shape=(n_features, ))
dense = BatchNormalization()(input)
for units in (256, 128, 64):
    dense = Dense(units)(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(0.5)(dense)
    dense = Activation("relu")(dense)
dense = Dense(n_classes)(dense)
output = Activation("softmax")(dense)

model = Model(inputs=input, outputs=output)

model.compile(
    loss="categorical_crossentropy", optimizer="Adam", metrics=["accuracy"])

model.summary()

In [None]:
# Train until the validation accuracy stops improving: the epoch budget is
# deliberately huge and early stopping (patience of 25 epochs) ends the run,
# restoring the weights of the best epoch.
# NOTE(review): "val_acc" is the metric name logged by older Keras releases;
# newer ones log "val_accuracy" - confirm against the installed version.
early_stopping = callbacks.EarlyStopping(
    monitor="val_acc",
    min_delta=0.0001,
    patience=25,
    restore_best_weights=True)

history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=9999,
    verbose=1,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping])

In [None]:
# Evaluate the trained model on the held-out test set.
score = model.evaluate(x_test, y_test, verbose=0)
for caption, measured in zip(("Test loss:", "Test accuracy:"), score):
    print(caption, measured)

In [None]:
# Confusion matrix of the multiclass model on the test set
# (saved as figures/cfm_discret_multiclasse.png).
cfm(y_test, model.predict(x_test), mapping_outcome, "discret_multiclasse")

In [None]:
# Learning curves of the multiclass model
# (saved as figures/history_discret_multiclasse.png).
plot_history(history, "discret_multiclasse")

In [None]:
# Not accurate enough, let's only check if the animal is adopted:
# the binary target is 1 when the first one-hot component is set, else 0.
binary_y = (y[:, 0] == 1).astype(int)
# Split the dataset into training (70%), validation (10%) and test (20%)
# sets, keeping the class proportions in each of them.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, binary_y, stratify=binary_y, test_size=0.2)
# 12.5% of the remaining 80% is the same as 10% of 100%
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, stratify=y_train, test_size=0.125)

In [None]:
# Binary classifier: batch-normalized input followed by four hidden blocks
# (Dense -> BatchNormalization -> Dropout -> ReLU) and a single sigmoid unit.
# The input size is read from the training array instead of being
# hard-coded, so the model keeps matching the data if the preprocessing
# changes the number of attributes.
n_features = x_train.shape[1]

input = Input(shape=(n_features, ))
dense = BatchNormalization()(input)
for units in (256, 128, 64, 32):
    dense = Dense(units)(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(0.5)(dense)
    dense = Activation("relu")(dense)
dense = Dense(1)(dense)
output = Activation("sigmoid")(dense)

model = Model(inputs=input, outputs=output)

model.compile(
    loss="binary_crossentropy", optimizer="Adam", metrics=["accuracy"])

model.summary()

In [None]:
# Class weights inversely proportional to the class frequencies: the most
# frequent class gets a weight of 1, the others proportionally more.
key, value = np.unique(y_train, return_counts=True)
class_weight = {k: value.max() / v for k, v in zip(key, value)}
print(class_weight)

In [None]:
# Train until the validation accuracy stops improving: the epoch budget is
# deliberately huge and early stopping (patience of 10 epochs) ends the run,
# restoring the weights of the best epoch.
# NOTE(review): "val_acc" is the metric name logged by older Keras releases;
# newer ones log "val_accuracy" - confirm against the installed version.
early_stopping = callbacks.EarlyStopping(
    monitor="val_acc",
    min_delta=0.001,
    patience=10,
    restore_best_weights=True)

history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=9999,
    verbose=1,
    # class_weight=class_weight,  (currently disabled)
    validation_data=(x_val, y_val),
    callbacks=[early_stopping])

In [None]:
# Evaluate the trained model on the held-out test set.
score = model.evaluate(x_test, y_test, verbose=0)
for caption, measured in zip(("Test loss:", "Test accuracy:"), score):
    print(caption, measured)

In [None]:
# Confusion matrix of the binary model on the test set
# (saved as figures/cfm_discret_binaire.png).
cfm(y_test, model.predict(x_test), ["Adoption", "Rejection"], "discret_binaire")

In [None]:
# Learning curves of the binary model
# (saved as figures/history_discret_binaire.png).
plot_history(history, "discret_binaire")

In [None]:
# Let's try with one-hot vector inputs for the categorical attributes
# (columns 3 to 7 of x). The first two columns are real valued, so their
# scaled version is used, and column 2 (name) is already binary.
def _one_hot(codes):
    """Return the one-hot encoding of a column of integer codes.

    The width is the largest code + 1 rather than the number of distinct
    codes: the codes were assigned before some rows were removed, so a code
    may no longer be present, and indexing an identity matrix sized by the
    distinct count would then go out of bounds.
    """
    codes = codes.astype("int")
    width = codes.max() + 1 if codes.size else 0
    return np.eye(width)[codes]


one_hot_animal_breed = _one_hot(x[:, 3])
one_hot_color = _one_hot(x[:, 4])
one_hot_condition = _one_hot(x[:, 5])
one_hot_type = _one_hot(x[:, 6])
one_hot_sex = _one_hot(x[:, 7])

x_one_hot = np.concatenate(
    (x_scaled[:, :2], x[:, 2].reshape(-1, 1), one_hot_animal_breed,
     one_hot_color, one_hot_condition, one_hot_type, one_hot_sex),
    axis=1)

In [None]:
# Split the one-hot dataset into training (70%), validation (10%) and test
# (20%) sets, keeping the class proportions in each of them.
# Step 1: hold out 20% of the examples as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_one_hot, binary_y, stratify=binary_y, test_size=0.2)
# Step 2: take the validation set out of the remaining 80%
# (12.5% of 80% is the same as 10% of 100%).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, stratify=y_train, test_size=0.125)

In [None]:
# Shapes of the training, validation and test inputs (one per line).
print(x_train.shape, x_val.shape, x_test.shape, sep="\n")

In [None]:
# Binary classifier on the one-hot inputs: batch-normalized input followed by
# three hidden blocks (Dense -> BatchNormalization -> Dropout -> ReLU) with a
# strong dropout, and a single sigmoid unit.
# The input size is read from the training array instead of being
# hard-coded: the width of the one-hot encoding depends on how many distinct
# categories the dataset contains.
n_features = x_train.shape[1]

input = Input(shape=(n_features, ))
dense = BatchNormalization()(input)
for units in (128, 64, 32):
    dense = Dense(units)(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(0.75)(dense)
    dense = Activation("relu")(dense)
dense = Dense(1)(dense)
output = Activation("sigmoid")(dense)

model = Model(inputs=input, outputs=output)

model.compile(
    loss="binary_crossentropy", optimizer="Adam", metrics=["accuracy"])
model.summary()

In [None]:
# Class weights inversely proportional to the class frequencies: the most
# frequent class gets a weight of 1, the others proportionally more.
key, value = np.unique(y_train, return_counts=True)
class_weight = {k: value.max() / v for k, v in zip(key, value)}
print(class_weight)

In [None]:
# Train until the validation accuracy stops improving: the epoch budget is
# deliberately huge and early stopping (patience of 10 epochs) ends the run,
# restoring the weights of the best epoch.
# NOTE(review): "val_acc" is the metric name logged by older Keras releases;
# newer ones log "val_accuracy" - confirm against the installed version.
early_stopping = callbacks.EarlyStopping(
    monitor="val_acc",
    min_delta=0.001,
    patience=10,
    restore_best_weights=True)

history = model.fit(
    x_train,
    y_train,
    batch_size=512,
    epochs=9999,
    verbose=1,
    # class_weight=class_weight,  (currently disabled)
    validation_data=(x_val, y_val),
    callbacks=[early_stopping])

In [None]:
# Evaluate the trained model on the held-out test set.
score = model.evaluate(x_test, y_test, verbose=0)
for caption, measured in zip(("Test loss:", "Test accuracy:"), score):
    print(caption, measured)

In [None]:
# Confusion matrix of the one-hot binary model on the test set
# (saved as figures/cfm_one_hot_binaire.png).
cfm(y_test, model.predict(x_test), ["Adoption", "Rejection"],
    "one_hot_binaire")

In [None]:
# Learning curves of the one-hot binary model
# (saved as figures/history_one_hot_binaire.png).
plot_history(history, "one_hot_binaire")