***Sam Cressman Capstone Project: Shelter Animal Outcomes***

***Help improve outcomes for shelter animals***

***Capstone inspiration:*** [Kaggle](https://www.kaggle.com/c/shelter-animal-outcomes)

***Modeling Notebook***

In [23]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
import matplotlib.pyplot as plt

In [24]:
# Load the modeling dataset (per its file name: cleaned, numeric, dummy-encoded)
animals_csv = "../CSVs/cleaned_animals_numeric_dummies.csv"
animals = pd.read_csv(animals_csv)

In [25]:
# Class counts for the target column (the trailing ';' keeps the cell output hidden)

animals.loc[:, "Outcome Type"].value_counts();

In [26]:
# Class proportions for the target: the majority-class share is the baseline accuracy
# (noted by the author as high, about 90%). Output is suppressed by the trailing ';'.

animals.loc[:, "Outcome Type"].value_counts(normalize = True);

***Setting X, y, features***

In [27]:
# Columns excluded from the feature matrix:
#   - "Intake Time", "Outcome Time", "Date of Birth": DateTime objects
#   - "Outcome Type": the prediction target
#   - "Outcome Subtype": many nulls (kept only for EDA/visualization)
#   - "Breed": replaced by columns that were added "manually"
#   - "Color": bucketed, with the buckets concatenated back onto animals

datetime_cols = ["Intake Time", "Outcome Time", "Date of Birth"]
outcome_cols = ["Outcome Type", "Outcome Subtype"]
reencoded_cols = ["Breed", "Color"]

features_to_disregard = datetime_cols + outcome_cols + reencoded_cols

In [28]:
# Every column that was not explicitly excluded becomes a model feature
features = [feat for feat in animals.columns if feat not in features_to_disregard]

X = animals[features]
y = animals["Outcome Type"]

# random_state fixes the shuffle so Restart & Run All yields the same split every time;
# stratify = y keeps the (imbalanced) class ratio the same in the train and test halves.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

***Logistic Regression Model***

In [29]:
# Accuracy score: 0.7196227014996556 when using 9 classes, 0.9582413311900514 when using binary classification

# Standardize the features: the scaler is fitted on the training split only and the
# same transform is then applied to the test split.
# NOTE: X_train / X_test are overwritten with the scaled arrays, and later cells rely on that.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# fit() returns the estimator itself, so `model` and `lr` refer to the same fitted object
lr = LogisticRegression()
lr.fit(X_train, y_train)
model = lr

# Training accuracy (very high)

model.score(X_train, y_train)

0.9590538941195174

In [30]:
# Accuracy on the held-out split is also very high

test_accuracy = model.score(X_test, y_test)
test_accuracy

0.9542684542419586

In [31]:
# Confusion matrix on the held-out split

# The model is very accurate but overpredicts positive outcomes (due to high baseline accuracy)

y_pred = model.predict(X_test)

# For a binary target the 2x2 matrix flattens to (tn, fp, fn, tp)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

cell_counts = [
    ("True Negatives: ", tn),
    ("False Positives: ", fp),
    ("False Negatives: ", fn),
    ("True Positives: ", tp),
]

# One labelled count per line, separated by blank lines
print("\n\n".join(label + str(count) for label, count in cell_counts))

True Negatives: 1196

False Positives: 742

False Negatives: 121

True Positives: 16812


In [None]:
# One row per feature: the fitted coefficient next to the feature name

coef_df = pd.Series(model.coef_[0], name = "coef").to_frame().assign(feature = features)

In [None]:
# Exponentiate each coefficient to help interpret it
# (exp of a logistic-regression coefficient is a multiplicative effect on the odds)

coef_df["exponential_value"] = np.exp(coef_df["coef"])

In [None]:
# Ten largest coefficients: these make sense! (display suppressed by the trailing ';')

coef_df.sort_values(by = "coef", ascending = False).iloc[:10];

In [None]:
# Horizontal bar chart of the ten largest exponentiated coefficients

coef_features_top = (
    coef_df
    .sort_values("exponential_value", ascending = False)
    .head(10)
    # barh draws the first row at the bottom, so re-sort ascending to put the largest bar on top
    .sort_values("exponential_value", ascending = True)
)

ax = coef_features_top.plot.barh(x = "feature", y = "exponential_value", legend = False, color = "blue")

ax.set_title("Top 10 Positive Outcome Features")
ax.set_xlabel("Exponential Value")
ax.set_ylabel("Feature")

plt.show()

In [None]:
# Ten smallest coefficients: these also make sense! (display suppressed by the trailing ';')

coef_df.sort_values(by = "coef", ascending = False).iloc[-10:];

In [None]:
# Horizontal bar chart of the ten smallest exponentiated coefficients

coef_features_bottom = (
    coef_df
    .sort_values("exponential_value", ascending = False)
    .tail(10)
    # barh draws the first row at the bottom, so the smallest value ends up as the lowest bar
    .sort_values("exponential_value", ascending = True)
)

ax = coef_features_bottom.plot.barh(x = "feature", y = "exponential_value", legend = False, color = "blue")

ax.set_title("Bottom 10 Negative Outcome Features")
ax.set_xlabel("Exponential Value")
ax.set_ylabel("Feature")

plt.show()

***Random Forest***

In [None]:
# Simple Random Forest performed very well: 6% over baseline which is high to begin with

# Black box: limited interpretability compared with the logistic-regression coefficients
# (only aggregate importances are exposed, via rf.feature_importances_)

# random_state pins the forest's internal randomness so the score below is reproducible
rf = RandomForestClassifier(random_state = 42)

# NOTE: X_train / X_test are the standardized arrays produced in the Logistic Regression cell
rf.fit(X_train, y_train)

rf.score(X_test, y_test)

***Neural Network***

Neural networks, in a single sentence, iteratively adjust one or more sets of weights so that, taken together, they return the most accurate predictions for a set of inputs. Training is driven by a loss function, which the model attempts to minimize over successive iterations.

In [None]:
# Fresh split from the original X / y (the earlier X_train / X_test were overwritten with scaled arrays).
# random_state makes the split repeatable; stratify = y preserves the class ratio in both halves.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

# Fit the scaler on the training split only, then apply the same transform to the test split
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# One hot encoding target

# Take the width from the full target so both splits get the same number of columns,
# even if one split happens not to contain the highest class label.
# assumes "Outcome Type" holds integer codes starting at 0 — TODO confirm
num_classes = int(y.max()) + 1

y_train = to_categorical(y_train, num_classes = num_classes)
y_test = to_categorical(y_test, num_classes = num_classes)

In [None]:
# Shape of the training matrix, needed to size the neural network's input
# (display suppressed by the trailing ';')

np.shape(X_train);

In [None]:
# Performs extremely well but black box: no interpretability

# Size the network from the data instead of hard-coding the input / output widths,
# so the cell keeps working if the feature set or the number of classes changes.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

model = Sequential()

# One hidden layer as wide as the input, followed by one softmax unit per class
model.add(Dense(n_features, input_dim = n_features, activation = "relu"))
model.add(Dense(n_classes, activation = "softmax"))

# categorical_crossentropy is the loss that pairs with one-hot targets and a softmax output;
# it remains correct if the target has more than two classes.
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

# The test split is passed as validation data, so the per-epoch val_* metrics are computed on it
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 10)