***Sam Cressman Capstone Project: Shelter Animal Outcomes***

***Help improve outcomes for shelter animals***

***Capstone inspiration:*** [Kaggle](https://www.kaggle.com/c/shelter-animal-outcomes)

***Modeling Notebook***

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

In [None]:
# Load the pre-cleaned animals dataset. Judging by the file name it is already numeric /
# dummy-encoded (presumably written by an earlier cleaning notebook — TODO confirm).
# Note that the path has no file extension.
animals = pd.read_csv("./cleaned_animals_numeric_dummies")

In [None]:
# Evaluating target: number of rows per "Outcome Type" class.
# The trailing ";" suppresses the cell's rendered output.

animals["Outcome Type"].value_counts();

In [None]:
# Class proportions of the target. The largest proportion is the majority-class baseline
# accuracy that every model below has to beat. Author's note: baseline accuracy is high.
# The trailing ";" suppresses the cell's rendered output.

animals["Outcome Type"].value_counts(normalize = True);

***Setting X, y, features***

In [None]:
# Columns that must not be used as model inputs, with the reason for each exclusion.

features_to_disregard = [
    # DateTime objects
    "Intake Time",
    "Outcome Time",
    "Date of Birth",
    # The prediction target itself
    "Outcome Type",
    # Many nulls: kept only for EDA/visualization
    "Outcome Subtype",
    # Replaced by columns that were added "manually"
    "Breed",
    # Replaced by color buckets that were concatenated back onto animals
    "Color",
]

In [None]:
# Every column that is not explicitly excluded becomes a model feature.
features = [feat for feat in animals.columns if feat not in features_to_disregard]

X = animals[features]
y = animals["Outcome Type"]

# Fixed seed so the split, and therefore every score reported below, is reproducible
# after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

***Logistic Regression Model***

In [None]:
# Reported accuracy score: 0.7196227014996556 when using 9 classes, 0.9582413311900514 when using binary classification

# Standardize the features: the scaling statistics are learned from the training split
# only and then reused, unchanged, for the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# fit() returns the fitted estimator, so `model` and `lr` refer to the same object.
lr = LogisticRegression()
model = lr.fit(X_train, y_train)

# Mean accuracy on the training split. Author's note: performs very well on training data.

model.score(X_train, y_train)

In [None]:
# Mean accuracy of the fitted logistic regression on the held-out (scaled) test split.
# Author's note: also performs very well on testing data.

model.score(X_test, y_test)

In [None]:
# Predictions: predicted class labels for the test split.
# The trailing ";" suppresses the cell's rendered output.

model.predict(X_test);

In [None]:
# Creating DataFrame to view coefficient values.
# The coefficients are read from `lr` (the fitted LogisticRegression) rather than `model`:
# both names refer to the same object at this point, but `model` is rebound to the Keras
# network later in the notebook, so `model.coef_` fails if this cell is re-run afterwards.
# coef_[0] is the first row of the coefficient matrix: the only row for a binary target,
# but just the first class's coefficients for a multiclass target.

coef_df = pd.DataFrame({
        "coef": lr.coef_[0],
        "feature": features
    })

In [None]:
# Helping to interpret coefficients: exp(coef) is the multiplicative change in the odds
# for a one-unit increase in the corresponding (standardized) feature.
# Computed with a single vectorized call instead of a Python-level loop.

coef_df["exponential_value"] = np.exp(coef_df["coef"])

In [None]:
# First 20 rows of the descending sort: the 20 largest coefficients.
# Author's note: this makes sense! (Output suppressed by the trailing ";".)

coef_df.sort_values(by="coef", ascending=False).iloc[:20];

In [None]:
# Last 20 rows of the same descending sort: the 20 smallest coefficients.
# Author's note: this also makes sense! (Output suppressed by the trailing ";".)

coef_df.sort_values(by="coef", ascending=False).iloc[-20:];

***GridSearch Logistic Regression Model***

In [None]:
# GridSearch results on 9 classes (commenting out due to run time)
# Performed only slightly better than Logistic Regression with 9 classes

# 0.7242938651498825
# {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}

# gs_params = {
#     "penalty": ["l1", "l2"],
#     "solver": ["liblinear"],
#     "C": [0.1 , 1]
# }

# lr_gridsearch = GridSearchCV(LogisticRegression(), gs_params)

# lr_gridsearch_model = lr_gridsearch.fit(X_train, y_train)

# print(lr_gridsearch_model.best_score_)

# print(lr_gridsearch_model.best_params_)

***Random Forest***

In [None]:
# Simple Random Forest performed very well: 6% over baseline which is high to begin with

# Black box: limited/no interpretability

# Seeded so that the fitted forest, and therefore the score below, is reproducible
# after Restart Kernel -> Run All.
# Note: X_train / X_test are the standardized arrays produced in the logistic-regression cell.
rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)

# Mean accuracy on the held-out test split
rf.score(X_test, y_test)

***Neural Network***

In a single sentence: a neural network iteratively trains one or more sets of weights that, when used together, return the most accurate predictions for a set of inputs. Training is driven by a loss function, which the model attempts to minimize over successive iterations.

In [None]:
# Fresh split from the original, unscaled X and y: the earlier X_train / X_test were
# overwritten with standardized arrays in the logistic-regression cell.
# Fixed seed so the neural-network results are reproducible after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scaling statistics are learned from the training split only and reused for the test split.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# One hot encoding target.
# num_classes is passed explicitly (derived from the full target `y`) so the train and test
# matrices always have the same number of columns, even if the highest-numbered class is
# missing from one of the splits. Without it each call infers its own width from its input.
# Assumes "Outcome Type" holds non-negative integer labels — TODO confirm.
# NOTE: this cell overwrites y_train / y_test, so run it only once per split.

n_classes = int(y.max()) + 1

y_train = to_categorical(y_train, num_classes=n_classes)
y_test = to_categorical(y_test, num_classes=n_classes)

In [None]:
# Need shape to help create neural network: the second entry (number of columns) has to
# match the `input_dim` of the network's first layer.
# The trailing ";" suppresses the cell's rendered output.

X_train.shape;

In [None]:
# Performs extremely well but black box: no interpretability

# Layer sizes are taken from the data instead of hard-coded literals (previously 248 inputs
# and 2 outputs), so the cell keeps working if the feature set or the number of classes changes.
n_features = X_train.shape[1]
n_classes = y_train.shape[1]

# NOTE: this rebinds `model`, which referred to the logistic regression earlier in the notebook.
model = Sequential()

# Single hidden layer, as wide as the input.
model.add(Dense(n_features, input_dim = n_features, activation= "relu"))
# One softmax output per one-hot target column.
model.add(Dense(n_classes, activation = "softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics=["accuracy"])

# The test split is passed as validation data, so the per-epoch validation metrics
# are computed on the test set.
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 10)

In [None]:
# Predictions

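# model.predict_classes(X_test)
# NOTE: `predict_classes` was removed from newer Keras releases; the equivalent is
# np.argmax(model.predict(X_test), axis=1)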