# ML Final Project


### Shruti Kotha, Mia Tey, Jeni Pham, and Shruti Patel

In [67]:
#add imports
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns

ModuleNotFoundError: No module named 'seaborn'

## Final Project Objective: 
Goal: Help determine the outcome of new intakes at the Austin Animal Center. Specfically, we determined that we can use the features (age, breed, color, animal, sex upon outcome, outcome subtype,
name) to predict outcome (Foster, Adoption, Transfer, Euthanize) of dogs.

### Part 1. Data Preperation and data cleaning for Modeling

For this part of the assignment we read in our data and will perform various data prep techniques like data cleaning and feature engineering to prepare our data for modeling.

In [None]:
data = pd.read_csv('animaloutcomes.csv', skipinitialspace = True)
data.head()

View the distribution of "Outcome Type". We want to ensure there is sufficient data for each "Outcome Type" and if there isn't we can drop columns as needed.

In [None]:
outcome_counts = data['Outcome Type'].value_counts()
print(outcome_counts)

First, our label column will be "Outcome Type", since that is what we want to predict. 
If there are any records that lack an entry for "Outcome Type" we drop them.

In [None]:
print("Length before droping Outcome Type: ", len(data['Outcome Type']))
data = data.dropna(subset=['Outcome Type'])
print("Length after drop Outcome Type: ", len(data['Outcome Type']))

Second, we want to drop "Outcome Subtype". Since most rows have NaN in the "Outcome Subtype" column, it won't be an important column for us to use in our model.

In [None]:
data.drop(columns=['Outcome Subtype', 'Name', 'DateTime', 'Date of Birth', 'Animal ID'], inplace=True)
# print the head to ensure only columns important to us are in the dataset
data.head()

Next, we clean up the colors column. Since we plan on using the colors for our models later, we should simplify the categories, as there are way too many unique categories for testing (23).
First, there are entries such as 'black/white' and 'white/black' let's combine these into one label.
Second, there are a lot of unique combinations of two colors. For these records whose combination of color makes up <5% of all colors, add these into one column called 'Multicolor' and jointly categorize it with the 'Tricolor' attribute.

In [None]:
color_count_before = len(data['Color'].value_counts())
print("Number of unique colors before: ", color_count_before)

# Combine categories such as 'black/white' and 'white/black'
def join_same_color(color):
    color_components = color.split('/')
    color_components.sort()
    return '/'.join(color_components)
data['Color'] = data['Color'].apply(join_same_color)


# Create Multicolor category 
color_counts = data['Color'].value_counts()
color_percent = color_counts / color_counts.sum() * 100
less_than_5_percent = color_percent[color_percent < 1].index
data['Color'] = data['Color'].apply(lambda x: 'Multicolor' if x in less_than_5_percent or x == 'Tricolor' else x)

Next, we decied to create new column for seasons. Given that each month's scope was too broad to determine animal outcomes accurately. We noticed season had more of an impact on animal outtake.

In [None]:
def get_season(month):
    if month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Fall'
    else:
        return 'Winter'
    
data['Month'] = pd.to_datetime(data['MonthYear'], format='%b %Y').dt.month
data['Season'] = data['Month'].apply(get_season)
data = data.drop('Month', axis=1)
data.head()

Next, we decieded to categorize each breed into the size of the breed. We noticed our data had too many specific breeds, thus decided to genralize the data. We kept the data of the top most 100 frequent dog breeds and categorized them into their respective size. 

We'll be categorizing the various breeds in the dataset. https://www.trainpetdog.com/dog-breed-size-chart/

In [None]:

# only keep records that are Dogs
records_to_keep = ['Dog']
dog_data = data[data['Animal Type'].isin(records_to_keep)]

# only keep top 100 most frequent dog breeds in dataset
top_100_breeds = dog_data['Breed'].value_counts().head(100).index
breeds_data = dog_data[dog_data['Breed'].isin(top_100_breeds)]
breeds_data = breeds_data.copy()

# print(top_100_breeds)



# large 70 + lbs avg, small, medium
breed_to_category = {
    'Pit Bull Mix': 'Medium/Large Breeds',
    'Labrador Retriever Mix': 'Medium/Large Breeds',
    'Chihuahua Shorthair Mix': 'Small Breeds',
    'German Shepherd Mix': 'Medium/Large Breeds',
    'Pit Bull': 'Medium/Large Breeds',
    'Australian Cattle Dog Mix': 'Medium/Large Breeds',
    'Chihuahua Shorthair': 'Small Breeds',
    'Labrador Retriever': 'Medium/Large Breeds',
    'German Shepherd': 'Medium/Large Breeds',
    'Dachshund Mix': 'Medium/Small Breeds', 
    'Boxer Mix': 'Medium/Large Breeds',
    'Border Collie Mix': 'Medium/Large Breeds',
    'Miniature Poodle Mix': 'Small Breeds',
    'Siberian Husky Mix': 'Medium Breeds',
    'Australian Shepherd Mix': 'Medium/Large Breeds',
    'Catahoula Mix': 'Medium/Large Breeds',
    'Staffordshire Mix': 'Medium/Large Breeds',
    'Siberian Husky': 'Medium Breeds',
    'Rat Terrier Mix': 'Small Breeds',
    'Great Pyrenees Mix': 'Large Breeds',
    'Yorkshire Terrier Mix': 'Small Breeds',
    'Beagle Mix': 'Medium/Large Breeds',
    'Miniature Schnauzer Mix': 'Medium/Small Breeds',
    'Jack Russell Terrier Mix': 'Medium/Small Breeds',
    'Pointer Mix': 'Large Breeds',
    'Cairn Terrier Mix' : 'Small Breeds',
    'American Bulldog Mix': 'Medium/Large Breeds',                      
    'Chihuahua Longhair Mix': 'Small Breeds',   
    'Anatol Shepherd Mix': 'Medium/Large Breeds',
    'Rottweiler Mix': 'Medium/Large Breeds',
    'Australian Cattle Dog': 'Medium Breeds',
    'Black Mouth Cur Mix': 'Medium/Large Breeds',
    'Plott Hound Mix': 'Medium/Large Breeds',
    'Labrador Retriever/Pit Bull': 'Medium/Large Breeds',
    'Australian Kelpie Mix': 'Medium Breeds',
    'Shih Tzu Mix': 'Small Breeds',
    'Chihuahua Shorthair/Dachshund': 'Small Breeds',
    'Great Pyrenees': 'Large Breeds',
    'American Pit Bull Terrier Mix': 'Medium/Large Breeds',
    'German Shepherd/Labrador Retriever': 'Large Breeds',
    'Dachshund/Chihuahua Shorthair': 'Small Breeds',
    'Labrador Retriever/German Shepherd': 'Large Breeds',
    'American Staffordshire Terrier Mix': 'Medium/Large Breeds',
    'Rottweiler': 'Large Breeds',
    'Boxer': 'Medium/Large Breeds',
    'Shih Tzu': 'Small Breeds',
    'Pit Bull/Labrador Retriever': 'Medium/Large Breeds',
    'Dachshund': 'Small Breeds',
    'Golden Retriever Mix': 'Large Breeds',
    'Maltese Mix': 'Small Breeds',
    'Border Terrier Mix': 'Small Breeds',
    'Miniature Pinscher Mix': 'Small Breeds',
    'Yorkshire Terrier': 'Small Breeds',
    'Blue Lacy Mix': 'Medium/Large Breeds',
    'Doberman Pinsch Mix': 'Medium/Large Breeds',
    'Miniature Poodle': 'Small Breeds',
    'Chow Chow Mix': 'Medium/Large Breeds',
    'American Pit Bull Terrier': 'Medium/Large Breeds',
    'Cairn Terrier': 'Small Breeds',
    'Border Collie': 'Medium/Large Breeds',
    'Queensland Heeler Mix': 'Medium Breeds',
    'Doberman Pinsch': 'Medium/Large Breeds',
    'Basset Hound Mix': 'Medium/Large Breeds',
    'Labrador Retriever/Border Collie': 'Large Breeds',
    'Alaskan Husky Mix': 'Medium/Large Breeds',
    'Pug Mix': 'Small Breeds',
    'Beagle': 'Small Breeds',
    'Pomeranian Mix': 'Small Breeds',
    'Mastiff Mix': 'Large Breeds',
    'Cocker Spaniel Mix': 'Small Breeds',
    'Cardigan Welsh Corgi Mix': 'Medium Breeds',
    'Miniature Schnauzer': 'Small Breeds',
    'Lhasa Apso Mix': 'Small Breeds',
    'Chinese Sharpei Mix': 'Medium/Large Breeds',
    'Australian Shepherd': 'Medium/Large Breeds',
    'Labrador Retriever/Australian Cattle Dog': 'Medium/Large Breeds',
    'Black/Tan Hound Mix': 'Medium/Large Breeds',
    'Pug': 'Small Breeds',
    'Chihuahua Longhair': 'Small Breeds',
    'Great Dane Mix': 'Large Breeds',
    'Boston Terrier Mix': 'Small Breeds',
    'Border Collie/Labrador Retriever': 'Large Breeds',
    'Labrador Retriever/Great Pyrenees' : 'Large Breeds',
    'Dachshund Wirehair Mix' : 'Small Breeds',
    'Dachshund Longhair Mix' : 'Small Breeds',
    'Alaskan Husky' : 'Medium/Large Breeds',
    'Flat Coat Retriever Mix' : 'Large Breeds',
    'Manchester Terrier Mix' : 'Small Breeds',
    'Toy Poodle Mix' : 'Small Breeds',
    'American Bulldog' : 'Medium/Large Breeds',
    'Rat Terrier' : 'Small Breeds',
    'Collie Smooth Mix' : 'Medium/Large Breeds',
    'Maltese' : 'Small Breeds',
    'Anatol Shepherd' : 'Medium/Large Breeds',
    'Staffordshire' : 'Medium/Large Breeds',
    'Belgian Malinois Mix' : 'Medium/Large Breeds',
    'Pit Bull/Boxer' : 'Medium/Large Breeds',
    'Norfolk Terrier Mix' : 'Small Breeds',
    'Australian Cattle Dog/Labrador Retriever' : 'Medium/Large Breeds',
    'Rhod Ridgeback Mix' : 'Large Breeds'         
}
print()
print("Distribution based on size of breed:")

categories = [breed_to_category.get(breed) for breed in breeds_data['Breed']]

category_counts = Counter(categories)

# make sure there is still sufficent data for each size of the animal
for category, count in category_counts.items():
    print(f'{category}: {count}')

# to do add a size category with the breed to wsize mapping
breeds_data.loc[:, 'Size'] = breeds_data['Breed'].map(breed_to_category)
dog_data = breeds_data


# make sure there is a new Size column that reflects the size of the breed for that animal
dog_data.head()


### Part 2. Data Exploration 

We will be graphing data distributions as part of data exploration. We will be looking for any imbalances, outliers, or anything that could potentially skeww the results of our data. We will then correct that before modeling.

Plot distribution of colors:

In [None]:
color_counts = dog_data['Color'].value_counts()
color_counts.plot(kind='pie', autopct='%1.1f%%')
plt.title('Distribution of Colors')
plt.show()

Display of distribution of season and outcome

In [None]:
season_counts = dog_data['Season'].value_counts()
season_counts.plot(kind='bar')
plt.title('Frequency of Seasons within dataset')
plt.xlabel('Season')
plt.ylabel('Frequency')
plt.xticks(rotation=0)  
plt.show()

outcome_by_season = pd.crosstab(dog_data['Season'], dog_data['Outcome Type'])
outcome_by_season.plot(kind='bar', stacked=True)
plt.title('Outcomes by Season')
plt.xlabel('Season')
plt.ylabel('Frequency')
plt.xticks(rotation=0)  
plt.legend(loc='upper right', bbox_to_anchor=(1.5, 1))

plt.show()


Distribution of dog breed sizes for the top 100 breeds in the dataset.  

In [None]:
category_labels = [label for label, value in category_counts.items() if label is not None]
category_values = [value for label, value in category_counts.items() if label is not None]


plt.figure(figsize=(10, 6))
plt.bar(category_labels, category_values)
plt.title('Distribution of Dog Breeds by Category')
plt.xlabel('Category')
plt.ylabel('Count')

plt.xticks(rotation=45)

plt.show()

Distribution of dog sex upon Outcome.

In [None]:
sex_upon_outcome = dog_data["Sex upon Outcome"]

male = 0
female = 0

for outcome in sex_upon_outcome:
    if pd.notna(outcome):
        if "Male" in outcome:
            male += 1
        elif "Female" in outcome:
            female += 1

        
categories = ["Male", "Female"]
counts = [male, female]

plt.bar(categories, counts)
plt.title("Distribution of Male and Female")
plt.xlabel("Sex")
plt.ylabel("Count")
plt.show()

print(f"Number of Males: {male}")
print(f"Number of Females: {female}")



Distrubtion of nuetured vs spayed at outcome.

In [None]:
neutered = 0
spayed = 0
intact = 0

sex_counts = dog_data["Sex upon Outcome"].value_counts()
print(sex_counts)

for outcome in sex_upon_outcome:
    if pd.notna(outcome):
        if "Neutered" in outcome:
            neutered += 1
        elif "Spayed" in outcome:
            spayed += 1
        elif "Intact" in outcome: 
            intact += 1
        
categories = ["Neutered", "Spayed", "Intact"]
counts = [neutered, spayed, intact]

plt.bar(categories, counts)
plt.title("Distribution of Neutered, Spayed, or Intact")
plt.xlabel("Sterilization Status")
plt.ylabel("Count")
plt.show()

print(f"Number of Neutered: {neutered}")
print(f"Number of Spayed: {spayed}")
print(f"Number of Intact: {intact}")


Distribution of Months when animals were registered (?)

In [None]:
month_data = dog_data['MonthYear']
df = pd.DataFrame(month_data)
pattern = r'([a-zA-Z]{3}) \d{4}'
df['MonthYear'] = df['MonthYear'].str.extract(pattern)
# print(df)
data.head()
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

month_counts = df['MonthYear'].value_counts()
plt.figure(figsize=(12, 6))
sns.barplot(x=month_counts.index, y=month_counts.values, order=month_order)
plt.title('Distribution of Months')
plt.xlabel('Month')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()



Convert the age upon outcome into months so all ages are generalized and then plot the distribution.

In [None]:
def convert_to_months(age_str):
    # Split the age string into value and unit
    value, unit = age_str.split()
    value = int(value)
    
    if 'year' in unit or 'years' in unit:
        return value * 12
    elif 'month' in unit or 'months' in unit:
        return value
    elif 'week' in unit or 'weeks' in unit:
        return value // 4  # Approximate conversion: 1 month = 4 weeks
    elif 'day' in unit or 'days' in unit:
        return value // 30 # Approximate conversion: 1 month = 30 days
    else:
        return 0  # default case

age_data = dog_data['Age upon Outcome']
df = pd.DataFrame(age_data)
df['Age upon Outcome'] = df['Age upon Outcome'].fillna('0 days')  #  fills NaN with '0 days'
# Apply the conversion function to the column
df['Age upon Outcome'] = df['Age upon Outcome'].apply(convert_to_months)
# print(df.head())

mean_age = df['Age upon Outcome'].mean()
median_age = df['Age upon Outcome'].median()
mode_age = df['Age upon Outcome'].mode()
age_df = df

print(f"Mean Age (in months): {mean_age:.2f}")
print(f"Median Age (in months): {median_age:.2f}")
print(f"Mode Age (in months): {mode_age.iloc[0]}")  

age_counts = df.loc[(df['Age upon Outcome'] >= 0) & (df['Age upon Outcome'] <= 228), 'Age upon Outcome'].value_counts().sort_index()
age_counts.plot(kind='bar', figsize=(12,6))
plt.title('Distribution of Ages in Months')
plt.xlabel('Age (in months)')
plt.ylabel('Frequency')
plt.show()





In [None]:
# remove any negative ages
df = df[df['Age upon Outcome'] >= 0]

plt.figure(figsize=(10, 6))
sns.boxplot(x=df['Age upon Outcome'])
plt.xlabel('Age upon Outcome')
plt.ylabel('Frequency')
plt.title(f'Box-and-Whisker Plot')
plt.show()
q1 = df['Age upon Outcome'].quantile(0.25)
q3 = df['Age upon Outcome'].quantile(0.75)
iqr = q3 - q1
print(f"50% of dog's ages in the dataset are between: {q1} and {q3} months")


### Part 3. Feature Engineering for Models

In [None]:
# drop the columns we don't need for our models
dog_data.drop(columns=['Animal Type', 'MonthYear', 'Breed'], inplace=True)
dog_data.head()

#### Age: Years to Months
Change age to be uniformly described by months. 

In [None]:
dog_data = dog_data.drop(columns=['Age upon Outcome'])
dog_data = pd.concat([dog_data, df.iloc[:, 0]], axis=1)
dog_data = dog_data[dog_data['Age upon Outcome'] >= 0]
dog_data.head()

#### Encode Sex: 
This code performs one-hot encoding on the categorical columns 'Sex upon Outcome' and 'Color' in the 'dog_data' DataFrame, creating binary columns for each category. The encoded columns are then concatenated with the original DataFrame, and the original categorical columns are dropped, resulting in a DataFrame with expanded feature representations for 'Sex upon Outcome' and 'Color'.

In [None]:
encoded_sex = pd.get_dummies(dog_data['Sex upon Outcome'], prefix='Sex upon Outcome').astype(int)
encoded_colors = pd.get_dummies(dog_data['Color'], prefix='Color').astype(int)
dog_data = pd.concat([dog_data, encoded_sex, encoded_colors], axis=1)
dog_data = dog_data.drop(columns=['Color', 'Sex upon Outcome'])
dog_data.head()



#### Encode Size: 
This code snippet encodes the 'Size' column in the 'dog_data' DataFrame ordinally based on predefined size categories. It creates a new column named 'Breed size' where each dog's size is represented by an ordinal encoding, and subsequently, the original 'Size' column is dropped from the DataFrame. 

In [None]:
# encode breeds ordinally
sizes = ['Small Breeds', 'Medium/Small Breeds', 'Medium/Large Breeds', 'Large Breeds']

size_encode_mapping = {
    'Small Breeds': 1,
    'Medium/Small Breeds': 2,
    'Medium Breeds': 3,
    'Medium/Large Breeds': 4,
    'Large Breeds': 5
}

dog_data['Breed size'] = dog_data['Size'].map(size_encode_mapping)
dog_data = dog_data.drop(columns=['Size'])


dog_data.head()

#### Encode Season: 
This code snippet encodes the 'Season' column in the 'dog_data' DataFrame cyclically using sine and cosine functions. It creates two new columns, 'Season_cos' and 'Season_sin', which represent the cyclic encoding of seasons. The original 'Season' column and an intermediate numeric encoding column are then dropped from the DataFrame, resulting in a dataset where seasons are represented as cyclical features through trigonometric functions.

In [None]:
import numpy as np

## encode seasons cyclically with sine and cosine functions
season_mapping = {'Winter': 0, 'Spring': 1, 'Summer': 2, 'Fall': 3}
dog_data['Season_numeric_encode'] = dog_data['Season'].map(season_mapping)
dog_data['Season_cos'] = np.cos(2 * np.pi * dog_data['Season_numeric_encode'] / 4)
dog_data['Season_sin'] = np.sin(2 * np.pi * dog_data['Season_numeric_encode'] / 4)
dog_data = dog_data.drop(columns=['Season', 'Season_numeric_encode'])
dog_data.head()

### Part 4. Data Modeling
We decided to only use dogs for this dataset because having cat and other would complicate results and cause
our data to fall victim to the curse of dimensionality. Additionally, breed will likely play a large role in accuracy. In order to use breed, we will have to center our focus on dogs.

### Model 1. Naive Bayes
First, we are using Naive Bayes on our data. The data has already been cleaned and engineered, so all we have to do is set the label and features variables.

In [None]:
#only use these columns from the dataset
# col_names = ['Sex upon Outcome', 'Breed', 'Color', 'Season']
# NB_df = dog_data.loc[:, col_names]
#append age from df and then append outcome type last
# NB_df = pd.concat([NB_df, df.iloc[:, 0]], axis=1)
# NB_df = pd.concat([NB_df, dog_data[['Outcome Type']]], axis=1)

#set label col to Outcome Type
label = dog_data['Outcome Type']
label = label.values.ravel()
features= dog_data.drop(['Outcome Type'],axis=1)


#verify we have correct columns
# NB_df.head()
dog_data.head()

 Next, create a Multinomial Naive Bayes classifier (since it supports categorical target 
variables) and perform a 10-fold cross validation on the classifier. 
Print accuracy.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore")

# verify shape is correct for NB model
print(dog_data.shape)
print(label.shape)

# normalize and scale features
features = pd.DataFrame(features)
label = pd.DataFrame(label)
le = LabelEncoder()
features = features.apply(le.fit_transform)
label = label.apply(le.fit_transform)

scaler = MinMaxScaler()
features = scaler.fit_transform(features)

# finally model data and check accuracy
mnb = MultinomialNB()
mnb_CV = cross_val_score(mnb, features, label, cv=10)
print('Accuracy: ', mnb_CV.mean())

warnings.resetwarnings()

In [None]:
from sklearn.metrics import confusion_matrix

class_names = np.unique(dog_data['Outcome Type'])
cm = confusion_matrix(label, pred)
sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix for Naive Bayes')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

### Model 2.5 Ensembling: (can move this later)
Let's try ensembling with random forests. Ensembling base classifiers usually performs better because it combines the predictions of multiple base classifiers.

#### Random forests: 
Let's use a GridSearchCV with a 3-fold CV and try 15, 25, and 50 base classifiers of fully grown decision trees and see which performs best. Then wrap the GridSearchCV in a cross_val_predict with 5-fold CV and display the classification report.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
warnings.filterwarnings("ignore")

label = label.values.ravel()
rf = RandomForestClassifier()

parameter = {
    'n_estimators': [15, 25, 50]
}
grid = GridSearchCV(rf, parameter, cv=3)
pred = cross_val_predict(grid, features, label, cv=5)
# CV_score = cross_val_score(grid, features, label, cv=5)
# print('Accuracy: ', CV_score.mean())
print('Classification Report: \n', classification_report(label, pred))
warnings.resetwarnings()

In [None]:
from sklearn.metrics import confusion_matrix

class_names = np.unique(dog_data['Outcome Type'])
cm = confusion_matrix(label, pred)
sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix for Random Forest')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

#### Boosting: 
Let's use a GridSearchCV with a 3-fold CV and try 15, 25, and 50 base classifiers of decision stumps. Then wrap the GridSearchCV in a cross_val_predict with 5-fold CV and display the classification report.

In [68]:
from sklearn.ensemble import AdaBoostClassifier
warnings.filterwarnings("ignore")

boost = AdaBoostClassifier()
parameter = {
    'n_estimators': [15, 25, 50]
}
grid = GridSearchCV(boost, parameter, cv=3)
pred = cross_val_predict(grid, features, label, cv=5)
print('Classification Report: \n', classification_report(label, pred))
warnings.resetwarnings()

NameError: name 'GridSearchCV' is not defined

In [69]:
from sklearn.metrics import confusion_matrix

class_names = np.unique(dog_data['Outcome Type'])
cm = confusion_matrix(label, pred)
sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix for Boosting')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

NameError: name 'pred' is not defined

### Model 2. Decision Tree

Next, we will create a Decision Tree. I implemented hyperparameter optimization for a Decision Tree classifier using a grid search strategy to find the identify the most effective combination of hyperparameters—specifically, the maximum depth of the Decision Tree (`max_depth`) and the number of splits in the StratifiedKFold cross-validator (`n_splits`). Further, a cross-validation was implemented to ensure the best split of the dataset.

The script tracks the best hyperparameter values by comparing mean accuracies. The split that yields the best accuracy has max_depth of 10 and N-splits of 40. The best overall accuracy is 0.62.

In [70]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, export_text
import numpy as np
import warnings
warnings.filterwarnings("ignore")

X = dog_data.drop('Outcome Type', axis=1)
y = dog_data['Outcome Type']

# max_depth and n_splits we want to try
max_depth_values = [5, 10, 15, 20]
n_splits_values = [10, 20, 30, 40]

best_mean_accuracy = 0
best_max_depth = None
best_n_splits = None

# iterate over max_depth and n_splits values
for max_depth in max_depth_values:
    for n_splits in n_splits_values:
        dt_model = DecisionTreeClassifier(random_state=42, max_depth=max_depth, min_samples_split=2)
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

        # perform cross-validation
        cv_scores = cross_val_score(dt_model, X, y, cv=cv, scoring='accuracy')
        mean_accuracy = np.mean(cv_scores)

        # update the best parameters if the current combination is better
        if mean_accuracy > best_mean_accuracy:
            best_mean_accuracy = mean_accuracy
            best_max_depth = max_depth
            best_n_splits = n_splits

# print the best hyperparameters and mean accuracy
print("Best Max Depth:", best_max_depth)
print("Best N_splits:", best_n_splits)
print("Best Mean Accuracy:", best_mean_accuracy)

Best Max Depth: 10
Best N_splits: 40
Best Mean Accuracy: 0.6211581708887106


In [74]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Best hyperparameters
best_max_depth = 10
best_n_splits = 40

# Split the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model with the best hyperparameters
dt_model = DecisionTreeClassifier(random_state=42, max_depth=best_max_depth, min_samples_split=2)
dt_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt_model.predict(X_test)

# Create a confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm)


ModuleNotFoundError: No module named 'seaborn'

### Model 3: K-Nearest Neighbor

The K-nearest neighbor modeling is an algorithm that's simple to understand and implement. It's simplicity and high accuracy potential is why we decided to use it to model our dataset. However, there are various additional considerations we must take into account when using KNN. In cases where a class inbalance exists, KNN is prone to break down. In our case, we have certain class labels that dominate our data. For example, the majority of our outcome instances are adoption, return to owner, and transfer. These outcomes are only three out of nine potential outcomes but make up about 95% of the outcome types in our data. We need to institute a measure to combat against this imbalance. Using weighted voting for our model will prevent the class imbalance from harming our results. In our grid search, weighted and uniform vote will be one of our parameters to see which hyperparameter leads to the better results. This will check for class imbalances worsening our model accuracy. We will also hypertune our parameter K to find a K value that fits our data well and can generalize to new data points too, avoiding overfitting or underfitting. 

Here we're training KNN Model. Grid search will allow us to optimize for our paramaters. We also want to scale our data before training the model since the KNN's algorithm is based on distance and we want to make sure each feature's distances are on the same scale. 



In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")

# seperate features & labels
features = dog_data.drop('Outcome Type', axis=1)
labels = dog_data['Outcome Type']

# KNN with 80-20 train/test split
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
x_test = np.ascontiguousarray(x_test)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_test_scaled = np.ascontiguousarray(x_test_scaled)


#initialize parameter grid to find best parameters
param_grid = {
    'n_neighbors': range(3, 25),
    'weights': ['uniform', 'distance']
}

#create knn classifier 
KNN_classifier = KNeighborsClassifier()

# search for best parameters
grid_search = GridSearchCV(KNN_classifier, param_grid, cv=4, scoring='accuracy')
grid_search.fit(x_train_scaled, y_train)
best_params = grid_search.best_params_
print("Best Hyperparameters for KNN model:", best_params)

warnings.resetwarnings()


Best Hyperparameters for KNN model: {'n_neighbors': 23, 'weights': 'uniform'}


Calculate the accuracy for the model

In [32]:
from sklearn.metrics import classification_report

print("Best Cross-Validated Accuracy:", grid_search.best_score_)

# calculate accuracy using best parameters
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy for each trial:", accuracy)
avg_test_scores = grid_search.cv_results_['mean_test_score']
print(avg_test_scores)

# get classfification report
report = classification_report(y_test, y_pred, zero_division=1)
print("Classification Report:\n", report)

Best Cross-Validated Accuracy: 0.6139198708541239
Test Accuracy for each trial: 0.6115635776652726
[0.55963824 0.55591429 0.57508439 0.56679263 0.58524729 0.57411212
 0.59242002 0.58077121 0.59535515 0.58218374 0.60074846 0.58594438
 0.60245451 0.58667816 0.60641694 0.58911799 0.60711403 0.59029205
 0.60922366 0.59126431 0.60748092 0.59119093 0.6090769  0.59262181
 0.60953552 0.59201644 0.61025095 0.59276856 0.61193866 0.59353904
 0.61215879 0.59407103 0.61080129 0.59275022 0.61278251 0.59374083
 0.61311271 0.59456633 0.61371808 0.59506164 0.61391987 0.59474978
 0.61358967 0.59542853]
Classification Report:
                  precision    recall  f1-score   support

       Adoption       0.65      0.90      0.76      6547
           Died       1.00      0.00      0.00        55
       Disposal       1.00      0.00      0.00        10
     Euthanasia       1.00      0.00      0.00       340
        Missing       1.00      0.00      0.00         4
Return to Owner       0.48      0.34     

SVM -> was taking wayy too long to run

In [None]:
# from sklearn.svm import SVC

# features = dog_data.drop('Outcome Type', axis=1)
# labels = dog_data['Outcome Type']

# # test different c + kernel hyperparameters
# param_grid = {
#     'C': [0.1, 1, 10],
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
# }

# x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)
# x_test = scaler.transform(x_test)
# svm_model = SVC()
# grid_search = GridSearchCV(svm_model, param_grid, cv=4, scoring='accuracy', verbose=1, n_jobs=-1)
# grid_search.fit(x_train, y_train)
# print("Best Parameters:", grid_search.best_params_)
# y_pred = grid_search.predict(x_test)


In [None]:
# accuracy = accuracy_score(y_test, y_pred)
# print("accuracy of SVM model : ", accuracy)
# print(classification_report(y_test, y_pred))

A potential reason for low accuracy could be the class imbalance that exists within the dataset. We have a lot of data/records for certain classes + those are the classes that tend to have to most data predicted accurately whereas underrepresented classes don't have e