This file was an attempt to use the neural-network hyperparameters returned by the FindingModel file. We used the same preprocessing as in most of our later files: bucketing colors and converting age to weeks. Notably, we also tried bucketing the breeds into small, medium, and large, based on keywords in the Breed values and the typical sizes associated with them. For example, if "chihuahua" appeared in the breed, we would consider that dog small.
We ended up commenting that step out because it did not improve the model; it actually reduced the accuracy.

In [20]:
# PREPROCESSING

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier  # For KNN
import lightgbm as lgb
import warnings
import xgboost as xgb
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')

# === Helper Functions ===
def simplify_color(color_str):
    """
    Collapse a raw color description into one of a few coarse buckets.

    Matching is a case-insensitive substring test. Buckets are tried in a
    fixed priority order (black, brown, white, tan/gold, gray), so a
    multi-color value such as "Black/White" lands in the first bucket that
    matches. Missing values map to "unknown"; anything unmatched is "other".
    """
    if pd.isna(color_str):
        return "unknown"

    # (keywords, bucket) pairs, listed in priority order.
    buckets = (
        (("black",), "black"),
        (("brown",), "brown"),
        (("white",), "white"),
        (("tan", "gold"), "tan_gold"),
        (("grey", "gray"), "gray"),
    )
    lowered = color_str.lower()
    for keywords, bucket in buckets:
        if any(word in lowered for word in keywords):
            return bucket
    return "other"

def age_to_weeks(age_str):
    """
    Convert an age string such as "2 years" into an estimated number of weeks.

    The string is expected to look like "<number> <unit>", where the unit
    contains "year", "month", "week" or "day" (singular or plural, any case).
    Conversions: 1 year = 52 weeks, 1 month = 4 weeks (an approximation, so
    "6 months" becomes 24 weeks), 1 day = 1/7 of a week.

    Returns np.nan when the value is missing or not a string, has fewer than
    two tokens, has a non-numeric count, or uses an unrecognized unit.
    """
    # Missing values (None / NaN) and any other non-string input are unknown.
    if not isinstance(age_str, str):
        return np.nan

    tokens = age_str.split()
    if len(tokens) < 2:
        return np.nan

    try:
        count = float(tokens[0])
    except ValueError:
        # A non-numeric count is treated like any other unparseable age.
        return np.nan

    # (unit keyword, weeks per unit), checked in order against the unit token.
    conversions = (
        ('year', 52.0),
        ('month', 4.0),
        ('week', 1.0),
        # Without this entry, ages given in days became NaN and were later
        # imputed as if the age were unknown.
        ('day', 1.0 / 7.0),
    )
    unit = tokens[1].lower()
    for keyword, weeks_per_unit in conversions:
        if keyword in unit:
            return count * weeks_per_unit
    return np.nan
    

# 1) Exact-match lookup: breed name -> size bucket.
#    Keys must be lowercase with no surrounding whitespace, because
#    size_from_breed lower-cases and strips each breed component before
#    looking it up here. Partial-name matching belongs in the keyword
#    lists further down, not in this dict.
breed_size = {
    # SMALL (<20 lbs avg)
    'chihuahua': 'small',
    'pembroke welsh corgi': 'small',
    'pug': 'small',
    'yorkshire terrier': 'small',
    'dachshund': 'small',
    'pomeranian': 'small',
    'papillon': 'small',
    'shih tzu': 'small',
    'maltese': 'small',
    'rat terrier': 'small',
    'jack russell terrier': 'small',
    'west highland white terrier': 'small',
    # MEDIUM (20–50 lbs avg)
    'border collie': 'medium',
    'australian cattle dog': 'medium',
    'beagle': 'medium',
    'boston terrier': 'medium',
    'cocker spaniel': 'medium',
    'cairn terrier': 'medium',
    'bichon frise': 'medium',
    'siberian husky': 'medium',
    # LARGE (50+ lbs avg)
    'labrador retriever': 'large',
    'golden retriever': 'large',
    'german shepherd': 'large',
    'rottweiler': 'large',
    'boxer': 'large',
    'great dane': 'large',
    'mastiff': 'large',
    'newfoundland': 'large',
    'bernese mountain dog': 'large',
    'great pyrenees': 'large',
    'alaskan malamute': 'large',
    'cane corso': 'large',
    'doberman pinscher': 'large',
    'chow chow': 'large',
    # …add more breeds as needed…
}

# 2) Fallback keyword sets for truly rare / unlisted mixes.
#    These are matched as substrings of a breed component, with the small
#    keywords checked before the large ones.
small_keys = ['chihuahua', 'toy', 'pomeranian', 'papillon', 'yorkshire', 'pug']
large_keys = [
    'mastiff', 'wolfhound', 'dane', 'newfoundland', 'retriever', 'shepherd',
    'rottweiler', 'boxer', 'bulldog', 'malamute',
    # Replaces the old ' dob' dict key, which could never equal a stripped
    # component; as a substring keyword it now catches any doberman variant.
    'doberman',
]

def size_from_breed(breed):
    """
    Estimate a size bucket ('small', 'medium' or 'large') from a breed string.

    The breed is lower-cased, the substring 'mix' is removed, and the result
    is split on '/' and ',' into components. Each component is sized by an
    exact lookup in breed_size; if that fails, by a substring scan of
    small_keys and then large_keys; and otherwise defaults to 'medium'.
    A multi-breed value takes the largest size among its components.

    Missing or non-string values (None, NaN) return 'medium', the same
    default used for unrecognized breeds.
    """
    # Guard against missing values, which would otherwise fail on .lower().
    if not isinstance(breed, str):
        return 'medium'

    cleaned = breed.lower().replace('mix', '').strip()
    # Treat '/' and ',' alike as separators between parent breeds.
    parts = [part.strip() for part in cleaned.replace('/', ',').split(',')]

    sizes = set()
    for part in parts:
        if part in breed_size:
            sizes.add(breed_size[part])
        elif any(key in part for key in small_keys):
            sizes.add('small')
        elif any(key in part for key in large_keys):
            sizes.add('large')
        else:
            sizes.add('medium')

    # A mixture takes the LARGEST of its parents.
    for size in ('large', 'medium'):
        if size in sizes:
            return size
    return 'small'

In [21]:


# === 1. Load Data ===
df_full = pd.read_csv("train.csv")

# === 2. Target & ID Setup ===
target_col = "Outcome Type"
id_col = "Id"

# Stratified random sample of 50,000 rows that keeps the target distribution
# of the full dataset, intended for quick prototyping.
# NOTE: the features and target below are built from df_full, NOT from
# sample_df, so this sample currently has no effect on the model. Switch the
# two `df_full` references below to `sample_df` to prototype on the sample.
sample_df, _ = train_test_split(
    df_full,
    train_size=50000,
    stratify=df_full[target_col],
    random_state=42,
)

# Drop the target, the id, and columns that are either high-leakage or not
# available in the test set (Outcome Time is dropped for the latter reason).
# errors='ignore' tolerates any of these columns being absent.
drop_cols = ['Found Location', 'Date of Birth', 'Name', target_col, id_col, 'Outcome Time']
X = df_full.drop(columns=drop_cols, errors='ignore')
y = df_full[target_col]

# === 3. Encode the Target Variable ===
# Map the outcome labels to integers so every model accepts them;
# le_y.inverse_transform recovers the original labels.
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

# === 4. Feature Engineering ===
# Collapse the raw 'Color' strings into a handful of standardized categories.
X['Color Category'] = X['Color'].apply(simplify_color)
X = X.drop(columns=['Color'])

# Replace the free-text intake age with a numeric age in weeks.
X['Age in Weeks'] = X['Age upon Intake'].apply(age_to_weeks)
X = X.drop(columns=['Age upon Intake'])

# Intake Time is not used as a feature, so it is dropped without parsing.
# To derive time-based features (hour, day of week, ...), parse it first with
#   pd.to_datetime(X['Intake Time'], format=datetime_format, errors='coerce')
# and extract them before this drop.
datetime_format = "%m/%d/%Y %I:%M:%S %p"
X = X.drop(columns=['Intake Time'])

# Breed -> size bucketing (disabled).
# This step was tried and then commented out because it reduced accuracy
# instead of improving it. Uncomment to re-enable.
# X['Size Category'] = X['Breed'].apply(size_from_breed)
# # Quick sanity check: number of distinct breeds per size bucket
# print(X[['Breed','Size Category']]
#       .drop_duplicates()
#       .groupby('Size Category')
#       .size())
# X = X.drop(columns=['Breed'])

# Fill missing values: object columns get the literal "Unknown";
# every other column gets its own median.
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].fillna("Unknown")
    else:
        X[col] = X[col].fillna(X[col].median())



# === 5. Encode Categorical Variables ===
# Every column still holding strings is one-hot encoded; drop_first=True
# omits the first level of each column.
categorical_cols = X.select_dtypes(include='object').columns
print("categorical cols", categorical_cols)

# Encode and cast in one step so the whole feature matrix is float.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True).astype(float)

print("Processed feature sample:")

X.head()



categorical cols Index(['Intake Type', 'Intake Condition', 'Animal Type', 'Sex upon Intake',
       'Breed', 'Color Category'],
      dtype='object')
Processed feature sample:


Unnamed: 0,Age in Weeks,Intake Type_Euthanasia Request,Intake Type_Owner Surrender,Intake Type_Public Assist,Intake Type_Stray,Intake Type_Wildlife,Intake Condition_Agonal,Intake Condition_Behavior,Intake Condition_Congenital,Intake Condition_Feral,...,Breed_Yorkshire Terrier/Soft Coated Wheaten Terrier,Breed_Yorkshire Terrier/Standard Poodle,Breed_Yorkshire Terrier/Standard Schnauzer,Breed_Yorkshire Terrier/Toy Poodle,Breed_Yorkshire Terrier/Yorkshire Terrier,Color Category_brown,Color Category_gray,Color Category_other,Color Category_tan_gold,Color Category_white
0,416.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,44.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,104.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,104.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,312.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np

# 1. Define the MLP with the chosen hyperparameters
mlp = MLPClassifier(
    solver='adam',
    max_iter=200,
    learning_rate_init=0.001,
    hidden_layer_sizes=(50,),
    alpha=0.001,
    activation='relu',
    random_state=42
)

# 2. Put the scaler and the MLP in one pipeline.
# cross_val_score then re-fits the scaler on the training part of each fold
# only. Scaling the full matrix up front would let the held-out rows
# influence the scaling statistics and bias the CV estimate.
# The pipeline is given the unscaled X, so no pre-scaled matrix is created.
scaler = StandardScaler()
model = make_pipeline(scaler, mlp)

# 3. Perform 5-fold cross-validation, scored by balanced accuracy
scores = cross_val_score(
    model,
    X,
    y_encoded,
    cv=5,
    scoring='balanced_accuracy',
    n_jobs=-1
)

# 4. Report results (mean ± standard deviation across the folds)
print(f"5‑Fold CV Balanced Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")
