In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
import optuna

# === Helper: Simplify color into dark/light/mixed/other ===
def simplify_color(color):
    color = str(color).lower()
    light = ['white', 'cream', 'tan', 'yellow', 'fawn']
    dark = ['black', 'brown', 'chocolate', 'blue', 'gray', 'grey']
    if any(w in color for w in light) and any(w in color for w in dark):
        return 'mixed'
    elif any(w in color for w in light):
        return 'light'
    elif any(w in color for w in dark):
        return 'dark'
    else:
        return 'other'

# === Helper: Convert age to weeks ===
def age_to_days(age_str):
    if pd.isnull(age_str):
        return np.nan
    num, unit = age_str.split()
    num = int(num)
    unit = unit.lower()
    if 'day' in unit:
        return num
    elif 'week' in unit:
        return num * 7
    elif 'month' in unit:
        return num * 30
    elif 'year' in unit:
        return num * 365
    return np.nan

# === Load and preprocess train.csv ===
train_df = pd.read_csv("train.csv")

target_col = "Outcome Type"
id_col = "Id"

drop_cols = ['Outcome Time', 'Found Location', 'Date of Birth', 'Name', target_col, id_col]
X = train_df.drop(columns=drop_cols, errors='ignore')
y = train_df[target_col]

# Label encode target
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

# Feature engineering
X['Color Category'] = X['Color'].apply(simplify_color)
X.drop(columns=['Color'], inplace=True)

X['Age in Days'] = X['Age upon Intake'].apply(age_to_days)
X.drop(columns=['Age upon Intake'], inplace=True)

# X['Age Group'] = X['Age in Weeks'].apply(age_group)
# X = X.drop(columns=['Age in Weeks'])

# Categorize intake time into hour, weekday, and season (slightly improved accuracy by about .01%: .62 to .63)
X['Intake Time'] = pd.to_datetime(train_df['Intake Time'], errors='coerce')
X['Intake Hour'] = X['Intake Time'].dt.hour
X['Weekday'] = X['Intake Time'].dt.weekday
X['Season'] = X['Intake Time'].dt.month.map({
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall'
})

X.drop(columns=['Intake Time'], inplace=True, errors='ignore')

# Fill missing values
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].fillna("Unknown")
    else:
        X[col] = X[col].fillna(X[col].median())

# Convert categoricals to category dtype for LightGBM
categorical_cols = X.select_dtypes(include='object').columns.tolist()
for col in categorical_cols:
    X[col] = X[col].astype('category')

X.head()

Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color Category,Age in Days,Intake Hour,Weekday,Season
0,Stray,Normal,Dog,Spayed Female,English Springer Spaniel,light,2920.0,12,6,summer
1,Stray,Normal,Dog,Intact Male,Basenji Mix,light,330.0,18,3,spring
2,Public Assist,Normal,Cat,Neutered Male,Domestic Shorthair,other,730.0,0,3,spring
3,Owner Surrender,Normal,Dog,Neutered Male,Labrador Retriever Mix,dark,730.0,12,5,winter
4,Public Assist,Normal,Dog,Neutered Male,Great Dane Mix,dark,2190.0,9,1,spring


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score

# === Optuna tuning ===
def objective(trial):
    params = {
        'objective': 'multiclass',
        'num_class': len(le_y.classes_),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0),
        'random_state': 42,
        'n_estimators': 200,
        'verbosity': -1,
        'is_unbalance': True
    }
    model = LGBMClassifier(**params)

    # Make sure class balance is maintained in each fold: gave highest accuracy of .639
    # skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # score = cross_val_score(model, X, y_encoded, cv=skf, scoring='accuracy').mean()

    # Use balanced accuracy score instead of accuracy
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    for train_idx, val_idx in skf.split(X, y_encoded):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        score = balanced_accuracy_score(y_val, y_pred)
        scores.append(score)

    return np.mean(scores)

    # return score

In [30]:
print("Starting Optuna hyperparameter search...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(f"\nBest params: {study.best_params}")

# === Train best model on full training data ===
best_params = study.best_params
best_params.update({
    'objective': 'multiclass',
    'num_class': len(le_y.classes_),
    'random_state': 42,
    'n_estimators': 200,
    'is_unbalance': True
})

# best_model = LGBMClassifier(**best_params)
# best_model.fit(X, y_encoded)

# print("\nModel training complete and ready for prediction!")

from sklearn.model_selection import train_test_split

# Split off validation set
X_train_part, X_val_part, y_train_part, y_val_part = train_test_split(
    X, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42)

# Train on 80%
final_model = LGBMClassifier(**best_params)
final_model.fit(X_train_part, y_train_part)

# Evaluate on 20%
y_val_pred = final_model.predict(X_val_part)
val_balanced_acc = balanced_accuracy_score(y_val_part, y_val_pred)

print(f"\nFinal Balanced Accuracy on held-out validation set: {val_balanced_acc:.4f}")

[I 2025-04-15 20:32:50,432] A new study created in memory with name: no-name-45bcf973-1a1b-4bf8-a8a6-f17451723a61


Starting Optuna hyperparameter search...


[I 2025-04-15 20:33:07,102] Trial 0 finished with value: 0.6368109590864182 and parameters: {'learning_rate': 0.15351546880594447, 'max_depth': 11, 'num_leaves': 31, 'min_child_samples': 91, 'subsample': 0.9308653671796123, 'colsample_bytree': 0.9846328499239296, 'reg_alpha': 2.8265547986872637, 'reg_lambda': 0.6035500532463162}. Best is trial 0 with value: 0.6368109590864182.
[I 2025-04-15 20:33:28,056] Trial 1 finished with value: 0.638943083297139 and parameters: {'learning_rate': 0.10888902187401671, 'max_depth': 12, 'num_leaves': 35, 'min_child_samples': 15, 'subsample': 0.5722793460411375, 'colsample_bytree': 0.6396660286828952, 'reg_alpha': 2.6029015273092266, 'reg_lambda': 3.3897260782941148}. Best is trial 1 with value: 0.638943083297139.
[I 2025-04-15 20:33:46,706] Trial 2 finished with value: 0.6375126689567193 and parameters: {'learning_rate': 0.048968770538089, 'max_depth': 8, 'num_leaves': 40, 'min_child_samples': 98, 'subsample': 0.6433094321091599, 'colsample_bytree': 0


Best params: {'learning_rate': 0.10060123023174604, 'max_depth': 9, 'num_leaves': 62, 'min_child_samples': 23, 'subsample': 0.5006824942780478, 'colsample_bytree': 0.5935244944749666, 'reg_alpha': 0.4053472537289082, 'reg_lambda': 0.27630401721982417}

Final Balanced Accuracy on held-out validation set: 0.4146


In [31]:
# Apply the model to the test data
# === Load and preprocess test.csv ===
test_df = pd.read_csv("test.csv")
test_ids = test_df['Id']

# Drop columns not used in prediction
drop_cols_test = ['Found Location', 'Date of Birth', 'Id']
X_test = test_df.drop(columns=drop_cols_test, errors='ignore')

# Feature engineering (same as training)
X_test['Color Category'] = X_test['Color'].apply(simplify_color)
X_test.drop(columns=['Color'], inplace=True)

X_test['Age in Days'] = X_test['Age upon Intake'].apply(age_to_days)
X_test.drop(columns=['Age upon Intake'], inplace=True)

X_test['Intake Time'] = pd.to_datetime(test_df['Intake Time'], errors='coerce')
X_test['Intake Hour'] = X_test['Intake Time'].dt.hour
X_test['Weekday'] = X_test['Intake Time'].dt.weekday
X_test['Season'] = X_test['Intake Time'].dt.month.map({
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall'
})
X_test.drop(columns=['Intake Time'], inplace=True, errors='ignore')

# Fill missing values
for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = X_test[col].fillna("Unknown")
    else:
        X_test[col] = X_test[col].fillna(X_test[col].median())

# Ensure categorical types match training
for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')

X_test.head()


  X_test['Intake Time'] = pd.to_datetime(test_df['Intake Time'], errors='coerce')


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color Category,Age in Days,Intake Hour,Weekday,Season
0,Stray,Normal,Dog,Neutered Male,Beagle Mix,other,730,16,3,winter
1,Stray,Sick,Cat,Intact Female,Domestic Shorthair Mix,other,28,7,0,fall
2,Stray,Normal,Dog,Neutered Male,Doberman Pinsch/Australian Cattle Dog,mixed,1460,10,6,summer
3,Stray,Normal,Dog,Intact Female,Pit Bull,mixed,150,18,5,summer
4,Stray,Injured,Cat,Intact Female,Domestic Shorthair Mix,mixed,730,10,5,winter


In [32]:
# === Make predictions ===

y_test_pred = final_model.predict(X_test)
y_test_labels = le_y.inverse_transform(y_test_pred)

# === Create submission ===
submission_df = pd.DataFrame({
    'Id': test_ids,
    'Outcome Type': y_test_labels
})

submission_df.to_csv("submission4146.csv", index=False)
print("submission.csv created with test predictions!")

submission.csv created with test predictions!
