In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_predict
from xgboost import XGBClassifier
import optuna
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Helper: Simplify color into dark/light/mixed/other 
def simplify_color(color):
    color = str(color).lower()
    light = ['white', 'cream', 'tan', 'yellow', 'fawn']
    dark = ['black', 'brown', 'chocolate', 'blue', 'gray', 'grey']
    if any(w in color for w in light) and any(w in color for w in dark):
        return 'mixed'
    elif any(w in color for w in light):
        return 'light'
    elif any(w in color for w in dark):
        return 'dark'
    else:
        return 'other'

# Helper: Convert age to days 
def age_to_days(age_str):
    if pd.isnull(age_str):
        return np.nan
    num, unit = age_str.split()
    num = int(num)
    unit = unit.lower()
    if 'day' in unit:
        return num
    elif 'week' in unit:
        return num * 7
    elif 'month' in unit:
        return num * 30
    elif 'year' in unit:
        return num * 365
    return np.nan

# Load and preprocess train.csv
train_df = pd.read_csv("train.csv")

# Get the target and id column
target_col = "Outcome Type"
id_col = "Id"

# drop unneeded columns
drop_cols = ['Outcome Time', 'Found Location', 'Date of Birth', 'Name', target_col, id_col]
X = train_df.drop(columns=drop_cols, errors='ignore')
y = train_df[target_col]

# Label encode target
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

# Feature engineering
X['Color Category'] = X['Color'].apply(simplify_color)
X.drop(columns=['Color'], inplace=True)

X['Age in Days'] = X['Age upon Intake'].apply(age_to_days)
X.drop(columns=['Age upon Intake'], inplace=True)

# DID NOT USE: this would result in "0" for animals less than 1wk old
# X['Age Group'] = X['Age in Weeks'].apply(age_group)
# X = X.drop(columns=['Age in Weeks'])

# Categorize intake time into hour, weekday, and season (slightly improved accuracy by about .01%: .62 to .63)
X['Intake Time'] = pd.to_datetime(train_df['Intake Time'], errors='coerce')
X['Intake Hour'] = X['Intake Time'].dt.hour
X['Weekday'] = X['Intake Time'].dt.weekday
X['Season'] = X['Intake Time'].dt.month.map({
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall'
})

# Drop intake time after using to create new columns
X.drop(columns=['Intake Time'], inplace=True, errors='ignore')

# Fill missing values
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].fillna("Unknown")
    else:
        X[col] = X[col].fillna(X[col].median())

# Convert categoricals to category dtype for LightGBM
categorical_cols = X.select_dtypes(include='object').columns.tolist()
for col in categorical_cols:
    X[col] = X[col].astype('category')

# Convert all categorical columns to numeric codes for XGBoost
X_numeric = X.copy()
for col in X_numeric.select_dtypes(include='category').columns:
    X_numeric[col] = X_numeric[col].cat.codes

X.head()
X_numeric.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color Category,Age in Days,Intake Hour,Weekday,Season
0,4,11,1,3,1186,1,2920.0,12,6,2
1,4,11,1,1,281,1,330.0,18,3,1
2,3,11,0,2,1125,3,730.0,0,3,1
3,2,11,1,2,1488,0,730.0,12,5,3
4,3,11,1,2,1329,0,2190.0,9,1,1


In [None]:
import xgboost as xgb

# objective function for the lgbm 
def lgbm_objective(trial):
    params = {
        'objective': 'multiclass',
        'num_class': len(le_y.classes_),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0),
        'random_state': 42,
        'n_estimators': 200,
        'verbosity': -1,
        'is_unbalance': True
    }

    # do a kfold cross validation and return the balanced accuracy score
    model = LGBMClassifier(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_preds = cross_val_predict(model, X, y_encoded, cv=skf, method="predict")
    return balanced_accuracy_score(y_encoded, y_preds)

# objective function for xgb
def xgb_objective(trial):
    params = {
        'objective': 'multi:softprob',
        'num_class': len(le_y.classes_),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0),
        'random_state': 42,
        'n_estimators': 200,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss'
    }
    
    # do a kfold cross validation and return the balanced accuracy score
    model = XGBClassifier(**params)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_preds = cross_val_predict(model, X_numeric, y_encoded, cv=skf, method="predict")
    return balanced_accuracy_score(y_encoded, y_preds)

In [None]:
# use optuna to adjust hyperparameters for lgbm
study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(lgbm_objective, n_trials=30)

# get the model with highest accuracy
best_lgbm_params = study_lgbm.best_params
best_lgbm_params.update({
    'objective': 'multiclass',
    'num_class': len(le_y.classes_),
    'random_state': 42,
    'n_estimators': 200,
    'verbosity': -1,
    'is_unbalance': True
})

model_lgbm = LGBMClassifier(**best_lgbm_params)

[I 2025-04-15 23:53:30,091] A new study created in memory with name: no-name-16d1ee6e-79ca-4bec-b9e3-f9364828569a
[I 2025-04-15 23:53:37,214] Trial 0 finished with value: 0.3984862277002101 and parameters: {'learning_rate': 0.1487388665567224, 'max_depth': 3, 'num_leaves': 54, 'min_child_samples': 47, 'subsample': 0.6064490309479724, 'colsample_bytree': 0.6446360917691923, 'reg_alpha': 4.617543554758913, 'reg_lambda': 2.449597336919772}. Best is trial 0 with value: 0.3984862277002101.
[I 2025-04-15 23:53:56,391] Trial 1 finished with value: 0.40543215024488166 and parameters: {'learning_rate': 0.0497386308741707, 'max_depth': 7, 'num_leaves': 57, 'min_child_samples': 45, 'subsample': 0.7967649129320018, 'colsample_bytree': 0.7242924997376072, 'reg_alpha': 2.6508938749420623, 'reg_lambda': 2.670885885317549}. Best is trial 1 with value: 0.40543215024488166.
[I 2025-04-15 23:54:09,111] Trial 2 finished with value: 0.4058292541874081 and parameters: {'learning_rate': 0.10037797474064049, 

In [None]:
# adjust hyperparameters for xgb model
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(xgb_objective, n_trials=30)

# get the model with highest accuracy
best_xgb_params = study_xgb.best_params
best_xgb_params.update({
    'objective': 'multi:softprob',
    'num_class': len(le_y.classes_),
    'random_state': 42,
    'n_estimators': 200,
    'use_label_encoder': False,
    'eval_metric': 'mlogloss'
})

model_xgb = XGBClassifier(**best_xgb_params)

[I 2025-04-16 00:02:14,700] A new study created in memory with name: no-name-332d724f-662e-4a67-8000-3b750d92d591
[I 2025-04-16 00:02:29,718] Trial 0 finished with value: 0.4094631820171082 and parameters: {'learning_rate': 0.09232834482440931, 'max_depth': 12, 'subsample': 0.9813988285780086, 'colsample_bytree': 0.5896194484195714, 'reg_alpha': 2.2260006045983993, 'reg_lambda': 4.227364319048787}. Best is trial 0 with value: 0.4094631820171082.
[I 2025-04-16 00:02:37,085] Trial 1 finished with value: 0.4021488606296394 and parameters: {'learning_rate': 0.1956870368408444, 'max_depth': 5, 'subsample': 0.7840069465387474, 'colsample_bytree': 0.8149256362061087, 'reg_alpha': 3.4185989706672153, 'reg_lambda': 4.200436640566577}. Best is trial 0 with value: 0.4094631820171082.
[I 2025-04-16 00:02:44,243] Trial 2 finished with value: 0.3786705545284648 and parameters: {'learning_rate': 0.02861073136024856, 'max_depth': 5, 'subsample': 0.7942550771833077, 'colsample_bytree': 0.67064758254568

In [None]:
from sklearn.model_selection import train_test_split

# split the data to training and test set
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)
X_train_numeric, X_val_numeric = X_numeric.loc[X_train.index], X_numeric.loc[X_val.index]

# train both models using the training set
model_lgbm.fit(X_train, y_train)
model_xgb.fit(X_train_numeric, y_train)

# Ensemble with soft voting
probs_lgbm = model_lgbm.predict_proba(X_val)
probs_xgb  = model_xgb.predict_proba(X_val_numeric)

# Equal-weight ensemble
avg_probs = (probs_lgbm + probs_xgb) / 2
y_pred_ensemble = np.argmax(avg_probs, axis=1)
val_score_ensemble = balanced_accuracy_score(y_val, y_pred_ensemble)

print("Balanced Accuracy:", val_score_ensemble)

# Weighted ensemble to improve accuracy
avg_probs = (0.7 * probs_xgb + 0.3 * probs_lgbm)
y_pred_weighted = np.argmax(avg_probs, axis=1)
val_score_weighted = balanced_accuracy_score(y_val, y_pred_weighted)

print("Balanced Accuracy (Weighted Ensemble):", val_score_weighted)

Balanced Accuracy: 0.4237261841269383
Balanced Accuracy (Weighted Ensemble): 0.4247346533991651


In [15]:
# Apply the model to the test data
# === Load and preprocess test.csv ===
test_df = pd.read_csv("test.csv")
test_ids = test_df['Id']

# Drop columns not used in prediction
drop_cols_test = ['Found Location', 'Date of Birth', 'Id']
X_test = test_df.drop(columns=drop_cols_test, errors='ignore')

# Feature engineering (same as training)
X_test['Color Category'] = X_test['Color'].apply(simplify_color)
X_test.drop(columns=['Color'], inplace=True)

X_test['Age in Days'] = X_test['Age upon Intake'].apply(age_to_days)
X_test.drop(columns=['Age upon Intake'], inplace=True)

X_test['Intake Time'] = pd.to_datetime(test_df['Intake Time'], errors='coerce')
X_test['Intake Hour'] = X_test['Intake Time'].dt.hour
X_test['Weekday'] = X_test['Intake Time'].dt.weekday
X_test['Season'] = X_test['Intake Time'].dt.month.map({
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall'
})
X_test.drop(columns=['Intake Time'], inplace=True, errors='ignore')

# Fill missing values
for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = X_test[col].fillna("Unknown")
    else:
        X_test[col] = X_test[col].fillna(X_test[col].median())

# Ensure categorical types match training
for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')

# Convert all categorical columns to numeric codes for XGBoost
X_test_numeric = X_test.copy()
for col in X_test_numeric.select_dtypes(include='category').columns:
    X_test_numeric[col] = X_test_numeric[col].cat.codes

X_test.head()


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Breed,Color Category,Age in Days,Intake Hour,Weekday,Season
0,Stray,Normal,Dog,Neutered Male,Beagle Mix,other,730,16,3,winter
1,Stray,Sick,Cat,Intact Female,Domestic Shorthair Mix,other,28,7,0,fall
2,Stray,Normal,Dog,Neutered Male,Doberman Pinsch/Australian Cattle Dog,mixed,1460,10,6,summer
3,Stray,Normal,Dog,Intact Female,Pit Bull,mixed,150,18,5,summer
4,Stray,Injured,Cat,Intact Female,Domestic Shorthair Mix,mixed,730,10,5,winter


In [None]:
# predict on the test
probs_lgbm_test = model_lgbm.predict_proba(X_test)
probs_xgb_test  = model_xgb.predict_proba(X_test_numeric)

# for weighted ensemble
avg_probs_test = (
    0.7 * probs_xgb_test +
    0.3 * probs_lgbm_test
)

# equal weight ensemble
# avg_probs_test = (probs_lgbm_test + probs_xgb_test) / 2

# predict the labels
y_test_pred = np.argmax(avg_probs_test, axis=1)
y_test_labels = le_y.inverse_transform(y_test_pred)

# create the csv file
submission = pd.DataFrame({'Id': test_ids, 'Outcome Type': y_test_labels})
submission.to_csv("submission_ensemble.csv", index=False)
