This file was another attempt with slightly different preprocessing and tuning techniques for LightGBM. It did not yield us results higher than our final model, so we didn't end up using it. This model used the breed and color bucketing. It was one of our later trials, and definitely a more efficient approach to hyperparameter tuning. 

Preprocessing functions

In [6]:
# PREPROCESSING

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier  # For KNN
import lightgbm as lgb
import warnings
import xgboost as xgb
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')

# === Helper Functions ===
def simplify_color(color_str):
    """
    Map raw color strings to a smaller set of standardized categories.
    This function checks for the presence of common color keywords.
    """
    if pd.isna(color_str):
        return "unknown"
    color_str = color_str.lower()
    if "black" in color_str:
        return "black"
    elif "brown" in color_str:
        return "brown"
    elif "white" in color_str:
        return "white"
    elif "tan" in color_str or "gold" in color_str:
        return "tan_gold"
    elif "grey" in color_str or "gray" in color_str:
        return "gray"
    else:
        return "other"

def age_to_weeks(age_str):
    """
    Convert an age string into estimated weeks.
    e.g., "2 years" becomes 104 weeks and "6 months" becomes about 24 weeks.
    """
    if pd.isna(age_str):
        return np.nan
    tokens = age_str.split()
    if len(tokens) < 2:
        return np.nan
    if 'year' in tokens[1]:
        return float(tokens[0]) * 52
    elif 'month' in tokens[1]:
        return float(tokens[0]) * 4
    elif 'week' in tokens[1]:
        return float(tokens[0])
    else:
        return np.nan
    

breed_size = {
    # SMALL (<20 lbs avg)
    'chihuahua':        'small',
    'pembroke welsh corgi':'small',
    'pug':               'small',
    'yorkshire terrier':'small',
    'dachshund':         'small',
    'pomeranian':        'small',
    'papillon':          'small',
    'shih tzu':          'small',
    'maltese':           'small',
    'rat terrier':       'small',
    'jack russell terrier':'small',
    'west highland white terrier':'small',
    # MEDIUM (20–50 lbs avg)
    'border collie':     'medium',
    'australian cattle dog':'medium',
    'beagle':            'medium',
    'boston terrier':    'medium',
    'cocker spaniel':    'medium',
    'cairn terrier':     'medium',
    'bichon frise':      'medium',
    'boston terrier':    'medium',
    'siberian husky':    'medium',
    # LARGE (50+ lbs avg)
    'labrador retriever':'large',
    'golden retriever':  'large',
    'german shepherd':   'large',
    'rottweiler':        'large',
    ' dob':              'large',  # for “doberman”
    'boxer':             'large',
    'great dane':        'large',
    'mastiff':           'large',
    'newfoundland':      'large',
    'bernese mountain dog':'large',
    'great pyrenees':    'large',
    'alaskan malamute':  'large',
    'cane corso':        'large',
    'doberman pinscher': 'large',
    'chow chow':         'large',
    # …add more breeds as needed…
}

# 2) Fallback keyword sets for truly rare / unlisted mixes
small_keys = ['chihuahua','toy','pomeranian','papillon','yorkshire','pug']
large_keys = ['mastiff','wolfhound','dane','newfoundland','retriever','shepherd','rottweiler','boxer','bulldog','malamute']

def size_from_breed(breed):
    b = breed.lower().replace('mix','').strip()
    # split on slashes
    components = [c.strip() for c in b.replace('/',',').split(',')]
    sizes = []
    for comp in components:
        # exact lookup?
        if comp in breed_size:
            sizes.append(breed_size[comp])
        else:
            # fallback to keyword scan
            if any(k in comp for k in small_keys):
                sizes.append('small')
            elif any(k in comp for k in large_keys):
                sizes.append('large')
            else:
                sizes.append('medium')
    # mixture takes the LARGEST of its parents
    if 'large'  in sizes: return 'large'
    if 'medium' in sizes: return 'medium'
    return 'small'

Preprocessing

In [7]:


# === 1. Load and Subsample Data ===
# For prototyping, we select a stratified random sample of ~40,000 rows
# to maintain the same target distribution as the full dataset.
df_full = pd.read_csv("train.csv")
# Option A: Using groupby with sample (ensures stratification)
# sample_fraction = 40000 / len(df_full)
# sample_df = df_full.groupby("Outcome Type", group_keys=False).apply(lambda x: x.sample(frac=sample_fraction, random_state=42))
# Option B (alternative): Using train_test_split's stratify option:
sample_df, _ = train_test_split(df_full, train_size=50000, stratify=df_full["Outcome Type"], random_state=42)

# === 2. Target & ID Setup ===
target_col = "Outcome Type"
id_col = "Id"
# Drop columns that are either high-leakage or not available in the test set.
# Here, we drop Outcome Time as well since it isn't present in the test data.
drop_cols = ['Found Location', 'Date of Birth', 'Name', target_col, id_col, 'Outcome Time']
X = df_full.drop(columns=drop_cols, errors='ignore')
y = df_full[target_col]

# === 3. Encode the Target Variable ===
# Although there are only five outcomes, encoding ensures compatibility with all models.
le_y = LabelEncoder()
y_encoded = le_y.fit_transform(y)

# === 4. Feature Engineering ===
# Simplify the 'Color' feature into standardized categories
X['Color Category'] = X['Color'].apply(simplify_color)
X = X.drop(columns=['Color'])

# Convert Age to Weeks
X['Age in Weeks'] = X['Age upon Intake'].apply(age_to_weeks)
X.drop(columns=['Age upon Intake'], inplace=True)

# Process Intake Time if necessary. Since Outcome Time is not used and not in test data,
# we drop Intake Time as well (or extract features if desired).
datetime_format = "%m/%d/%Y %I:%M:%S %p"
X['Intake Time'] = pd.to_datetime(X['Intake Time'], format=datetime_format, errors='coerce')
# If you wish to derive time-based features (like hour or day of week), do it here.
# For now, we drop it to ensure consistency with the test set.
X = X.drop(columns=['Intake Time'])



# --- target‑encode breed ###
breed_counts = df_full.groupby(['Breed','Outcome Type']).size()\
                      .unstack(fill_value=0)
breed_probs = breed_counts.div(breed_counts.sum(axis=1), axis=0)
global_probs = df_full['Outcome Type']\
                .value_counts(normalize=True).to_dict()

for outcome in breed_probs.columns:
    X[f'breed_prob_{outcome}'] = X['Breed'].map(breed_probs[outcome])\
                                        .fillna(global_probs[outcome])
X.drop(columns=['Breed'], inplace=True)

# … then your color bucketing, age, one‑hot on color category, etc. …

# Fill missing values: For object columns, fill with "Unknown"; for numeric, fill with median.
for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = X[col].fillna("Unknown")
    else:
        X[col] = X[col].fillna(X[col].median())



# # === 5. Encode Categorical Variables ===
categorical_cols = X.select_dtypes(include='object').columns
print("categorical cols", categorical_cols)
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


# print(X["Intake Type"].unique())
# print(X["Intake Condition"].unique())
# print(X["Animal Type"].unique())
# print(X["Breed"].unique())
# print(X["Color Category"].unique())
# Ensure all features are numeric
X = X.astype(float)

print("Processed feature sample:")

X.head()



categorical cols Index(['Intake Type', 'Intake Condition', 'Animal Type', 'Sex upon Intake',
       'Color Category'],
      dtype='object')
Processed feature sample:


Unnamed: 0,Age in Weeks,breed_prob_Adoption,breed_prob_Died,breed_prob_Euthanasia,breed_prob_Return to Owner,breed_prob_Transfer,Intake Type_Euthanasia Request,Intake Type_Owner Surrender,Intake Type_Public Assist,Intake Type_Stray,...,Animal Type_Dog,Sex upon Intake_Intact Male,Sex upon Intake_Neutered Male,Sex upon Intake_Spayed Female,Sex upon Intake_Unknown,Color Category_brown,Color Category_gray,Color Category_other,Color Category_tan_gold,Color Category_white
0,416.0,0.0,0.0,0.0,0.6,0.4,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,44.0,0.654545,0.0,0.0,0.2,0.145455,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,104.0,0.56419,0.016577,0.028917,0.033653,0.356662,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,104.0,0.552352,0.004775,0.022285,0.193314,0.227273,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,312.0,0.34,0.0,0.0,0.4,0.26,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:


# 1. Assign X_full and y_full for tuning
X_full = X.copy()
y_full = y_encoded.copy()


In [9]:
!pip install optuna

Defaulting to user installation because normal site-packages is not writeable


Running the Optuna tuning on the model. 

In [13]:
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import balanced_accuracy_score
# import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping


# 1) Pre-materialize your features once
# --------------------------------------------------
# (Assuming X_full, y_full are your preprocessed DataFrame/Series)
X_full_mat = X_full.values.astype(np.float32)
y_full_arr = y_full

# 2) Define Stratified 3‑fold splitter
# --------------------------------------------------
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)




def objective(trial):
    # sample hyperparameters
    params = {
        'boosting_type': 'gbdt',
        'objective':    'multiclass',
        'num_class':    len(le_y.classes_),
        'n_jobs':       -1,
        'num_leaves':   trial.suggest_int('num_leaves', 15, 127),
        'max_depth':    trial.suggest_int('max_depth', 3, 16),
        'learning_rate':    trial.suggest_loguniform('learning_rate', 1e-3, 0.3),
        'subsample':        trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'min_child_samples':trial.suggest_int('min_child_samples', 5, 100),
        'reg_lambda':       trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'verbose': -1      # silence internal logs
    }

    cv_scores = []
    for train_idx, valid_idx in skf.split(X_full_mat, y_full_arr):
        X_tr, X_val = X_full_mat[train_idx], X_full_mat[valid_idx]
        y_tr, y_val = y_full_arr[train_idx], y_full_arr[valid_idx]

        # inject params + n_estimators + silent mode
        clf = LGBMClassifier(**params, n_estimators=1000)

        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            callbacks=[early_stopping(stopping_rounds=50)]
        )

        preds = clf.predict(X_val)
        cv_scores.append(balanced_accuracy_score(y_val, preds))

    return sum(cv_scores) / len(cv_scores)




# 5) Build the study with a MedianPruner
# --------------------------------------------------
study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.MedianPruner(
        n_startup_trials=10,  # let 10 trials run to build median
        n_warmup_steps=0,
        interval_steps=1
    )
)

# 6) Run for up to 60 minutes (3600 sec) or 100 trials, whichever comes first
# --------------------------------------------------
study.optimize(objective, n_trials=100, timeout=2400)

print("Best trial:")
print(study.best_trial.params)
print("Best CV balanced accuracy:", study.best_value)


[I 2025-04-16 23:12:07,691] A new study created in memory with name: no-name-78bb93a0-098d-438a-b3a9-04af0cb07fbf


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.860821
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.868326
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.867246


[I 2025-04-16 23:17:59,270] Trial 0 finished with value: 0.384253869194326 and parameters: {'num_leaves': 73, 'max_depth': 10, 'learning_rate': 0.0023572195982527987, 'subsample': 0.9128722430506582, 'colsample_bytree': 0.562697722770656, 'min_child_samples': 96, 'reg_lambda': 8.49894773519296}. Best is trial 0 with value: 0.384253869194326.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.880333
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.886883
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.885755


[I 2025-04-16 23:22:33,720] Trial 1 finished with value: 0.37764626385954486 and parameters: {'num_leaves': 42, 'max_depth': 15, 'learning_rate': 0.0012459970191869751, 'subsample': 0.7789145720635944, 'colsample_bytree': 0.9161013397474682, 'min_child_samples': 45, 'reg_lambda': 0.018750468028232378}. Best is trial 0 with value: 0.384253869194326.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[532]	valid_0's multi_logloss: 0.841194
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[438]	valid_0's multi_logloss: 0.849687
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[433]	valid_0's multi_logloss: 0.849164


[I 2025-04-16 23:23:35,094] Trial 2 finished with value: 0.40433068308146414 and parameters: {'num_leaves': 82, 'max_depth': 4, 'learning_rate': 0.03781276602727258, 'subsample': 0.6265458956868566, 'colsample_bytree': 0.8896544944005711, 'min_child_samples': 21, 'reg_lambda': 0.3112995658918904}. Best is trial 2 with value: 0.40433068308146414.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	valid_0's multi_logloss: 0.839924
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	valid_0's multi_logloss: 0.84974
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	valid_0's multi_logloss: 0.848261


[I 2025-04-16 23:23:54,390] Trial 3 finished with value: 0.41197995057628817 and parameters: {'num_leaves': 75, 'max_depth': 8, 'learning_rate': 0.14526045233692922, 'subsample': 0.7275549031684927, 'colsample_bytree': 0.800324833803508, 'min_child_samples': 52, 'reg_lambda': 0.8514175545663637}. Best is trial 3 with value: 0.41197995057628817.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[452]	valid_0's multi_logloss: 0.839233
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[460]	valid_0's multi_logloss: 0.848957
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[461]	valid_0's multi_logloss: 0.847465


[I 2025-04-16 23:25:52,282] Trial 4 finished with value: 0.41047899812516436 and parameters: {'num_leaves': 62, 'max_depth': 8, 'learning_rate': 0.014887356208053888, 'subsample': 0.9145099324425869, 'colsample_bytree': 0.7464286260559454, 'min_child_samples': 90, 'reg_lambda': 0.8152888863943535}. Best is trial 3 with value: 0.41197995057628817.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.845529
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.854225
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.852835


[I 2025-04-16 23:33:23,077] Trial 5 finished with value: 0.4059664864426534 and parameters: {'num_leaves': 97, 'max_depth': 11, 'learning_rate': 0.002772310936430767, 'subsample': 0.9254561017651506, 'colsample_bytree': 0.7020494999712577, 'min_child_samples': 39, 'reg_lambda': 0.03320649613597291}. Best is trial 3 with value: 0.41197995057628817.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[98]	valid_0's multi_logloss: 0.840081
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[101]	valid_0's multi_logloss: 0.84957
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[115]	valid_0's multi_logloss: 0.848477


[I 2025-04-16 23:33:53,243] Trial 6 finished with value: 0.4087687611813194 and parameters: {'num_leaves': 104, 'max_depth': 7, 'learning_rate': 0.06948876962108243, 'subsample': 0.9819748147394038, 'colsample_bytree': 0.7549787946679074, 'min_child_samples': 88, 'reg_lambda': 0.050096537187753025}. Best is trial 3 with value: 0.41197995057628817.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[72]	valid_0's multi_logloss: 0.840273
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[74]	valid_0's multi_logloss: 0.850075
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[60]	valid_0's multi_logloss: 0.848893


[I 2025-04-16 23:34:09,937] Trial 7 finished with value: 0.40987323544851356 and parameters: {'num_leaves': 111, 'max_depth': 5, 'learning_rate': 0.1900119587656553, 'subsample': 0.8415029095462363, 'colsample_bytree': 0.7796653248301413, 'min_child_samples': 44, 'reg_lambda': 1.754853349531811}. Best is trial 3 with value: 0.41197995057628817.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[158]	valid_0's multi_logloss: 0.842096
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[157]	valid_0's multi_logloss: 0.851004
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[147]	valid_0's multi_logloss: 0.850727


[I 2025-04-16 23:35:28,119] Trial 8 finished with value: 0.4126591546668496 and parameters: {'num_leaves': 122, 'max_depth': 14, 'learning_rate': 0.02488190246631083, 'subsample': 0.6089319175272458, 'colsample_bytree': 0.8590622623545716, 'min_child_samples': 44, 'reg_lambda': 0.003622933097101497}. Best is trial 8 with value: 0.4126591546668496.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[88]	valid_0's multi_logloss: 0.839214
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[85]	valid_0's multi_logloss: 0.849407
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[86]	valid_0's multi_logloss: 0.848042


[I 2025-04-16 23:35:53,490] Trial 9 finished with value: 0.4119381399929949 and parameters: {'num_leaves': 36, 'max_depth': 11, 'learning_rate': 0.07865752348687625, 'subsample': 0.9290916242564145, 'colsample_bytree': 0.7206818110753674, 'min_child_samples': 89, 'reg_lambda': 0.004366567030366397}. Best is trial 8 with value: 0.4126591546668496.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[324]	valid_0's multi_logloss: 0.841165
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[317]	valid_0's multi_logloss: 0.849816
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[315]	valid_0's multi_logloss: 0.849671


[I 2025-04-16 23:38:15,690] Trial 10 finished with value: 0.4142831661274733 and parameters: {'num_leaves': 122, 'max_depth': 16, 'learning_rate': 0.012295555988009719, 'subsample': 0.5016370896820155, 'colsample_bytree': 0.9564976280542989, 'min_child_samples': 12, 'reg_lambda': 0.0010792826300803577}. Best is trial 10 with value: 0.4142831661274733.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[344]	valid_0's multi_logloss: 0.8413
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[336]	valid_0's multi_logloss: 0.85022
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[332]	valid_0's multi_logloss: 0.849598


[I 2025-04-16 23:40:48,672] Trial 11 finished with value: 0.41412663012732026 and parameters: {'num_leaves': 127, 'max_depth': 16, 'learning_rate': 0.011454586652525828, 'subsample': 0.5072554689981483, 'colsample_bytree': 0.9867791919587336, 'min_child_samples': 10, 'reg_lambda': 0.0011713898274647993}. Best is trial 10 with value: 0.4142831661274733.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[435]	valid_0's multi_logloss: 0.841406
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[438]	valid_0's multi_logloss: 0.850089
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[418]	valid_0's multi_logloss: 0.849405


[I 2025-04-16 23:44:11,270] Trial 12 finished with value: 0.4129303440839353 and parameters: {'num_leaves': 127, 'max_depth': 16, 'learning_rate': 0.009049837387586308, 'subsample': 0.5081600208317252, 'colsample_bytree': 0.9952429059618526, 'min_child_samples': 6, 'reg_lambda': 0.0010837441088158038}. Best is trial 10 with value: 0.4142831661274733.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[573]	valid_0's multi_logloss: 0.840107
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[580]	valid_0's multi_logloss: 0.848823
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[553]	valid_0's multi_logloss: 0.848528


[I 2025-04-16 23:48:40,445] Trial 13 finished with value: 0.4134504777995816 and parameters: {'num_leaves': 97, 'max_depth': 13, 'learning_rate': 0.007244193069973117, 'subsample': 0.5064923839465701, 'colsample_bytree': 0.9987648142593202, 'min_child_samples': 11, 'reg_lambda': 0.0012951552897274947}. Best is trial 10 with value: 0.4142831661274733.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.842275
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.851693
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's multi_logloss: 0.850651


[I 2025-04-16 23:51:45,470] Trial 14 finished with value: 0.4049047930619114 and parameters: {'num_leaves': 15, 'max_depth': 13, 'learning_rate': 0.005892365443801871, 'subsample': 0.58582704808971, 'colsample_bytree': 0.9348504410951178, 'min_child_samples': 25, 'reg_lambda': 0.0069971879901187615}. Best is trial 10 with value: 0.4142831661274733.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[258]	valid_0's multi_logloss: 0.841165
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[242]	valid_0's multi_logloss: 0.85111
Training until validation scores don't improve for 50 rounds


[W 2025-04-16 23:53:40,268] Trial 15 failed with parameters: {'num_leaves': 117, 'max_depth': 16, 'learning_rate': 0.01774142981741026, 'subsample': 0.6975965711505208, 'colsample_bytree': 0.6290678448742418, 'min_child_samples': 28, 'reg_lambda': 0.011656179632202137} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\saifk\AppData\Roaming\Python\Python39\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\saifk\AppData\Local\Temp\ipykernel_35356\705198822.py", line 48, in objective
    clf.fit(
  File "C:\Users\saifk\AppData\Roaming\Python\Python39\site-packages\lightgbm\sklearn.py", line 1560, in fit
    super().fit(
  File "C:\Users\saifk\AppData\Roaming\Python\Python39\site-packages\lightgbm\sklearn.py", line 1049, in fit
    self._Booster = train(
  File "C:\Users\saifk\AppData\Roaming\Python\Python39\site-packages\lightgbm\engine.py", line 322, in train
    boos

KeyboardInterrupt: 

Late attempt to train the model on the full data set after optuna used just a smaller subset, but ran out of time.

In [15]:
best_model = lgb.LGBMClassifier()
best_model.fit(X_train_full, y_train_full)

NameError: name 'X_train_full' is not defined

Adjusting the test data so that the model could work on it.

In [16]:
# Apply the model to the test data
# === Load and preprocess test.csv ===
test_df = pd.read_csv("test.csv")
test_ids = test_df['Id']

# Drop columns not used in prediction
drop_cols_test = ['Found Location', 'Date of Birth', 'Id']
X_test = test_df.drop(columns=drop_cols_test, errors='ignore')

# Feature engineering (same as training)
X_test['Color Category'] = X_test['Color'].apply(simplify_color)
X_test.drop(columns=['Color'], inplace=True)

X_test['Age in Days'] = X_test['Age upon Intake'].apply(age_to_days)
X_test.drop(columns=['Age upon Intake'], inplace=True)

X_test['Intake Time'] = pd.to_datetime(test_df['Intake Time'], errors='coerce')
X_test['Intake Hour'] = X_test['Intake Time'].dt.hour
X_test['Weekday'] = X_test['Intake Time'].dt.weekday
X_test['Season'] = X_test['Intake Time'].dt.month.map({
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall'
})
X_test.drop(columns=['Intake Time'], inplace=True, errors='ignore')

# Fill missing values
for col in X_test.columns:
    if X_test[col].dtype == 'object':
        X_test[col] = X_test[col].fillna("Unknown")
    else:
        X_test[col] = X_test[col].fillna(X_test[col].median())

# Ensure categorical types match training
for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')

X_test.head()


FileNotFoundError: [Errno 2] No such file or directory: 'test.csv'

Run the test data through the model.

In [None]:
# === Make predictions ===

y_test_pred = best_model.predict(X_test)
y_test_labels = le_y.inverse_transform(y_test_pred)

# === Create submission ===
submission_df = pd.DataFrame({
    'Id': test_ids,
    'Outcome Type': y_test_labels
})

submission_df.to_csv("submission4149.csv", index=False)
print("submission.csv created with test predictions!")