In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.impute import KNNImputer
import lightgbm as lgb

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train = train.dropna(subset=["age_group"])

In [4]:
le = LabelEncoder()
train["age_group_encoded"] = le.fit_transform(train["age_group"])

In [5]:
features = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']
X = train[features]
y = train["age_group_encoded"]
X_test = test[features]

In [6]:
imputer = KNNImputer(n_neighbors=5)
X = imputer.fit_transform(X)
X_test = imputer.transform(X_test)

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
model = lgb.LGBMClassifier(
    class_weight='balanced',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)
model.fit(X_train, y_train)



[LightGBM] [Info] Number of positive: 263, number of negative: 1298
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 751
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [9]:
y_val_probs = model.predict_proba(X_val)[:, 1]



In [10]:
best_thresh, best_f1 = 0.5, 0
for t in np.arange(0.3, 0.7, 0.01):
    f1 = f1_score(y_val, (y_val_probs >= t).astype(int))
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print("Best Threshold:", best_thresh)
print("Best F1 Score:", best_f1)
print(classification_report(y_val, (y_val_probs >= best_thresh).astype(int), target_names=le.classes_))

Best Threshold: 0.6000000000000003
Best F1 Score: 0.3826086956521739
              precision    recall  f1-score   support

       Adult       0.91      0.88      0.89       340
      Senior       0.34      0.43      0.38        51

    accuracy                           0.82       391
   macro avg       0.63      0.65      0.64       391
weighted avg       0.84      0.82      0.83       391



In [11]:
test_probs = model.predict_proba(X_test)[:, 1]
test_preds = (test_probs >= best_thresh).astype(int)




In [12]:
submission = pd.DataFrame({
    "age_group": test_preds
})
submission.to_csv("submission.csv", index=False)

In [13]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>