In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hackathon-set/hacktest.csv
/kaggle/input/hackathon-set/hacktrain.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from scipy.stats import mode
from lightgbm import early_stopping, log_evaluation

# Load data
train_df = pd.read_csv("/kaggle/input/hackathon-set/hacktrain.csv")
test_df = pd.read_csv("/kaggle/input/hackathon-set/hacktest.csv")

train_df = train_df.drop(columns=["Unnamed: 0"])
test_df = test_df.drop(columns=["Unnamed: 0"])

le = LabelEncoder()
train_df["class"] = le.fit_transform(train_df["class"])

y = train_df["class"].values
X = train_df.drop(columns=["ID", "class"])
test_ids = test_df["ID"].values
test_X = test_df.drop(columns=["ID"])

X = X.fillna(X.mean())
test_X = test_X.fillna(X.mean())

def add_features(df):
    df = df.copy()
    df["mean"] = df.mean(axis=1)
    df["std"] = df.std(axis=1)
    df["max"] = df.max(axis=1)
    df["min"] = df.min(axis=1)
    df["range"] = df["max"] - df["min"]

    # Polynomial features (squares)
    for col in ["mean", "std", "max", "min", "range"]:
        df[f"{col}_sq"] = df[col] ** 2

    # Pairwise products (just a few, not all combos)
    df["mean_std"] = df["mean"] * df["std"]
    df["max_min"] = df["max"] * df["min"]
    df["mean_range"] = df["mean"] * df["range"]

    return df

X = add_features(X)
test_X = add_features(test_X)

X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)
test_X.columns = test_X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Training fold {fold+1}")
    model = LGBMClassifier(
        objective="multiclass",
        num_class=len(np.unique(y)),
        learning_rate=0.05,
        max_depth=8,
        num_leaves=31,
        n_estimators=500,
        min_child_samples=20,   # added parameter
        random_state=42
    )
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[
            early_stopping(stopping_rounds=20),
            log_evaluation(period=10)
        ]
    )

    preds = model.predict(test_X)
    test_preds.append(preds)

final_preds = mode(np.array(test_preds), axis=0).mode.flatten()
final_labels = le.inverse_transform(final_preds)

submission = pd.DataFrame({"ID": test_ids, "class": final_labels})
submission.to_csv("submission.csv", index=False)

print("✅ Submission file saved! Ready to submit.")


Training fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10200
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 40
[LightGBM] [Info] Start training from score -2.252308
[LightGBM] [Info] Start training from score -0.261568
[LightGBM] [Info] Start training from score -3.707807
[LightGBM] [Info] Start training from score -2.481787
[LightGBM] [Info] Start training from score -5.585999
[LightGBM] [Info] Start training from score -4.333236
Training until validation scores don't improve for 20 rounds
[10]	valid_0's multi_logloss: 0.404533
[20]	valid_0's multi_logloss: 0.29908
[30]	valid_0's multi_logloss: 0.24524
[40]	valid_0's multi_logloss: 0.216435
[50]	valid_0's multi_logloss: 0.200865
[60]	valid_0's multi_logloss: 0.190521
[70]	valid_0's multi_logloss: 0.186215
[80]	valid_0's multi_logloss: 0.18444
[