<a href="https://colab.research.google.com/github/sonjoy1s/ML/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import Ridge

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

!pip install catboost
from catboost import CatBoostRegressor

# ================================
# Load Data
# ================================
train_df = pd.read_csv("/content/train (1).csv")
test_df  = pd.read_csv("/content/test (1).csv")

# "Age" is the regression target. "id" is only a row identifier (it is used as
# the key of the submission file) and is removed from the test frame before
# prediction, so it must not be a training feature either: leaving it in X
# makes the fitted ColumnTransformer expect an "id" column and predict() fails
# with "ValueError: columns are missing: {'id'}".
# errors="ignore" keeps this working for training files that carry no "id".
X = train_df.drop(columns=["Age"]).drop(columns=["id"], errors="ignore")
y = train_df["Age"]

# ================================
# Split
# ================================
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# ================================
# Log Transform Target
# ================================
# The models are trained on log1p(Age); predictions are mapped back to the
# original scale with expm1 further down.
y_train_log = np.log1p(y_train)

# ================================
# Columns
# ================================
# Feature names are grouped by dtype: 64-bit integer/float columns are treated
# as numeric, object columns as categorical.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# ================================
# Preprocessing
# ================================
# Numeric features are standardised.
num_pipe = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical features become ordinal codes; categories that were not seen
# during fit are encoded as -1 instead of raising.
cat_pipe = Pipeline(
    steps=[
        (
            "encoder",
            OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
        ),
    ]
)

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)

# ================================
# Base Models
# ================================
# Four base regressors for the stack. Every model is seeded with 42 so repeated
# runs build the same learners.

# XGBoost: 800 boosting rounds, 80% row and column sampling per tree.
xgb = XGBRegressor(
    random_state=42,
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    colsample_bytree=0.8,
    subsample=0.8,
)

# LightGBM: same number of rounds and learning rate as XGBoost.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0 —
# confirm whether row bagging is actually intended here.
lgbm = LGBMRegressor(
    random_state=42,
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=31,
    colsample_bytree=0.8,
    subsample=0.8,
)

# CatBoost: verbose=0 silences per-iteration logging.
cat = CatBoostRegressor(
    random_state=42,
    iterations=800,
    learning_rate=0.05,
    depth=6,
    verbose=0,
)

# Random forest: 400 trees with depth capped at 12.
rf = RandomForestRegressor(random_state=42, n_estimators=400, max_depth=12)

# ================================
# Stacking
# ================================
# The base models' 5-fold cross-validated predictions feed a Ridge
# meta-learner; n_jobs=-1 asks for all available cores.
stack = StackingRegressor(
    estimators=[("xgb", xgb), ("lgbm", lgbm), ("cat", cat), ("rf", rf)],
    final_estimator=Ridge(alpha=1.0),
    n_jobs=-1,
    cv=5,
)

# ================================
# Pipeline
# ================================
# Preprocessing and the stacked model are chained so both are fitted together
# and applied identically at prediction time.
pipe = Pipeline(steps=[("prep", preprocess), ("model", stack)])

# ================================
# Train
# ================================
# Fit on the training split against the log-transformed target.
pipe.fit(X_train, y_train_log)

# ================================
# Validation
# ================================
# The pipeline predicts on the log1p scale; expm1 maps the values back to the
# original Age scale before scoring against the untransformed y_val.
val_pred = pipe.predict(X_val)
val_pred = np.expm1(val_pred)

print("R2:", r2_score(y_true=y_val, y_pred=val_pred))
print("MAE:", mean_absolute_error(y_true=y_val, y_pred=val_pred))
print("MSE:", mean_squared_error(y_true=y_val, y_pred=val_pred))

# ================================
# Train on FULL DATA
# ================================
# Refit the whole pipeline on every labelled row (train + validation split)
# before predicting the test set.
pipe.fit(X, np.log1p(y))

# ================================
# Submission
# ================================
# Select exactly the feature columns the pipeline was fitted on. Building the
# test features by dropping columns by name can diverge from X (for example
# dropping "id" while X still contains it), which makes the ColumnTransformer
# raise "ValueError: columns are missing". Indexing with X.columns keeps the
# two schemas in lockstep and raises a clear KeyError if the test file lacks a
# training feature.
X_sub = test_df[X.columns]

# Predictions are on the log1p scale; expm1 converts them back to Age.
pred_sub = np.expm1(pipe.predict(X_sub))

submission = pd.DataFrame({
    "id": test_df["id"],
    "Age": pred_sub
})

submission.to_csv("submission.csv", index=False)
print("submission.csv saved!")





R2: 0.6097447029659935
MAE: 1.3578539317602139
MSE: 4.1350181209539985


ValueError: columns are missing: {'id'}