<a href="https://colab.research.google.com/github/rohankhanna1928/ML_Model_for_Kaggle/blob/main/ML_Model_for_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Thapar Summer School Hackathon - III**

In [None]:
#Competition Link
https://www.kaggle.com/competitions/thapar-summer-school-2025-hack-iii/overview

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline

# Load datasets
train = pd.read_csv('/content/train.csv').drop_duplicates()
test = pd.read_csv('/content/test.csv').drop_duplicates()
sample_sub = pd.read_csv('/content/sample_submission.csv')

# Separate features and target
X = train.drop(columns=['output', 'Row#'])
y = train['output']

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.18, random_state=456, shuffle=True)

# Identify feature types
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features)
    ]
)

# Pipeline setup
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        n_estimators=2500,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=4,
        max_features=0.7,
        bootstrap=True,
        max_samples=0.6,
        criterion='absolute_error',
        random_state=42,
        n_jobs=-1,
        warm_start=True,
        verbose=0
    ))
])

# Train the model
model.fit(X_train, y_train)

# Evaluate performance
train_preds = model.predict(X_train)
train_r2 = r2_score(y_train, train_preds)
print(f"Training R²: {train_r2:.4f}")

val_preds = model.predict(X_val)
val_r2 = r2_score(y_val, val_preds)
print(f"Validation R²: {val_r2:.4f}")

# Predict on test set
test_features = test.drop(columns=['Row#'])
test_preds = model.predict(test_features)

# Prepare final submission to match sample_submission.csv
submission = sample_sub.copy()
submission.iloc[:, 1] = test_preds  # Assumes 2nd column is the target
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully.")

Training R²: 0.9387
Validation R²: 0.9225
Submission file 'submission.csv' created successfully.


 2536.55954309]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  submission.iloc[:, 1] = test_preds  # Assumes 2nd column is the target
