Findings and Insights

1. Best Model: XGBoost with parameters (`n_estimators=1200`, `learning_rate=0.003`, `max_depth=13`, `subsample=0.85`, `colsample_bytree=0.9`) gave the best performance, achieving the lowest RMSE on the validation data.

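2. Data Transformation: Standard scaling of the numerical features and one-hot encoding of the categorical features (after imputing missing values) improved model accuracy by putting the data into a form the model could use effectively.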

3. Hyperparameters: A small learning rate combined with a high `n_estimators` (many boosting rounds) helped fine-tune the model.

- XGBoost was the best-performing model, benefiting from preprocessing and hyperparameter optimization.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

# Input locations: absolute paths to the competition CSV files
train_file_path = 'C:\\Users\\DELL\\Downloads\\iml-fall-2024-challenge-2\\train\\train.csv'
test_file_path = 'C:\\Users\\DELL\\Downloads\\iml-fall-2024-challenge-2\\test\\test.csv'
submission_template_path = 'C:\\Users\\DELL\\Downloads\\iml-fall-2024-challenge-2\\sample_submission.csv'

# Read the three CSV files (train, test, submission template) into
# DataFrames, in that order
train_data, test_data, submission_template = (
    pd.read_csv(csv_path)
    for csv_path in (train_file_path, test_file_path, submission_template_path)
)

# Define target column; every other column of the training file is a feature
TARGET_COL = 'price_doc'
FEATURE_COLS = train_data.columns.drop(TARGET_COL)

# Separate features and target in training data
X = train_data[FEATURE_COLS]
y = train_data[TARGET_COL]

# Identify categorical and numerical columns.
# 'number' matches every numeric dtype (int32, float32, unsigned ints, ...),
# not only int64/float64. The ColumnTransformer below keeps just the columns
# listed here (its default is remainder='drop'), so a numeric column stored
# with another width would otherwise be removed from the model silently.
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(include='number').columns

# Preprocessing for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing numerical values with median
    ('scaler', StandardScaler())  # Standardize numerical data
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Fill missing categorical values with mode
    # handle_unknown='ignore': categories not seen during fit are encoded as
    # all zeros at transform time instead of raising an error
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical data
])

# Combine preprocessors: each pipeline is applied to its own column subset
# and the results are concatenated into a single feature matrix
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Hold out 20% of the rows for validation (fixed seed so the split is repeatable)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessor on the training split only and transform that split
# in the same step
print("Fitting the preprocessor...")
X_train_transformed = preprocessor.fit_transform(X_train)

# Apply the already-fitted preprocessor to the validation split
X_val_transformed = preprocessor.transform(X_val)

# Apply it to the test set too, restricted to the training feature columns
test_transformed = preprocessor.transform(test_data[FEATURE_COLS])

# Hyperparameters of the XGBoost regressor
params = {'n_estimators': 1200, 'learning_rate': 0.003, 'max_depth': 13, 'subsample': 0.85, 'colsample_bytree': 0.9}

# Train the model. The dict is unpacked directly so the model is always built
# with exactly the values stored in `params` (no hand-copied keyword list
# that could fall out of sync with it).
model = XGBRegressor(**params, random_state=42)
model.fit(X_train_transformed, y_train)

# Evaluate the model on the held-out validation split
val_predictions = model.predict(X_val_transformed)
val_mse = mean_squared_error(y_val, val_predictions)
# RMSE is the square root of the MSE, so it is expressed in the same units
# as the target; report it alongside the MSE
val_rmse = val_mse ** 0.5
print(f"Validation MSE: {val_mse}")
print(f"Validation RMSE: {val_rmse}")

# Predict on the test data
test_predictions = model.predict(test_transformed)

# Save submission file: copy the template and fill the target column with the
# predictions. TARGET_COL is used instead of repeating the string literal so
# the column written here cannot drift from the one used for training.
submission = submission_template.copy()
submission[TARGET_COL] = test_predictions
submission_file_path = './submission.csv'
submission.to_csv(submission_file_path, index=False)  # index=False: no extra index column
print(f"Submission file saved to: {submission_file_path}")

# Log experiment details
# Number of columns of the preprocessed training matrix
num_features = X_train_transformed.shape[1]
print("Experiment details:")
print(f" - Parameters: {params}")
print(f" - Number of features: {num_features}\n")
