In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
import optuna
import os

# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

# Separate features and target
X = train.drop(columns=["price_doc", "id"], errors='ignore')
y = train["price_doc"]

# Handle outliers
q1 = y.quantile(0.01)
q99 = y.quantile(0.99)
outlier_mask = (y >= q1) & (y <= q99)
X = X[outlier_mask]
y = y[outlier_mask]

# Impute missing values
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical imputation
num_imputer = SimpleImputer(strategy='median')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])
test[numerical_features] = num_imputer.transform(test[numerical_features])

# Categorical encoding and imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])
test[categorical_features] = cat_imputer.transform(test[categorical_features])

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Feature Scaling
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# Select top 190 features
xgb_fs_model = xgb.XGBRegressor(objective='reg:squarederror', n_jobs=-1, random_state=42, n_estimators=100)
xgb_fs_model.fit(X, y)
feature_importances = pd.Series(xgb_fs_model.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(190).index
X = X[top_features]
test = test[top_features]

# Train-Test Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Optuna Objective Function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }
    model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', n_jobs=-1, random_state=42, **params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="rmse", early_stopping_rounds=50, verbose=False)
    val_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    return rmse

# Optuna Study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Best Parameters
best_params = study.best_params
print("Best Parameters:", best_params)

# Train Best Model
best_model = xgb.XGBRegressor(objective='reg:squarederror', tree_method='hist', n_jobs=-1, random_state=42, **best_params)
best_model.fit(X, y)

# Predict on Test Data
test_predictions = best_model.predict(test)

# Prepare Submission Directory
if not os.path.exists("submissions6"):
    os.makedirs("submissions6")

# Generate Submission File
submission = sample_submission.copy()
submission["price_doc"] = test_predictions
submission_file = f"submissions6/submission_xgb_190_features.csv"
submission.to_csv(submission_file, index=False)
print(f"Submission saved as {submission_file}")


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-27 20:49:13,731] A new study created in memory with name: no-name-9349ec8c-b1c7-486d-9359-c5d9ab544d9a
[I 2024-11-27 20:49:29,610] Trial 0 finished with value: 11608351.065571614 and parameters: {'n_estimators': 397, 'max_depth': 9, 'learning_rate': 0.04800532946310139, 'subsample': 0.7969537066610024, 'colsample_bytree': 0.8558915101132738, 'gamma': 1.2887412562415324, 'reg_alpha': 0.7121785891873489, 'reg_lambda': 0.722152533784593}. Best is trial 0 with value: 11608351.065571614.
[I 2024-11-27 20:49:37,503] Trial 1 finished with value: 11675534.886925774 and parameters: {'n_estimators': 727, 'max_depth': 4, 'learning_rate': 0.042787949390670765, 'subsample': 0.627508423575198, 'colsample_bytree': 0.8213034578526898, 'gamma': 0.25135814888992636, 'reg_alpha': 0.6523703700268305, 'reg_lambda': 0.17581333225429907}. Best is trial 0 with value: 11608351.065571614.
[I 2024-11-27 20:49:56,124] Trial 2 finished with value: 116066

Best Parameters: {'n_estimators': 857, 'max_depth': 14, 'learning_rate': 0.006893711269294302, 'subsample': 0.9194600447921879, 'colsample_bytree': 0.6944684003332068, 'gamma': 4.611505781924122, 'reg_alpha': 0.6254836368325944, 'reg_lambda': 0.13568523146983164}
Submission saved as submissions6/submission_xgb_190_features.csv
