In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
import optuna
import os

# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Separate features and target
X = train.drop(columns=["price_doc", "id"], errors='ignore')
y = train["price_doc"]

# Handle Outliers (Example: Remove extreme price outliers)
q1 = y.quantile(0.01)
q99 = y.quantile(0.99)
outlier_mask = (y >= q1) & (y <= q99)
X = X[outlier_mask]
y = y[outlier_mask]

# Impute missing values
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical imputation
num_imputer = SimpleImputer(strategy='median')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])
test[numerical_features] = num_imputer.transform(test[numerical_features])

# Categorical encoding and imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])
test[categorical_features] = cat_imputer.transform(test[categorical_features])

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Feature Scaling
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# Feature Selection using XGBoost
xgb_fs_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    n_estimators=100  # Use a moderate number of trees for feature selection
)
xgb_fs_model.fit(X, y)

# Get feature importance scores
feature_importances = pd.Series(xgb_fs_model.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(30).index  # Select top 30 features
print("Top Features Selected:", top_features.tolist())

# Keep only the top features
X = X[top_features]
test = test[top_features]

# Train-Test Split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Optuna Objective Function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
    }

    model = xgb.XGBRegressor(
        **params,
        objective='reg:squarederror',
        tree_method='hist',
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Validate the model
    val_predictions = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
    return val_rmse

# Run Optuna Optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=500)

# Best Parameters
best_params = study.best_params
best_rmse = study.best_value
print("Best Parameters:", best_params)
print(f"Best Validation RMSE: {best_rmse:.4f}")

# Train the Best Model on Full Training Data
best_model = xgb.XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    tree_method='hist',
    n_jobs=-1,
    random_state=42
)
best_model.fit(X, y)

# Predict on Test Data
test_predictions = best_model.predict(test)

# Prepare Submission Directory
if not os.path.exists("submissions4"):
    os.makedirs("submissions4")

# Save Submission File
submission = pd.DataFrame({
    "row ID": test.index.map(lambda x: f"Row{x}"),
    "price_doc": test_predictions
})
submission_file = f"submissions/best_submission_optuna_rmse{best_rmse:.4f}.csv"
submission.to_csv(submission_file, index=False)
print(f"Best submission saved as {submission_file}")


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-26 01:50:57,278] A new study created in memory with name: no-name-c18b03e0-216c-473b-b4f4-ed5113c609c4


Top Features Selected: ['mosque_count_500', 'full_sq', 'culture_objects_top_25_raion', 'cafe_count_5000_price_high', 'school_km', 'cafe_count_3000_price_2500', 'cafe_count_1000_price_4000', 'cafe_count_5000_price_2500', 'cafe_count_3000_price_high', 'public_transport_station_min_walk', 'sport_count_3000', 'cafe_count_3000', 'cafe_count_1500', 'leisure_count_500', 'additional_education_km', 'cafe_count_1000', 'exhibition_km', 'build_count_monolith', 'ts_km', 'park_km', 'big_church_km', 'nuclear_reactor_km', 'incineration_km', 'ID_big_road2', 'office_sqm_3000', 'railroad_station_avto_km', 'stadium_km', 'cafe_count_2000', '16_29_male', 'raion_build_count_with_builddate_info']


[I 2024-11-26 01:51:04,986] Trial 0 finished with value: 11720268.185620273 and parameters: {'n_estimators': 870, 'max_depth': 7, 'learning_rate': 0.02028537460867115, 'subsample': 0.9722132093249598, 'colsample_bytree': 0.8822059653842809, 'gamma': 1.5242444727057554, 'reg_alpha': 0.4661085278322241, 'reg_lambda': 0.980016486674115}. Best is trial 0 with value: 11720268.185620273.
[I 2024-11-26 01:51:11,738] Trial 1 finished with value: 11901734.107307464 and parameters: {'n_estimators': 847, 'max_depth': 7, 'learning_rate': 0.05948001307923698, 'subsample': 0.9964001922636964, 'colsample_bytree': 0.5269858406890016, 'gamma': 0.393979206396628, 'reg_alpha': 0.429724584311694, 'reg_lambda': 0.10415548615791703}. Best is trial 0 with value: 11720268.185620273.
[I 2024-11-26 01:51:27,622] Trial 2 finished with value: 11921122.54821465 and parameters: {'n_estimators': 621, 'max_depth': 10, 'learning_rate': 0.07399890776609329, 'subsample': 0.5472942614645011, 'colsample_bytree': 0.7410306

Best Parameters: {'n_estimators': 358, 'max_depth': 10, 'learning_rate': 0.012052957847022892, 'subsample': 0.8709090119701554, 'colsample_bytree': 0.8025287629097406, 'gamma': 3.494445915246377, 'reg_alpha': 0.0911086553730072, 'reg_lambda': 0.9110857474090794}
Best Validation RMSE: 11567284.9255
Best submission saved as submissions/best_submission_optuna_rmse11567284.9255.csv
