In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import xgboost as xgb
import os

# Load the data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Separate features and target
X = train.drop(columns=["price_doc", "id"], errors='ignore')
y = train["price_doc"]

# Handle Outliers (Example: Remove extreme price outliers)
q1 = y.quantile(0.01)
q99 = y.quantile(0.99)
outlier_mask = (y >= q1) & (y <= q99)
X = X[outlier_mask]
y = y[outlier_mask]

# Impute missing values
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numerical imputation
num_imputer = SimpleImputer(strategy='median')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])
test[numerical_features] = num_imputer.transform(test[numerical_features])

# Categorical encoding and imputation
cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])
test[categorical_features] = cat_imputer.transform(test[categorical_features])

label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Feature Scaling
scaler = StandardScaler()
X[numerical_features] = scaler.fit_transform(X[numerical_features])
test[numerical_features] = scaler.transform(test[numerical_features])

# XGBoost for Feature Selection
xgb_fs_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
    n_estimators=100  # Use a moderate number of trees for feature selection
)
xgb_fs_model.fit(X, y)

# Get feature importance scores
feature_importances = pd.Series(xgb_fs_model.feature_importances_, index=X.columns)
top_features = feature_importances.nlargest(30).index  # Select top 30 features
print("Top Features Selected:", top_features.tolist())

# Keep only the top features
X = X[top_features]
test = test[top_features]

# Train-Test Split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost Regressor for Training
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='hist',
    n_jobs=-1,
    random_state=42
)

# Expanded Hyperparameter Grid
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.7, 1.0],
}

# Track the best models
best_models = []
best_rmse = float('inf')

# Prepare Submission Directory
if not os.path.exists("submissions"):
    os.makedirs("submissions")

# Grid Search Loop
for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        for learning_rate in param_grid['learning_rate']:
            for subsample in param_grid['subsample']:
                for colsample_bytree in param_grid['colsample_bytree']:
                    params = {
                        'n_estimators': n_estimators,
                        'max_depth': max_depth,
                        'learning_rate': learning_rate,
                        'subsample': subsample,
                        'colsample_bytree': colsample_bytree,
                    }
                    model = xgb.XGBRegressor(
                        **params,
                        objective='reg:squarederror',
                        tree_method='hist',
                        n_jobs=-1,
                        random_state=42
                    )
                    model.fit(X_train, y_train)

                    # Validate
                    val_predictions = model.predict(X_val)
                    val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))

                    # Check if this is the best model
                    if val_rmse < best_rmse:
                        best_rmse = val_rmse
                        best_models = [(model, params, val_rmse)]
                    elif val_rmse == best_rmse:  # Handle ties
                        best_models.append((model, params, val_rmse))

# Generate Submission Files for Best Models
for idx, (best_model, params, rmse) in enumerate(best_models):
    # Predict on Test Data
    test_predictions = best_model.predict(test)

    # Save Submission File
    submission = pd.DataFrame({
        "row ID": test.index.map(lambda x: f"Row{x}"),
        "price_doc": test_predictions
    })
    param_info = f"est{params['n_estimators']}_depth{params['max_depth']}_lr{params['learning_rate']}_sub{params['subsample']}_col{params['colsample_bytree']}"
    submission_file = f"submissions/best_submission_xgb_{idx+1}_rmse{rmse:.4f}_{param_info}.csv"
    submission.to_csv(submission_file, index=False)
    print(f"Best submission saved as {submission_file}")


Top Features Selected: ['mosque_count_500', 'full_sq', 'culture_objects_top_25_raion', 'cafe_count_5000_price_high', 'school_km', 'cafe_count_3000_price_2500', 'cafe_count_1000_price_4000', 'cafe_count_5000_price_2500', 'cafe_count_3000_price_high', 'public_transport_station_min_walk', 'sport_count_3000', 'cafe_count_3000', 'cafe_count_1500', 'leisure_count_500', 'additional_education_km', 'cafe_count_1000', 'exhibition_km', 'build_count_monolith', 'ts_km', 'park_km', 'big_church_km', 'nuclear_reactor_km', 'incineration_km', 'ID_big_road2', 'office_sqm_3000', 'railroad_station_avto_km', 'stadium_km', 'cafe_count_2000', '16_29_male', 'raion_build_count_with_builddate_info']
Best submission saved as submissions/best_submission_xgb_1_rmse11635870.0381_est500_depth7_lr0.01_sub1.0_col0.7.csv
