In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the training and testing data.
training_dataset = pd.read_csv('train.csv')
testing_dataset = pd.read_csv('test.csv')

# Number of encoded (post one-hot) features that RFE should keep.
N_FEATURES_TO_SELECT = 50
# Fixed seed so the forest, and therefore the RFE ranking, is reproducible.
RANDOM_STATE = 42

# Split the target from the predictors; 'Id' is excluded from the predictors
# of both frames.
X_train = training_dataset.drop(columns=['SalePrice', 'Id'])
y_train = training_dataset['SalePrice']
X_test = testing_dataset.drop(columns=['Id'])

# Column groups are derived from the training frame only.
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

# Numerical columns: mean imputation followed by standardisation.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: mode imputation followed by one-hot encoding.
# handle_unknown='ignore' encodes categories that only occur in the test set
# as all-zero rows, so the encoder does not need to see the test data.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# Fit the preprocessing on the training data only, then apply the fitted
# transformers to the test data. Fitting on train and test together would
# leak test-set statistics (means, scales, modes, categories) into the model.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Estimator used both for ranking features inside RFE and for the final fit.
# RFE works on a clone, so `model` itself stays unfitted until the last line.
model = RandomForestRegressor(random_state=RANDOM_STATE)

# Recursive feature elimination over the encoded feature matrix.
rfe = RFE(estimator=model, n_features_to_select=N_FEATURES_TO_SELECT)
rfe = rfe.fit(X_train_transformed, y_train)

# Boolean mask over the encoded columns: True where the feature was kept.
selected_mask = rfe.support_

# Rebuild the encoded column names in ColumnTransformer output order:
# numerical columns first, then the one-hot columns.
if len(categorical_cols) > 0:
    categorical_feature_names = (
        preprocessor.named_transformers_['cat']['onehot']
        .get_feature_names_out(categorical_cols)
    )
else:
    # No categorical columns means the 'cat' pipeline was never fitted.
    categorical_feature_names = np.array([], dtype=object)

all_feature_names = np.array(
    numerical_cols.tolist() + categorical_feature_names.tolist()
)

# The names are reconstructed by hand, so verify they line up with the matrix
# before indexing with the mask (e.g. an imputer may discard a column that is
# entirely missing, which would shift every following name).
if len(all_feature_names) != X_train_transformed.shape[1]:
    raise ValueError(
        'Feature name count ({}) does not match the number of transformed '
        'columns ({}).'.format(len(all_feature_names),
                               X_train_transformed.shape[1])
    )

# Names of the encoded features that survived RFE, split by origin.
# Entries of `selected_categorical` are one-hot column names, not raw
# DataFrame columns, so they are for inspection/reporting only.
selected_features = all_feature_names[selected_mask]
numerical_names = set(numerical_cols)
selected_numerical = [
    feature for feature in selected_features if feature in numerical_names
]
selected_categorical = [
    feature for feature in selected_features if feature not in numerical_names
]

# Reduce the already-encoded matrices to the selected columns. The selection
# was made in encoded space, so it has to be applied there; the one-hot names
# cannot be used to index the raw DataFrames.
X_train_selected = rfe.transform(X_train_transformed)
X_test_selected = rfe.transform(X_test_transformed)

# Fit the final regression model on the selected features.
model.fit(X_train_selected, y_train)