In [1]:
# Question 3: Advanced Model Evaluation with Feature Selection for House Prices

# Step 1: Load a house prices dataset from CSV (Assume you have a house_prices.csv ).
# Step 2: Apply feature selection and create a train-test split.
# Step 3: Train a Lasso Regression model.
# Step 4: Perform model evaluation and hyperparameter tuning using GridSearchCV.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Step 1: Create synthetic house prices dataset (assuming 'house_prices.csv')
np.random.seed(42)

# Generating synthetic features for house prices
n_samples = 1000
X = pd.DataFrame({
    'LotArea': np.random.randint(1000, 20000, n_samples),  # Lot Area
        'OverallQual': np.random.randint(1, 10, n_samples),     # Overall Quality (rating 1-10)
            'GrLivArea': np.random.randint(500, 5000, n_samples),   # Ground Living Area
                'GarageCars': np.random.randint(1, 5, n_samples),      # Number of Cars in Garage
                    'TotRmsAbvGrd': np.random.randint(5, 20, n_samples),   # Total Rooms Above Grade
                        'YearBuilt': np.random.randint(1900, 2020, n_samples)  # Year Built
                        })
# Generating synthetic target variable (House Price)
y = np.random.uniform(50000, 500000, n_samples)

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train a Lasso Regression model
# Pipeline for preprocessing and model fitting
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),            # Standard scaling of data
        ('lasso', Lasso())                       # Lasso regression model
        ])
# Step 4: Feature selection using Lasso (Lasso inherently performs feature selection)
# Fit the model to select important features based on the coefficients
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)

# Select features based on non-zero coefficients
selector = SelectFromModel(lasso, threshold="mean", max_features=5)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Step 5: Hyperparameter tuning with GridSearchCV
# Define the parameter grid for Lasso
param_grid = {
    'lasso__alpha': [0.001, 0.01, 0.1, 1.0, 10.0],
    }
# GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_selected, y_train)

# Best hyperparameters found
print("Best Hyperparameters: ", grid_search.best_params_)

# Step 6: Model evaluation
# Predict on the test set with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_selected)

# Calculate the Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Optionally, evaluate the R-squared score
r2_score = best_model.score(X_test_selected, y_test)
print(f"R-squared score: {r2_score}")



Best Hyperparameters:  {'lasso__alpha': 10.0}
Mean Squared Error: 16255036564.595068
R-squared score: -0.012733143468511177
