Boston House Price Prediction


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [7]:
# Load dataset
df = pd.read_csv(r"C:\Users\Shashank Mahato\Desktop\ShadowFox\HousingData.csv")

# Display first few rows
display(df.head())

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [8]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values:")
print(missing_values)

# Determine target column based on missing values
target_column = missing_values.idxmax()  # Column with the most missing values
print(f"Selected target column: {target_column}")

# Handle missing values using imputation
imputer = SimpleImputer(strategy='median')
df.iloc[:, :] = imputer.fit_transform(df)

Missing values:
CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64
Selected target column: CRIM


In [4]:
# Define features and target variable
X = df.drop(columns=[target_column])
y = df[target_column]

# Feature selection using PCA (only if model is not Linear Regression)
pca = PCA(n_components=0.95)  # Retain 95% variance
X_pca = pca.fit_transform(X)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

# Hyperparameter tuning
param_grid = {
    "Random Forest": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]},
    "Gradient Boosting": {"n_estimators": [50, 100, 200], "learning_rate": [0.01, 0.1, 0.2]}
}

best_model = None
best_score = float('-inf')
best_model_name = ""

In [6]:
# Train and evaluate models
for name, model in models.items():
    if name in param_grid:
        grid_search = GridSearchCV(model, param_grid[name], cv=5, scoring='r2')
        grid_search.fit(X_train_scaled, y_train)
        model = grid_search.best_estimator_
    else:
        model.fit(X_train_scaled, y_train)
    
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}: MSE = {mse:.2f}, R2 Score = {r2:.2f}")
    
    # Cross-validation
    scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
    avg_r2 = np.mean(scores)
    print(f"{name}: Cross-validated R2 Score = {avg_r2:.2f}")
    
    # Select the best model based on R2 score
    if avg_r2 > best_score:
        best_score = avg_r2
        best_model = model
        best_model_name = name

print(f"Best Model: {best_model_name} with Cross-validated R2 Score = {best_score:.2f}")

# Plot feature importance for the best model if it's tree-based
if best_model_name in ["Random Forest", "Gradient Boosting"]:
    feature_importances = best_model.feature_importances_
    features = [f"PC{i+1}" for i in range(len(feature_importances))]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=feature_importances, y=features)
    plt.xlabel("Feature Importance")
    plt.ylabel("Principal Components")
    plt.title(f"Feature Importance in {best_model_name} Model")
    plt.show()


Linear Regression: MSE = 31.84, R2 Score = 0.42
Linear Regression: Cross-validated R2 Score = 0.34
Random Forest: MSE = 59.70, R2 Score = -0.08
Random Forest: Cross-validated R2 Score = 0.07
Gradient Boosting: MSE = 51.37, R2 Score = 0.07
Gradient Boosting: Cross-validated R2 Score = 0.21
Best Model: Linear Regression with Cross-validated R2 Score = 0.34
