In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [12]:
# Location of the housing dataset. pandas is already imported as `pd` in the
# notebook's first cell, so it is not imported again here.
# NOTE(review): this is an absolute path at the filesystem root — confirm it
# matches the environment the notebook is run in.
file_path = '/housing_data.csv'

# Read the CSV into the DataFrame every later cell works from.
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows to sanity-check the columns and values.
print(df.head(5))


      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [18]:
# Build the feature matrix and target from the loaded frame so that this cell
# does not depend on `X` / `y` left over from an earlier kernel session (no
# other visible cell defines them).
# NOTE(review): 'price' is taken as the target because it is the only column
# of `df` that never appears in the feature listings printed by later cells —
# confirm this matches the intended target.
X = df.drop(columns=['price'])
y = df['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Collect the names of the feature columns stored with object dtype; these are
# the columns that still need encoding before modelling.
object_typed = X.select_dtypes(include='object')
categorical_columns = object_typed.columns
print("Categorical columns:", categorical_columns)


Categorical columns: Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


In [26]:
# Record which feature columns are still object-typed before any encoding.
categorical_columns = X.select_dtypes(include='object').columns
print("Categorical columns before encoding:", categorical_columns)

# One-hot encode the categorical columns; drop_first=True omits one indicator
# per original column.
X = pd.get_dummies(X, drop_first=True)

# Confirm that no object-typed column survived the encoding.
non_numeric_columns = X.select_dtypes(include='object').columns
print("Remaining non-numeric columns:", non_numeric_columns)

# Coerce every column to a numeric dtype; anything unparseable becomes NaN.
X = X.apply(pd.to_numeric, errors='coerce')

# Per-column count of missing values left after the coercion.
print(X.isna().sum())

# Replace any remaining NaN with the mean of its column.
X = X.fillna(X.mean())

# Final dtypes and shape of the encoded feature matrix.
print(X.dtypes)
print(X.shape)


Categorical columns before encoding: Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')
Remaining non-numeric columns: Index([], dtype='object')
area                               0
bedrooms                           0
bathrooms                          0
stories                            0
parking                            0
mainroad_yes                       0
guestroom_yes                      0
basement_yes                       0
hotwaterheating_yes                0
airconditioning_yes                0
prefarea_yes                       0
furnishingstatus_semi-furnished    0
furnishingstatus_unfurnished       0
dtype: int64
area                               int64
bedrooms                           int64
bathrooms                          int64
stories                            int64
parking                            int64
mainroad_yes                        bool
guestroom_yes         

In [28]:
# Find the object-typed (not yet encoded) columns in each split.
non_numeric_columns = X_train.select_dtypes(include='object').columns
non_numeric_columns_test = X_test.select_dtypes(include='object').columns

# Report the training split first, then the test split.
print("Non-numeric columns in X_train:", non_numeric_columns)
print("Non-numeric columns in X_test:", non_numeric_columns_test)


Non-numeric columns in X_train: Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')
Non-numeric columns in X_test: Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',
       'airconditioning', 'prefarea', 'furnishingstatus'],
      dtype='object')


In [29]:
# One-hot encode the categorical features of both splits in a single pass.
# Encoding each split on its own lets drop_first=True drop a different
# baseline category per split, and a level that is missing from the test
# split would only come back as an all-NaN column after alignment. Encoding
# the stacked frame once guarantees both splits get identical columns with
# the same baseline.
stacked = pd.concat([X_train, X_test], keys=['train', 'test'])
encoded = pd.get_dummies(stacked, drop_first=True)

# Separate the splits again. Selecting on the outer key restores each split's
# original row labels and row order, so y_train / y_test still line up.
X_train = encoded.loc['train']
X_test = encoded.loc['test']

# Check if there are still any non-numeric values
non_numeric_columns_train = X_train.select_dtypes(include=['object']).columns
non_numeric_columns_test = X_test.select_dtypes(include=['object']).columns
print(f"Non-numeric columns in training data: {non_numeric_columns_train}")
print(f"Non-numeric columns in test data: {non_numeric_columns_test}")


Non-numeric columns in training data: Index([], dtype='object')
Non-numeric columns in test data: Index([], dtype='object')


In [30]:
# Convert all non-numeric columns to numeric if necessary
X_train = X_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

# Fill any NaN values that might appear due to conversion
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)


In [31]:
# Fit every candidate regressor on the training split and score it on the
# held-out test split.
# NOTE(review): `models` is not defined in any visible cell; it is presumably
# a dict mapping a display name to an estimator — confirm it is created
# before this cell runs.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Error metrics; RMSE is derived from the MSE so the two stay consistent.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    report = f"{model_name}: MAE={mae}, RMSE={rmse}, MSE={mse}, R2={r2}"
    print(report)


Linear Regression: MAE=970043.4039201636, RMSE=1324506.9600914386, MSE=1754318687330.6638, R2=0.6529242642153184
Decision Tree: MAE=1250206.4220183487, RMSE=1711671.5078885409, MSE=2929819350917.431, R2=0.420362323972554
Random Forest: MAE=1032306.9177370031, RMSE=1414055.5673286545, MSE=1999553147493.1626, R2=0.6044068931610502
Gradient Boosting: MAE=961927.7803751583, RMSE=1300223.4941557832, MSE=1690581134754.674, R2=0.665534149817738


In [32]:
# Hyperparameter tuning for Random Forest: candidate values tried for each
# setting (3 x 3 x 3 = 27 combinations).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive grid search with 3-fold cross-validation on the training split.
# The forest is seeded (same seed as the train/test split) so that the
# selected hyperparameters and the reported scores are reproducible between
# runs; an unseeded forest gives different results on every execution.
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Best configuration found by the search, used to predict the test split.
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the tuned model on the held-out test split.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest (Tuned): MAE={mae_rf}, RMSE={rmse_rf}, R2={r2_rf}")

Random Forest (Tuned): MAE=1031973.9044923742, RMSE=1411056.7222739237, R2=0.6060830146918244
