Importing the libraries

In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Step 1: Data Initialization

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/422/Housing.csv')
print(data.head())
# Checking data types
print(data.dtypes)

Heatmap showing data correlation

In [None]:
# Encode binary categorical variables as numbers
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
for col in binary_columns:
    data[col] = data[col].map({'yes': 1, 'no': 0})

# One-hot encode the 'furnishingstatus' column
data = pd.get_dummies(data, columns=['furnishingstatus'], drop_first=True)

# Calculate the correlation matrix
corr_matrix = data.corr()

# Using seaborn to create heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

Data Visualization

In [None]:
num_cols = data.select_dtypes(include=['int64', 'float64', 'object']).columns
num_plots = len(num_cols)
num_rows = (num_plots // 4) + (num_plots % 4 > 0)

# Barplots for numerical features to identify outliers
plt.figure(figsize=(20, num_rows * 4))

for i, col in enumerate(num_cols):
    plt.subplot(num_rows, 4, i + 1)
    sns.barplot(x=data[col])
    plt.title(f'Boxplot of {col}')

plt.tight_layout()
plt.show()

Step 2: Data Preprocessing

In [35]:
# Encoding non-binary variables
binary_columns = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
label_encoder = LabelEncoder()
for col in binary_columns:
  data[col] = label_encoder.fit_transform(data[col])
# Dropping the Label column
X = data.drop(['price', 'bedrooms'], axis=1)
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
#scalling
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
#checking the categories of categorical features
for col in data.select_dtypes(include=['object']).columns:
  print(f"Unique categories in {col}: {data[col].nunique()}")
  print(data[col].value_counts())

Step 3: Model Training and Validation

In [36]:
# Initializing the models
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(),
    'SVR': SVR()
}

# Training and evaluating the models
for name, model in models.items():
  model.fit(X_train, y_train)
  predictions = model.predict(X_test)
  mse = mean_squared_error(y_test, predictions)
  mae = mean_absolute_error(y_test, predictions)
  r2 = r2_score(y_test, predictions)
  print(f"{name} - MSE: {mse}, MAE: {mae}, R2: {r2}")

LinearRegression - MSE: 965623065237.2881, MAE: 731372.855567254, R2: 0.7216241485133134
DecisionTree - MSE: 3190025039634.1465, MAE: 1197682.9268292682, R2: 0.08035964690447084
SVR - MSE: 3614249150937.7734, MAE: 1391446.274830629, R2: -0.04193833090564936


Step 4: Model Evaluation

In [37]:
for model_name, model in models.items():
  predictions = model.predict(X_test)

  mae = mean_absolute_error(y_test, predictions)
  mse = mean_squared_error(y_test, predictions)
  r2 = r2_score(y_test, predictions)

  print(f"{model_name} Performance:")
  print(f"  Mean Absolute Error: {mae}")
  print(f"  Mean Squared Error: {mse}")
  print(f"  R-Squared: {r2}\n")

LinearRegression Performance:
  Mean Absolute Error: 731372.855567254
  Mean Squared Error: 965623065237.2881
  R-Squared: 0.7216241485133134

DecisionTree Performance:
  Mean Absolute Error: 1197682.9268292682
  Mean Squared Error: 3190025039634.1465
  R-Squared: 0.08035964690447084

SVR Performance:
  Mean Absolute Error: 1391446.274830629
  Mean Squared Error: 3614249150937.7734
  R-Squared: -0.04193833090564936

