In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

data = pd.DataFrame(pd.read_csv("Housing.csv"))

var = ["mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "prefarea"]
def binarymap(x):
    return x.map({'yes': 1, 'no': 0})
data[var] = data[var].apply(binarymap)

stat = pd.get_dummies(data['furnishingstatus'])
stat = stat.drop('furnished', axis=1)
data = pd.concat([data, stat], axis=1)
data.drop(['furnishingstatus'], axis=1, inplace=True)

labelen = LabelEncoder()
data["mainroad"] = labelen.fit_transform(data["mainroad"])

np.random.seed(0)
df_train, df_test = train_test_split(data, train_size=0.7, test_size=0.3, random_state=100)
scaler = MinMaxScaler()

num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']
scaler.fit(df_train[num_vars])
df_train[num_vars] = scaler.transform(df_train[num_vars])
df_test[num_vars] = scaler.transform(df_test[num_vars])

plt.figure(figsize=(16, 10))
sns.heatmap(df_train.corr(), annot=True, cmap="YlGnBu")
plt.title("Correlation Heatmap")
plt.show()

lm_simple = LinearRegression()
X_train_simple = df_train[['area']]
y_train_simple = df_train['price']
lm_simple.fit(X_train_simple, y_train_simple)

y_pred_simple = lm_simple.predict(df_test[['area']])

plt.figure(figsize=(10, 5))
plt.scatter(df_test['area'], df_test['price'], color='blue', label='Actual Prices')
plt.plot(df_test['area'], y_pred_simple, color='red', linewidth=2, label='Best Fit Line')
plt.xlabel('Area')
plt.ylabel('Price')
plt.legend()
plt.title('Simple Linear Regression (Best Fit Line)')
plt.show()

r2_simple = r2_score(df_test['price'], y_pred_simple)
mse_simple = mean_squared_error(df_test['price'], y_pred_simple)
rmse_simple = np.sqrt(mse_simple)
mae_simple = mean_absolute_error(df_test['price'], y_pred_simple)
adj_r2_simple = 1 - (1-r2_simple) * (len(y_train_simple)-1) / (len(y_train_simple)-X_train_simple.shape[1]-1)

print(f"Simple Linear Regression - R²: {r2_simple}, Adjusted R²: {adj_r2_simple}, MSE: {mse_simple}, RMSE: {rmse_simple}, MAE: {mae_simple}")

lm = LinearRegression()
y_train = df_train["price"]
X_train = df_train.drop(columns=["price"])
lm.fit(X_train, y_train)

y_train_price = lm.predict(X_train)
y_test = df_test['price']
X_test = df_test.drop(columns=["price"])
y_pred = lm.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
adj_r2 = 1 - (1-r2) * (len(y_train)-1) / (len(y_train)-X_train.shape[1]-1)

print(f"Multiple Linear Regression - R²: {r2}, Adjusted R²: {adj_r2}, MSE: {mse}, RMSE: {rmse}, MAE: {mae}")

df_test['predicted_price'] = y_pred
df_test[num_vars] = scaler.inverse_transform(df_test[num_vars])

plt.figure(figsize=(10, 5))
plt.scatter(df_test['price'], y_pred, color='blue', label='Predicted vs Actual')
plt.plot([df_test['price'].min(), df_test['price'].max()],
         [df_test['price'].min(), df_test['price'].max()],
         color='red', linewidth=2, label='Best Fit Line')
plt.xlabel('Actual Price')
plt.ylabel('Predicted Price')
plt.legend()
plt.title('Multiple Linear Regression - Best Fit Line')
plt.show()

residuals = y_test - y_pred

plt.figure(figsize=(10, 5))
sns.histplot(residuals, bins=20, kde=True, color="purple")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.title("Residuals Distribution")
plt.show()

plt.figure(figsize=(10, 5))
plt.scatter(y_pred, residuals, color='blue')
plt.axhline(y=0, color='red', linestyle='--')
plt.xlabel("Predicted Prices")
plt.ylabel("Residuals")
plt.title("Residuals vs. Predicted Prices")
plt.show()

KeyError: "None of [Index(['mainroad', 'guestroom', 'basement', 'hotwaterheating',\n       'airconditioning', 'prefarea'],\n      dtype='object')] are in the [columns]"