In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the spreadsheet into a DataFrame and preview its first five rows.
df = pd.read_excel(io="data/real-estate.xlsx")
df.head(n=5)

In [None]:
# Count missing values per column (`isna` is the canonical spelling of `isnull`).
df.isna().sum()

In [None]:
# Pairwise scatter plots (with per-column distributions on the diagonal)
# for every column of the frame.
sns.pairplot(data=df)
plt.show()

In [None]:
# Annotated heatmap of the pairwise column correlations. The title is set on
# the Axes returned by `heatmap`, which is the one the heatmap was drawn on.
sns.heatmap(
    data=df.corr(),
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
).set_title('Correlation Heatmap')
plt.show()

In [None]:
def lin_reg(colX, data=None, target='Y house price of unit area'):
    """Fit a one-feature least-squares line, plot it, and return train/test MSE.

    The rows are split 80/20 into train and test sets (fixed ``random_state=42``),
    the line ``y = beta1 * x + beta0`` is fitted on the training rows with the
    closed-form least-squares solution, and a scatter plot of both subsets with
    the fitted line is shown.

    Parameters
    ----------
    colX : str
        Name of the single feature column used as the predictor.
    data : mapping of column name -> 1-D values, optional
        Table to read the columns from (e.g. a DataFrame). Defaults to the
        notebook-level ``df`` so existing ``lin_reg(col)`` calls keep working.
    target : str, optional
        Name of the response column.

    Returns
    -------
    tuple of float
        ``(mse_train, mse_test)``: mean squared error of the fitted line on
        the training rows and on the held-out rows.

    Raises
    ------
    ValueError
        If ``colX`` does not select a 1-D column, or if the training feature
        is constant (the slope is undefined in that case).
    """
    frame = df if data is None else data
    X = np.asarray(frame[colX], dtype=float)
    y = np.asarray(frame[target], dtype=float)
    if X.ndim != 1:
        raise ValueError(f"colX must name a single column, got data of shape {X.shape}")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # A constant feature makes the slope 0/0; fail loudly instead of
    # propagating NaN into the plot and the returned errors.
    if X_train.min() == X_train.max():
        raise ValueError(f"Cannot fit a line: training values of {colX!r} are all identical")

    # Centered-sums form of the least-squares solution. It is algebraically
    # equal to (N*Sxy - Sx*Sy) / (N*Sxx - Sx**2) but avoids subtracting two
    # huge, nearly equal numbers when the feature has a large mean and a small
    # spread (e.g. latitude/longitude).
    x_mean = X_train.mean()
    y_mean = y_train.mean()
    x_centered = X_train - x_mean
    beta1 = np.sum(x_centered * (y_train - y_mean)) / np.sum(x_centered ** 2)
    beta0 = y_mean - beta1 * x_mean

    def predict(x):
        return beta1 * x + beta0

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(X_train, y_train, color='blue', label='Train Data')
    ax.scatter(X_test, y_test, color='green', label='Test Data')
    # X holds both the train and the test rows, so its range covers both.
    x_line = np.linspace(X.min(), X.max(), 100)
    ax.plot(x_line, predict(x_line), color='red', linewidth=2, label='Fitted Line')
    ax.set_xlabel(colX)
    ax.set_ylabel(target)
    ax.set_title(f'Linear Regression: {colX} vs Y')
    ax.legend()
    ax.grid(True)
    plt.show()

    mse_train = np.mean((y_train - predict(X_train)) ** 2)
    mse_test = np.mean((y_test - predict(X_test)) ** 2)

    return mse_train, mse_test

In [None]:
# Fit one single-feature model per predictor, in this order; each result is a
# (train MSE, test MSE) pair.
mse1, mse2, mse3 = (
    lin_reg(feature)
    for feature in (
        'X3 distance to the nearest MRT station',
        'X5 latitude',
        'X6 longitude',
    )
)

In [None]:
# One (train MSE, test MSE) pair per line, in the order the models were fitted.
print(mse1, mse2, mse3, sep='\n')