In [None]:
import os
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline
print("Libraries Imported...")

In [None]:
def load_data(file_path='../input/real-estate-dataset/data.csv'):
    """Load the real-estate CSV and split it into features and target.

    Missing values in the ``RM`` column are replaced with that column's
    median. The ``CRIM`` column is used as the target; every other column is
    treated as a feature.

    Parameters
    ----------
    file_path : str or path-like, optional
        Location of the CSV file. Defaults to the path this notebook
        originally hard-coded.

    Returns
    -------
    x : numpy.ndarray
        Values of every column except ``CRIM``.
    y : numpy.ndarray
        Values of the ``CRIM`` column.
    df : pandas.DataFrame
        The loaded frame after the ``RM`` imputation.
    """
    df = pd.read_csv(file_path)
    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() would only add a stray "None" line to the output.
    df.info()

    # Replace missing values with the median of that specific column.
    # Assign the result back instead of using inplace=True on df['RM']: the
    # in-place form mutates a temporary column object and does not update df
    # when pandas Copy-on-Write is active.
    df['RM'] = df['RM'].fillna(df['RM'].median())

    x = df.drop('CRIM', axis=1).values
    y = df['CRIM'].values

    return x, y, df

In [None]:
# Load the CSV once: X holds every column except CRIM, y holds the CRIM
# column (the regression target), and data_frame is the full table after the
# RM imputation performed inside load_data().
X, y, data_frame = load_data()
print(f"\nData shape: {X.shape}\nLabels: {y.shape}")

In [None]:
def normalize(data):
    """Min-max scale each column of ``data`` into the range [0, 1].

    Every value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a zero range; it is mapped to
    0.0 everywhere instead of producing NaN from a 0/0 division.

    Parameters
    ----------
    data : array-like
        Numeric samples, one row per sample and one column per feature.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``.
    """
    data = np.asarray(data, dtype=float)
    min_val = np.min(data, axis=0)
    value_range = np.max(data, axis=0) - min_val
    # Substitute 1 for a zero range so constant columns scale to 0 rather
    # than NaN (their numerator is already 0).
    safe_range = np.where(value_range == 0, 1.0, value_range)
    # Broadcasting applies the per-column statistics to all rows at once,
    # replacing the former row-by-row Python loop.
    return (data - min_val) / safe_range

In [None]:
# Min-max scale the feature matrix column by column.
# NOTE(review): the min/max statistics are taken over all of X, i.e. before
# the train/test split performed later in the notebook, so test rows
# influence the scaling — confirm this is acceptable for the evaluation.
norm_x = normalize(data=X)
print(f"Normalized data shape: {norm_x.shape}")

In [None]:
# Creating Polynomial features and adding them to the dataset.
# With scikit-learn's default settings, PolynomialFeatures(degree=2) outputs a
# constant bias column, the original features, and every square and pairwise
# product. The bias column is what gives the linear model fitted below an
# intercept term, since gradient_descent() learns one weight per column only.
poly_transform = PolynomialFeatures(degree=2)
poly_x = poly_transform.fit_transform(norm_x)

print(f"Data after adding polynomial features")
print(f"Data shape: {poly_x.shape}")

In [None]:
def correlation_heatmap(dataframe):
    """Show an annotated heatmap of the pairwise column correlations.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose numeric (and boolean) columns are correlated with each
        other. Columns of any other dtype are left out of the plot.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Keep only columns a correlation can be computed for: since pandas 2.0
    # DataFrame.corr() raises on non-numeric columns instead of silently
    # dropping them. (The previous dataframe[dataframe.keys()] selection was
    # a no-op copy of every column.)
    numeric_frame = dataframe.select_dtypes(include=['number', 'bool'])
    corr_mat = numeric_frame.corr()

    plt.figure(figsize=(20, 10))
    # robust=True derives the colour limits from percentiles, so a few
    # extreme coefficients do not wash out the rest of the map.
    sns.heatmap(corr_mat, cmap='RdBu_r', robust=True, annot=True)
    plt.show()

In [None]:
# Plot the pairwise correlations between the columns of the loaded table
# (features and the CRIM target alike).
correlation_heatmap(dataframe=data_frame)

In [None]:
# Hold out 20% of the rows for testing. The shuffle is seeded
# (random_state=42), so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(poly_x, y, test_size=0.2, shuffle=True, random_state=42)
print(f"Training data shape: {x_train.shape}, labels: {y_train.shape}")
print(f"Testing data shape: {x_test.shape}, labels: {y_test.shape}")

In [None]:
# Linear regression cost function
def compute_cost(features, labels, parameters):
    """Return the halved mean squared error of a linear model.

    Computes ``J = 1 / (2 * m) * sum((features @ parameters - labels) ** 2)``
    where ``m`` is the number of labels.

    Parameters
    ----------
    features : numpy.ndarray
        Design matrix, one row per sample.
    labels : numpy.ndarray
        Target value for each sample.
    parameters : numpy.ndarray
        One weight per feature column.

    Returns
    -------
    float
        The cost ``J`` for the given parameters.
    """
    m = labels.size
    residuals = np.dot(features, parameters) - labels
    # np.sum reduces the array in compiled code; the builtin sum() would walk
    # the ndarray one element at a time in the interpreter.
    return (1 / (2 * m)) * np.sum(np.square(residuals))


def gradient_descent(features, labels, epochs, learning_rate):
    """Fit linear-regression weights with batch gradient descent.

    Weights start as uniform random values in [0, 1) drawn from NumPy's
    global random state. Each epoch moves all weights simultaneously along
    the negative gradient of the halved mean squared error.

    Parameters
    ----------
    features : numpy.ndarray
        2-D design matrix, one row per sample and one column per weight.
    labels : numpy.ndarray
        Target value for each sample.
    epochs : int
        Number of full passes (weight updates) to perform.
    learning_rate : float
        Step size applied to the gradient.

    Returns
    -------
    thetas : numpy.ndarray
        The weights after the final epoch.
    J : list
        Cost after each epoch's update, as returned by ``compute_cost``.
    """
    J = []
    m = labels.size
    thetas = np.random.random(size=features.shape[1])
    for _ in range(epochs):
        residuals = np.dot(features, thetas) - labels
        # features.T @ residuals yields sum(residuals * features[:, i]) for
        # every weight i at once. All weights are updated from the same
        # residuals, exactly as the former per-weight Python loop did, but
        # without interpreter-level iteration over samples and features.
        thetas -= (learning_rate / m) * np.dot(features.T, residuals)
        J.append(compute_cost(features=features, labels=labels, parameters=thetas))

    return thetas, J


def visualize_loss(cost):
    """Draw the cost history as a line chart and display it.

    ``cost`` is a sequence of cost values; each value is plotted against its
    position in the sequence.
    """
    axes = plt.gca()
    axes.plot(cost)
    axes.set_xlabel('Number of iterations')
    axes.set_ylabel('Cost J')
    plt.show()


def MSE(actual, prediction):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth values.
    prediction : array-like
        Predicted values, broadcastable against ``actual``.

    Returns
    -------
    float
        Mean of the squared element-wise differences.
    """
    # Convert up front so plain Python lists work too (list - list is a
    # TypeError).
    actual = np.asarray(actual, dtype=float)
    prediction = np.asarray(prediction, dtype=float)
    # np.mean divides by the total number of elements; dividing the sum by
    # len(actual) is only correct for 1-D input.
    return np.mean(np.square(actual - prediction))

In [None]:
# Hyper-parameters for batch gradient descent.
alpha = 0.01
epochs = 10000
# gradient_descent() draws its initial weights from NumPy's global random
# state; seed it so a fresh "Restart & Run All" reproduces the same weights
# and the same training curve.
np.random.seed(42)
thetas, J = gradient_descent(features=x_train, labels=y_train, epochs=epochs, learning_rate=alpha)
print(f"Optimized Thetas: {thetas}")

In [None]:
# Plot the per-epoch training cost, then report the mean squared error of
# the fitted linear model (predictions are features @ thetas) on both the
# training and the held-out test split.
visualize_loss(cost=J)
print("MSE of training set: {}".format(MSE(y_train, np.dot(x_train, thetas))))
print("MSE of testing set: {}".format(MSE(y_test, np.dot(x_test, thetas))))