In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory with the file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from math import sqrt

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Simple confirmation that the import cell ran to completion.
print("Libraries Imported...")

# **Import Dataset**

In [None]:
def get_data(path="/kaggle/input/real-estate-dataset/data.csv"):
    """Load the real-estate CSV and split it into features and target.

    Missing values in the ``RM`` column are replaced by that column's median.
    No other column is imputed.

    Args:
        path: Location of the CSV file. Defaults to the Kaggle dataset path.

    Returns:
        A tuple ``(x, y, df)`` where ``x`` is an array of every column except
        ``CRIM``, ``y`` is an array of the ``CRIM`` column, and ``df`` is the
        loaded data frame (with ``RM`` already imputed).
    """
    df = pd.read_csv(path)
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df['RM']: the chained in-place form operates on an intermediate object,
    # which pandas warns about and which does not update df under Copy-on-Write.
    df['RM'] = df['RM'].fillna(df['RM'].median())
    x = df.drop(columns='CRIM').values
    y = df['CRIM'].values
    return x, y, df

In [None]:
# Load the dataset: X holds every column except CRIM, y holds the CRIM column,
# and data_frame is the full table (used later for the correlation heatmap).
X, y, data_frame = get_data()

In [None]:
def normalize(data):
    """Min-max scale every column of ``data`` into the range [0, 1].

    Each column is shifted by its minimum and divided by its range
    (maximum minus minimum), both taken along axis 0.

    Args:
        data: Array-like of numeric values; columns are scaled independently.

    Returns:
        A float array of the same shape as ``data``. A column whose values are
        all equal is mapped to zeros instead of NaN.
    """
    data = np.asarray(data, dtype=float)
    min_val = np.min(data, axis=0)
    value_range = np.max(data, axis=0) - min_val
    # A constant column has zero range; dividing by it would produce NaN/inf
    # and poison every downstream feature, so divide by 1 there instead.
    safe_range = np.where(value_range == 0, 1.0, value_range)
    # Broadcasting applies the per-column shift and scale to all rows at once.
    return (data - min_val) / safe_range

In [None]:
# Min-max scale each feature column of X before polynomial terms are built from it.
norm_x = normalize(data=X)
print(f"Normalized data shape: {norm_x.shape}")

In [None]:
# Expand the normalized features with degree-2 polynomial terms; the expanded
# matrix (poly_x) is what the models below are trained on.
poly_transform = PolynomialFeatures(degree=2)
poly_x = poly_transform.fit_transform(norm_x)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("Data after adding polynomial features")
print(f"Data shape: {poly_x.shape}")

# **Exploratory Data Analysis**

In [None]:
def correlation_heatmap(dataframe):
    """Draw an annotated heatmap of the pairwise column correlations.

    Args:
        dataframe: pandas DataFrame whose numeric (and boolean) columns are
            correlated with each other; other columns are left out.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Correlation is only defined for numeric data; selecting those columns
    # explicitly avoids the error recent pandas versions raise when corr()
    # meets a non-numeric column.
    numeric_columns = dataframe.select_dtypes(include=["number", "bool"])
    corr_mat = numeric_columns.corr()
    plt.figure(figsize=(20, 10))
    sns.heatmap(corr_mat, cmap='RdBu_r', robust=True, annot=True)
    plt.show()

In [None]:
# Plot the pairwise correlations between the columns of the loaded data frame
# (the CRIM target column is part of data_frame, so it appears in the plot too).
correlation_heatmap(dataframe=data_frame)

In [None]:
# Hold out 20% of the shuffled samples for testing; the fixed random_state
# makes the split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(poly_x, y, test_size=0.2, shuffle=True, random_state=42)
print(f"Training data shape: {x_train.shape}, labels: {y_train.shape}")
print(f"Testing data shape: {x_test.shape}, labels: {y_test.shape}")

In [None]:
# Import ML Libraries
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Candidate models as [estimator, display name] pairs.
# The scikit-learn estimators that draw random numbers are given the same seed
# as the train/test split so the comparison is repeatable between runs; every
# other hyperparameter is left at the library default.
regressors = [
    [LinearRegression(), 'LinearRegression'],
    [SGDRegressor(random_state=42), 'SGDRegressor'],
    [ElasticNet(), 'ElasticNet'],
    [BayesianRidge(), 'BayesianRidge'],
    [LGBMRegressor(), 'LGBMRegressor'],
    [XGBRegressor(), 'XGBRegressor'],
    [CatBoostRegressor(verbose=0), 'CatBoostRegressor'],
    [KernelRidge(), 'KernelRidge'],
    [GradientBoostingRegressor(random_state=42), 'GradientBoostingRegressor'],
    [SVR(), 'SVR'],
    [AdaBoostRegressor(random_state=42), "AdaBoostRegressor"],
    [DecisionTreeRegressor(random_state=42), "DecisionTreeRegressor"],
]

In [None]:
from sklearn import metrics

# Fit every candidate on the training split and report its error metrics on
# the held-out test split.
for model, model_name in regressors:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Compute the MSE once; the RMSE is derived from the same value.
    mse = metrics.mean_squared_error(y_test, y_pred)

    print(model_name)
    print("Mean Absolute Error = ", metrics.mean_absolute_error(y_test, y_pred))
    print("Mean Squared Error = ", mse)
    print("Root Mean Squared Error = ", np.sqrt(mse))
    print("R2 score = ", metrics.r2_score(y_test, y_pred))
    print("\n\n")

In [None]:
# Linear regression cost function
def compute_cost(features, labels, parameters):
    """Return the halved mean squared error of a linear model.

    The prediction is ``features @ parameters``; the cost is
    ``sum((prediction - labels) ** 2) / (2 * m)`` with ``m = labels.size``.

    Args:
        features: 2-D array with one row per sample.
        labels: 1-D NumPy array of target values, one per sample.
        parameters: 1-D array of model weights, one per feature column.

    Returns:
        The scalar cost.
    """
    m = labels.size
    residuals = np.dot(features, parameters) - labels
    # np.sum reduces inside NumPy; the builtin sum would iterate the array
    # element by element in Python.
    return (1 / (2 * m)) * np.sum(np.square(residuals))


def gradient_descent(features, labels, epochs, learning_rate, random_state=None):
    """Fit linear-regression weights with batch gradient descent.

    The weights start from uniform random values in [0, 1) and are updated
    once per epoch using the gradient over all samples. No intercept is added
    here: include a constant column in ``features`` if one is needed.

    Args:
        features: 2-D NumPy array with one row per sample.
        labels: 1-D NumPy array of target values, one per sample.
        epochs: Number of full-batch update steps to run.
        learning_rate: Step size applied to the averaged gradient.
        random_state: Optional seed for the initial weights. When ``None``
            (the default) NumPy's global random state is used, so an earlier
            ``np.random.seed(...)`` call is still honoured.

    Returns:
        A tuple ``(thetas, J)``: the fitted weight vector and the list of cost
        values recorded after each epoch.
    """
    J = []
    m = labels.size
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    thetas = rng.random_sample(size=features.shape[1])
    for _ in range(epochs):
        residuals = np.dot(features, thetas) - labels
        # One matrix product gives every partial derivative at once, so all
        # weights are updated from the same set of predictions.
        thetas -= (learning_rate / m) * np.dot(features.T, residuals)
        J.append(compute_cost(features=features, labels=labels, parameters=thetas))

    return thetas, J


def visualize_loss(cost):
    """Plot the recorded cost values against their iteration index and show the figure.

    Args:
        cost: Sequence of cost values, one per iteration.
    """
    plt.plot(cost)
    # Label the axes the plot was just drawn on.
    axes = plt.gca()
    axes.set_xlabel('Number of iterations')
    axes.set_ylabel('Cost J')
    plt.show()


def MSE(actual, prediction):
    """Return the mean squared error between ``actual`` and ``prediction``.

    The squared differences are summed and divided by ``len(actual)``.
    """
    squared_errors = np.square(actual - prediction)
    return np.sum(squared_errors) / len(actual)

def RMSE(actual, prediction):
    """Return the root mean squared error between ``actual`` and ``prediction``."""
    deviation = prediction - actual
    mean_squared = (deviation ** 2).mean()
    return np.sqrt(mean_squared)

In [None]:
# Hyperparameters for the hand-written gradient descent.
alpha = 0.01
epochs = 10000
# gradient_descent draws its starting parameters from NumPy's global random
# state; seeding it here makes the fitted thetas and the cost curve repeatable.
np.random.seed(42)
thetas, J = gradient_descent(features=x_train, labels=y_train, epochs=epochs, learning_rate=alpha)

In [None]:
# Show how the cost evolved during training, then report the mean squared
# error of the fitted parameters on both splits.
visualize_loss(cost=J)
train_predictions = np.dot(x_train, thetas)
test_predictions = np.dot(x_test, thetas)
print("MSE of training set: {}".format(MSE(y_train, train_predictions)))
print("MSE of testing set: {}".format(MSE(y_test, test_predictions)))

In [None]:
# Report the root mean squared error of the fitted parameters on both splits.
train_predictions = np.dot(x_train, thetas)
test_predictions = np.dot(x_test, thetas)
print("RMSE of training set: {}".format(RMSE(y_train, train_predictions)))
print("RMSE of testing set: {}".format(RMSE(y_test, test_predictions)))