In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1: Load the dataset
# The file is whitespace-delimited and has no header row, so pandas assigns
# integer column labels. sep=r'\s+' is the documented equivalent of
# delim_whitespace=True, which is deprecated since pandas 2.2.
data = pd.read_csv('housing.csv', sep=r'\s+', header=None)

In [None]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(506, 14)

In [None]:
# Step 2: Preprocess the data
# Feature matrix: every column of the frame except the final one.
feature_columns = data.iloc[:, :-1]
X = feature_columns.to_numpy()  # Features
print(X)

[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]


In [None]:
# Target vector: the final column of the frame only.
target_column = data.iloc[:, -1]
y = target_column.to_numpy()  # Target variable

print(y)

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

In [None]:
# Display the dimensions of the feature matrix X.
X.shape

(506, 13)

In [None]:
# Display the dimensions of the target array y (still one-dimensional at this point).
y.shape

(506,)

In [None]:
# Standardize the features: rescale every column to mean 0 and standard deviation 1.
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/test split, so rows later held out for testing contribute to the
# mean/std used for scaling.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Give y an explicit column dimension: (n_samples,) -> (n_samples, 1).
y = np.reshape(y, (-1, 1))

In [None]:
# Display the dimensions of y after the reshape (now a single-column 2-D array).
y.shape

(506, 1)

In [None]:
# Split the dataset into training and testing sets (80% / 20%, fixed seed).
# The split is done on the raw features so that the held-out rows cannot
# influence the scaling statistics (avoids train/test leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [None]:
# Step 3: Fit the linear regression baseline on the training split
# and predict the held-out test targets.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [None]:
# Step 4: Evaluate the linear regression predictions on the test split.
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)  # RMSE is the square root of the MSE
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Report each metric on its own line, in a fixed order.
print("Simple Linear Regression Metrics:")
for metric_label, metric_value in (
    ("R2 Score:", r2_lr),
    ("Mean Absolute Error (MAE):", mae_lr),
    ("Mean Squared Error (MSE):", mse_lr),
    ("Root Mean Squared Error (RMSE):", rmse_lr),
):
    print(metric_label, metric_value)

Simple Linear Regression Metrics:
R2 Score: 0.668759493535632
Mean Absolute Error (MAE): 3.1890919658878487
Mean Squared Error (MSE): 24.291119474973513
Root Mean Squared Error (RMSE): 4.928602182665336


In [None]:
# Step 5: Fit a small feed-forward neural network regressor
# (one 64-unit ReLU hidden layer feeding a single linear output unit).

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling -- and therefore the metrics computed
# from this model -- are repeatable on the same setup. (Some GPU kernels may
# still introduce small run-to-run differences.)
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input width with an explicit Input object; passing
    # input_shape= to the first Dense layer is deprecated in Keras 3.
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)
])

# Mean squared error loss, optimised with Adam.
model.compile(optimizer='adam', loss='mse')
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

y_pred_dnn = model.predict(X_test)



In [None]:
# Step 6: Evaluate the neural network predictions on the test split.
mse_dnn = mean_squared_error(y_test, y_pred_dnn)
rmse_dnn = np.sqrt(mse_dnn)  # RMSE is the square root of the MSE
mae_dnn = mean_absolute_error(y_test, y_pred_dnn)
r2_dnn = r2_score(y_test, y_pred_dnn)

# Report each metric on its own line, in a fixed order.
print("\nLinear Regression using DNN Metrics:")
for metric_label, metric_value in (
    ("R2 Score:", r2_dnn),
    ("Mean Absolute Error (MAE):", mae_dnn),
    ("Mean Squared Error (MSE):", mse_dnn),
    ("Root Mean Squared Error (RMSE):", rmse_dnn),
):
    print(metric_label, metric_value)



Linear Regression using DNN Metrics:
R2 Score: 0.7569252471078556
Mean Absolute Error (MAE): 2.736520586761774
Mean Squared Error (MSE): 17.825591220341604
Root Mean Squared Error (RMSE): 4.222036383114386


In [None]:
# Comparison of Algorithms: print both models' scores side by side, metric by metric.
# Labels that open a new metric group start with "\n" to leave a blank line before it.
comparison_rows = (
    ("Simple Linear Regression - R2 Score:", r2_lr),
    ("Linear Regression using DNN - R2 Score:", r2_dnn),
    ("\nSimple Linear Regression - MAE:", mae_lr),
    ("Linear Regression using DNN - MAE:", mae_dnn),
    ("\nSimple Linear Regression - MSE:", mse_lr),
    ("Linear Regression using DNN - MSE:", mse_dnn),
    ("\nSimple Linear Regression - RMSE:", rmse_lr),
    ("Linear Regression using DNN - RMSE:", rmse_dnn),
)

print("\nComparison of Algorithms:")
for row_label, row_value in comparison_rows:
    print(row_label, row_value)



Comparison of Algorithms:
Simple Linear Regression - R2 Score: 0.668759493535632
Linear Regression using DNN - R2 Score: 0.7569252471078556

Simple Linear Regression - MAE: 3.1890919658878487
Linear Regression using DNN - MAE: 2.736520586761774

Simple Linear Regression - MSE: 24.291119474973513
Linear Regression using DNN - MSE: 17.825591220341604

Simple Linear Regression - RMSE: 4.928602182665336
Linear Regression using DNN - RMSE: 4.222036383114386


In [None]:
# State the conclusion only when the computed metrics actually support it,
# rather than printing a fixed verdict regardless of the numbers.
dnn_wins_on_every_metric = (
    r2_dnn > r2_lr
    and mae_dnn < mae_lr
    and mse_dnn < mse_lr
    and rmse_dnn < rmse_lr
)
if dnn_wins_on_every_metric:
    print("Hence Linear Regression using DNN  is better than Simple Linear Regression \n as  DNN model has higher R2 score, and lower MAE, MSE, and RMSE compared to the simple linear regression model")
else:
    print("The DNN model does not beat Simple Linear Regression on every metric (R2, MAE, MSE, RMSE), \n so no clear winner can be declared from this run")

Hence Linear Regression using DNN  is better than Simple Linear Regression 
 as  DNN model has higher R2 score, and lower MAE, MSE, and RMSE compared to the simple linear regression model


In [None]:
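# delim_whitespace=True: tells read_csv that the fields in the file are separated by whitespace (spaces or tabs)
# rather than the default comma (,). Some datasets, including this one, use whitespace as the delimiter instead of commas.
# The option is equivalent to sep=r'\s+', which is the preferred spelling now that delim_whitespace is deprecated (pandas 2.2+).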


# A delimiter, in the context of data processing and storage, is a character (or sequence of
# characters) used to separate individual pieces of data -- here, the fields of each row -- within a larger dataset.

# header=None: tells read_csv that the CSV file does not contain a header row.
# In other words, the first row of the file is treated as data rather than column names,
# and the columns are given integer labels (0, 1, 2, ...).
# If the CSV file does contain a header row with column names, you would typically set header=0
# (which is also what read_csv does by default) to indicate that the first row contains the column names.