# Linear Regression Model using sklearn

In [None]:
# Loading and exploring

import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Housing_modified.csv'
df = pd.read_csv(file_path)  # Load the housing dataset from the mounted Drive

print("First 5 rows of dataset:")
print(df.head())

# Show every row once, but scope the option to this statement only.
# A global pd.set_option would stay active for the rest of the session and
# make every later DataFrame print dump all of its rows as well.
with pd.option_context("display.max_rows", None):
    print(df)

print("Dataset shape:")
print(df.shape)  # (rows, columns)

print("Data types:")
print(df.dtypes)  # dtype of each column

print("Missing values in each column:")
print(df.isnull().sum())  # count of missing values per column

# Preprocess the Data
from google.colab import drive
drive.mount('/content/drive')

# Re-read the raw CSV so the encoding below always starts from unencoded data.
file_path = '/content/drive/MyDrive/Housing_modified.csv'
df = pd.read_csv(file_path)

binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Encode every yes/no column as 1/0 in one pass.
df[binary_cols] = df[binary_cols].apply(lambda column: column.map({'yes': 1, 'no': 0}))

# One-hot encode 'furnishingstatus'; drop_first=True omits the first category's column.
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

# Separate the target (y) from the features (X = every column except price).
y = df["price"]
X = df.drop(columns="price")

# Train and Test Split (80% train, 20% test)
# train_test_split must be imported before its first use; on a fresh kernel
# nothing above this point imports it, so the call below raised NameError.
from sklearn.model_selection import train_test_split

# random_state fixes the shuffle so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set:", X_train)
print("Testing set:", X_test)
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

# Step 3 - Model Building using sklearn

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from google.colab import drive
drive.mount('/content/drive')

# Reload the raw CSV so the encoding below starts from unencoded data.
file_path = '/content/drive/MyDrive/Housing_modified.csv'
df = pd.read_csv(file_path)

binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

# Convert yes/no to 1/0.
for col in binary_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})

# NOTE(review): this list is built before the one-hot encoding below, so it
# still contains the raw 'furnishingstatus' column name, which no longer
# exists in df afterwards. Confirm before using it to index df.
numerical_cols = [col for col in df.columns if col not in binary_cols + ["price"]]

# One-hot encode 'furnishingstatus'; drop_first=True omits the first category's column.
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=True)

X = df.drop("price", axis=1)
y = df["price"]

# 80/20 split with a fixed seed, then a single fit. The original repeated the
# identical split + fit twice, which only redid the same work.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Predict on both splits with the fitted model.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Mean squared error and R² on the training split ...
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# ... and on the held-out test split.
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(
    "Training Set Performance:",
    f"  Mean Squared Error: {train_mse}",
    f"  R² Score: {train_r2}",
    sep="\n",
)

print(
    "\nTesting Set Performance:",
    f"  Mean Squared Error: {test_mse}",
    f"  R² Score: {test_r2}",
    sep="\n",
)

# Graphs and visual representation
import matplotlib.pyplot as plt

# 1. Actual vs Predicted: points on the red diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_test_pred, alpha=0.7)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', lw=2)
ax.set_xlabel("Actual Prices")
ax.set_ylabel("Predicted Prices")
ax.set_title("Actual vs Predicted House Prices")
plt.show()

# 2. Residual distribution: histogram of (actual - predicted) with a KDE overlay.
residuals = y_test - y_test_pred
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(residuals, kde=True, bins=30, ax=ax)
ax.set_xlabel("Residuals (Errors)")
ax.set_title("Distribution of Residuals")
plt.show()

# 3. Predicted vs Residuals: the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(y_test_pred, residuals, alpha=0.7)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_xlabel("Predicted Values")
ax.set_ylabel("Residuals")
ax.set_title("Predicted vs Residuals")
plt.show()

# 4. Correlation heatmap:
# NOTE(review): only the binary yes/no columns plus 'price' are correlated
# here, although the title says "Numerical Features" -- confirm the intended
# column set.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df[[*binary_cols, 'price']].corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap (Numerical Features)")
plt.show()




# Linear Regression Model using Normal Equation

In [None]:
import pandas as pd
import numpy as np

# Loading the dataset
# `drive` is imported here so this cell does not depend on an earlier cell
# having been run first.
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Housing_modified.csv'
df = pd.read_csv(file_path)

# Preprocessing the data
# Convert Yes/No to 1/0
yes_no_columns = ["mainroad", "guestroom", "basement",
                  "hotwaterheating", "airconditioning", "prefarea"]

for col in yes_no_columns:
    df[col] = df[col].map({"yes": 1, "no": 0})

# One-hot encode for furnishingstatus:
df = pd.get_dummies(df, columns=["furnishingstatus"], drop_first=True)

# Ensuring all data is numeric (convert dtype=object → float)
df = df.apply(pd.to_numeric, errors="coerce")

# errors="coerce" (and .map above) turn unexpected values into NaN silently.
# A single NaN would propagate into every model parameter, so fail loudly.
if df.isna().to_numpy().any():
    raise ValueError(
        "Dataset contains missing or non-numeric values after encoding; "
        "clean them before fitting."
    )

# Defining features (X) and target (y); y is kept as a column vector (m, 1).
X = df.drop("price", axis=1).values.astype(float)
y = df["price"].values.reshape(-1, 1).astype(float)

# Adding intercept column (bias term): X_b = [1 | X]
X_b = np.c_[np.ones((X.shape[0], 1)), X]

# Normal Equation: theta = (X^T X)^(-1) X^T y
# Solved as a least-squares problem instead of forming the inverse explicitly.
# np.linalg.inv loses precision when X^T X is ill-conditioned (features here
# are on very different scales) and raises LinAlgError when it is singular;
# lstsq returns the same minimiser without either problem.
theta = np.linalg.lstsq(X_b, y, rcond=None)[0]

print("Model Parameters (theta):")
print(theta.flatten())

# Predictions on the same data the parameters were fitted to
y_pred = X_b.dot(theta)

# Evaluation (R² Score & RMSE)
def r2_score(y, y_pred):
    """Return the coefficient of determination R² = 1 - SS_res / SS_tot.

    y      -- observed target values
    y_pred -- predicted values, same shape as ``y``
    """
    mean_y = np.mean(y)
    unexplained = np.sum(np.square(y - y_pred))  # residual sum of squares
    total = np.sum(np.square(y - mean_y))        # total sum of squares
    ratio = unexplained / total
    return 1 - ratio

# Root mean squared error and R² of the fitted parameters on the fitting data.
rmse = np.sqrt(np.mean(np.square(y - y_pred)))
r2 = r2_score(y, y_pred)

print("\nModel Evaluation:", f"RMSE: {rmse}", f"R² Score: {r2}", sep="\n")

# Making an Example prediction: reuse the first row (bias term included).
example = X_b[:1]
pred_price = (example @ theta)[0, 0]
print("\nExample House Features:", X[0])
print("Predicted Price:", pred_price)
print("Actual Price:", y[0, 0])

# Linear Regression Model using Gradient descent

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy

# `drive` is imported here so this cell does not depend on an earlier cell
# having been run first.
from google.colab import drive
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Housing_modified.csv'
df = pd.read_csv(file_path)  # Loading Dataset

# Convert categorical yes/no columns to binary
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
for col in binary_cols:
    df[col] = df[col].map({'yes':1, 'no':0})

# One-hot encode furnishingstatus, keeping a column for every category
df = pd.get_dummies(df, columns=['furnishingstatus'], drop_first=False)

# The column order here defines the feature order; the hand-built prediction
# vectors further down must list their values in exactly this order.
X = df[['area','bedrooms','bathrooms','stories','parking','mainroad','guestroom','basement','hotwaterheating','airconditioning','prefarea','furnishingstatus_furnished','furnishingstatus_semi-furnished','furnishingstatus_unfurnished']].to_numpy(dtype=float)
y = df['price'].to_numpy(dtype=float)
# Split the data 80:20 into training and testing sets, in file order.
# NOTE(review): rows are not shuffled, so this split differs from the
# shuffled train_test_split used in the sklearn section -- confirm the CSV
# has no meaningful row ordering.
m = X.shape[0]
split_index = int(0.8 * m)

X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# Normalising features (z-score) using statistics of the training split only,
# so nothing about the test rows leaks into the scaling:
mu = np.mean(X_train, axis=0)
sigma = np.std(X_train, axis=0)
# A feature that is constant across the training rows has sigma == 0, and
# dividing by it would fill that column with NaN/inf. Use 1 instead, so the
# column is only centred.
sigma = np.where(sigma == 0, 1.0, sigma)
X_train = (X_train - mu) / sigma
X_test = (X_test - mu) / sigma

#Cost Function concept:
def compute_cost(X, y, w, b):
    """Return the squared-error cost J(w, b) = sum((X·w + b - y)^2) / (2m).

    X -- feature matrix, one example per row (m rows)
    y -- target values, one per example
    w -- weight vector, one entry per feature
    b -- scalar bias
    """
    m = X.shape[0]
    # One matrix-vector product replaces the per-row Python loop; the value is
    # the same up to floating-point rounding and is computed far faster.
    errors = X @ w + b - y
    return np.sum(errors ** 2) / (2 * m)

# The concept of gradient function:
def compute_gradient(X, y, w, b):
    """Return the gradient of the squared-error cost with respect to w and b.

    X -- feature matrix, one example per row (m rows, n columns)
    y -- target values, one per example
    w -- weight vector of length n
    b -- scalar bias

    Returns (dj_dw, dj_db): dj_dw is an array of length n, dj_db a scalar.
    """
    m = X.shape[0]
    # Prediction error for every example at once (replaces the per-row loop;
    # same result up to floating-point rounding).
    errors = X @ w + b - y
    dj_dw = X.T @ errors / m   # mean of error_i * x_i over all examples
    dj_db = np.sum(errors) / m  # mean error
    return dj_dw, dj_db
# Gradient descent calculation
def gradient_descent(X, y, w_in, b_in, alpha, num_iters):
    """Run batch gradient descent for a fixed number of steps.

    X, y      -- training features and targets
    w_in      -- initial weights (copied, so the caller's object is untouched)
    b_in      -- initial bias
    alpha     -- learning rate
    num_iters -- number of update steps

    Returns (w, b, J_history), where J_history holds the cost recorded at
    steps 0, 100, 200, ...
    """
    w, b = copy.deepcopy(w_in), b_in
    J_history = []
    for step in range(num_iters):
        grad_w, grad_b = compute_gradient(X, y, w, b)
        w -= alpha * grad_w
        b -= alpha * grad_b
        # Only every 100th step is logged.
        if step % 100:
            continue
        J_history.append(compute_cost(X, y, w, b))
    return w, b, J_history

# R2 score
def r2_score(y_true, y_pred):
    """Return R² = 1 - SS_res / SS_tot for predictions ``y_pred`` of ``y_true``."""
    residuals = y_true - y_pred
    deviations = y_true - np.mean(y_true)
    return 1 - np.sum(residuals**2) / np.sum(deviations**2)

# Learning rate tuning: train one model per candidate rate from zero weights.
learning_rates = [0.0005, 0.001, 0.0015, 0.002, 0.005, 0.006, 0.007,0.008,0.009]
iterations = 4000
results = {}

for alpha in learning_rates:
    print(f"\nTraining with learning rate: {alpha}")
    initial_w, initial_b = np.zeros(X_train.shape[1]), 0
    w_final, b_final, J_hist = gradient_descent(X_train, y_train, initial_w, initial_b, alpha, iterations)
    y_pred_test = np.dot(X_test, w_final) + b_final
    r2 = r2_score(y_test, y_pred_test)
    results[alpha] = dict(w=w_final, b=b_final, J_hist=J_hist, r2_test=r2)
    print(f"Test R2: {r2:.4f}")

# Selecting the best learning rate
# NOTE(review): the rate is chosen by its R² on the test split, so the
# reported test score is also the selection criterion.
best_alpha = max(learning_rates, key=lambda rate: results[rate]['r2_test'])
w_final, b_final = results[best_alpha]['w'], results[best_alpha]['b']
print(f"\nBest learning rate: {best_alpha}, Test R2: {results[best_alpha]['r2_test']:.4f}")

# Plotting the cost history of every learning rate on one set of axes
fig, ax = plt.subplots(figsize=(8, 6))
for alpha in learning_rates:
    ax.plot(results[alpha]['J_hist'], label=f'alpha={alpha}')
ax.set_xlabel("Iterations (x100)")
ax.set_ylabel("Cost")
ax.set_title("Cost Convergence for Different Learning Rates")
ax.legend()
plt.show()

# Predicted vs actual on the test split, using the selected weights;
# the dashed red diagonal marks perfect predictions.
y_pred_test = np.dot(X_test, w_final) + b_final
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_test, color="blue", alpha=0.6)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
ax.set_xlabel("Actual Price")
ax.set_ylabel("Predicted Price")
ax.set_title("Actual vs Predicted Prices (Test Data)")
plt.show()

# Example: one hand-written house, values listed in the same order as the
# columns of X, scaled with the training mean/std before predicting.
x_house = np.array([1200, 3, 2, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0])
x_house_norm = (x_house - mu) / sigma
predicted_price = x_house_norm @ w_final + b_final
print(f"\nPredicted price of the house = ${predicted_price:0.0f}")

#Input data to predict the price:

def _read_binary(prompt):
    """Read a yes/no flag typed as 1 or 0; raise ValueError for anything else."""
    value = int(input(prompt))
    if value not in (0, 1):
        raise ValueError(f"Expected 0 or 1, got {value}")
    return value


area = float(input("Enter area of the house: "))
bedrooms = int(input("Enter number of bedrooms: "))
bathrooms = int(input("Enter number of bathrooms: "))
stories = int(input("Enter number of stories: "))
parking = int(input("Enter number of parking spots: "))

mainroad = _read_binary("Mainroad? (1 = yes, 0 = no): ")
guestroom = _read_binary("Guestroom? (1 = yes, 0 = no): ")
basement = _read_binary("Basement? (1 = yes, 0 = no): ")
hotwaterheating = _read_binary("Hot water heating? (1 = yes, 0 = no): ")
airconditioning = _read_binary("Air conditioning? (1 = yes, 0 = no): ")
prefarea = _read_binary("Preferred area? (1 = yes, 0 = no): ")

print("\nFurnishing status options:")
print("1. Furnished\n2. Semi-furnished\n3. Unfurnished")
f_status = int(input("Enter furnishing status (1/2/3): "))
# Any other number would leave all three one-hot columns at 0, which is not a
# furnishing status the model was trained on, so reject it explicitly.
if f_status not in (1, 2, 3):
    raise ValueError(f"Furnishing status must be 1, 2 or 3, got {f_status}")

furnished = 1 if f_status == 1 else 0
semi_furnished = 1 if f_status == 2 else 0
unfurnished = 1 if f_status == 3 else 0

# Values must be in the same order as the columns of the training matrix X.
x_user = np.array([area, bedrooms, bathrooms, stories, parking, mainroad, guestroom, basement, hotwaterheating, airconditioning, prefarea, furnished, semi_furnished, unfurnished], dtype=float)

# Apply the training-set scaling before predicting.
x_user_norm = (x_user - mu) / sigma

pred_price = np.dot(x_user_norm, w_final) + b_final
print(f"Predicted Price of the House = ${pred_price:,.0f}")


#Bonus Challenges

#Challenge 1: Implement Linear Regression manually (no sklearn).
1. DONE in code_main_using_normal_equation.
2. DONE in code_main_using_gradient_descent.

#Challenge 2: Try normalising/standardising features and compare results.
1. Without normalising (in code_main_using_sklearn), we got: Mean Squared Error (MSE): 1352268306323.821 , R² Score: 0.6608043993998174

2. With normalising (in code_main_using_gradient_descent_method), we got: Mean Squared Error (MSE): 1108628115166.75, R2 Score: 0.6813

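Hence, with normalising we obtained a lower MSE (lower error) and a higher R² score (better fit). Note, however, that the two runs are not evaluated on the same test set: the sklearn model uses a shuffled `train_test_split`, whereas the gradient-descent model trains on the first 80% of the rows and tests on the last 20%. Part of the difference may therefore come from the split rather than from normalisation itself.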

Also, we can conclude that normalisation lets gradient descent reach the minimum in fewer steps, because the contours of the cost function become more uniform and circular.

#Challenge 3: Detect and explain any strong correlations between features.
Done in the last cell of code_main_using_sklearn, using a correlation heatmap.

**Here are the findings:**
1. Shows correlation between numerical features and the target (price).
airconditioning (0.45), prefarea (0.33), and mainroad (0.30) have stronger positive correlations with price.
2. Other features like hotwaterheating have weak/no correlation.
This suggests some features contribute more strongly to house price prediction than others.

#Challenge 4: Visualise residuals and discuss your findings.
Done in the last cell of code_main_using_sklearn, using the distribution-of-residuals plot and the predicted-vs-residuals plot.

**Findings: Distribution of residuals:**

1. The residuals (errors) are roughly centered around zero.
2. The distribution is approximately normal, indicating unbiased predictions.
3. However, there are some outliers (large positive/negative errors).
4. This suggests the model performs well but has occasional large deviations.

**Findings: Prediction vs residual graph:**

1. Residuals are scattered randomly around zero, which is a good sign.
2. No strong pattern is visible, meaning errors are not dependent on predicted values.
3. However, variance increases for higher predicted prices.
4. This indicates the model may not be equally accurate across all price ranges.

# Graphs and visual analysis along with findings and outcomes

In [None]:
# Interpretation of various graphs and conclusions from them:
from IPython.display import Image, display

# Each entry pairs a saved figure with the findings printed beneath it.
figure_findings = [
    # 1. Actual vs Predicted House Prices
    ("Actual_vs_Predicted_prices_graph.png",
     '''1. The scatter plot shows how well predicted prices align with actual house prices.
2. The red diagonal line represents perfect predictions.
3. Most points are close to the line, but some deviations indicate errors.
4. Overall, the model captures the trend but slightly underestimates higher values and overestimates the lower values, as we expected it to.'''),
    # 2. Distribution of residuals
    ("distribution_of_residuals.png",
     '''1. The residuals (errors) are roughly centered around zero.
2. The distribution is approximately normal, indicating unbiased predictions.
3. However, there are some outliers (large positive/negative errors).
4. This suggests the model performs well but has occasional large deviations.'''),
    # 3. Predicted values vs residuals
    ("predicted_vs_residuals.png",
     '''1. Residuals are scattered randomly around zero, which is a good sign.
2. No strong pattern is visible, meaning errors are not dependent on predicted values
3. However, variance increases for higher predicted prices.
4. This indicates the model may not be equally accurate across all price ranges.'''),
    # 4. Correlation heatmap
    ("correlation_heatmap.png",
     '''1. Shows correlation between numerical features and the target (price).
2. airconditioning (0.45), prefarea (0.33), and mainroad (0.30) have stronger positive correlations with price.
3. Other features like hotwaterheating have weak/no correlation.
4. This suggests some features contribute more strongly to house price prediction than others.'''),
]

# Show each image, then its findings, in the order listed above.
for image_file, findings in figure_findings:
    display(Image(filename=image_file))
    print(findings)

# Learning Outcomes

Through this task, I was able to understand the working of linear regression in depth by implementing it using three different approaches: sklearn’s built-in function, the Normal Equation, and Gradient Descent.
Using sklearn helped me see how easily models can be trained with just a few lines of code.
The Normal Equation showed me the direct mathematical solution without iterations, which is efficient for smaller datasets.
On the other hand, Gradient Descent gave me valuable insight into how optimization works step by step, how the learning rate affects convergence, and why feature normalization is often necessary.
I also explored evaluation metrics like MSE and R² score to measure performance.
Overall, the task helped me connect theory with practice, compare the strengths and weaknesses of each method, and gain a deeper understanding of how linear regression models actually learn from data.
I would like to thank the Coding Club- AI/ML vertical for such an outstanding opportunity.
- Rishit, 20251295