<a href="https://colab.research.google.com/github/rayasrujanareddy/ML-1/blob/main/Simple_Linear_Regression_Assignment_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Question 1
Dataset: x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], y = [1, 3, 2, 5, 7, 8, 8, 9, 10, 12]

In [31]:
import numpy as np

# Observations for Question 1
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

# Sample size and sample means
n = len(x)
x_mean = np.mean(x)
y_mean = np.mean(y)

# Closed-form least squares: slope = S_xy / S_xx, intercept follows from the means
x_dev = x - x_mean
y_dev = y - y_mean
beta_1 = np.sum(x_dev * y_dev) / np.sum(x_dev ** 2)
beta_0 = y_mean - beta_1 * x_mean

# Fitted values on the observed inputs
y_pred = beta_0 + beta_1 * x

# Goodness of fit: residual sum of squares and R^2 = 1 - SSE / SS_total
residuals = y - y_pred
SSE = np.sum(residuals ** 2)
SS_total = np.sum(y_dev ** 2)
r_squared = 1 - (SSE / SS_total)

# Report the fit
print(f"Analytic Formulation Coefficients: beta_0 = {beta_0:.4f}, beta_1 = {beta_1:.4f}")
print(f"SSE: {SSE:.4f}, R-squared: {r_squared:.4f}")


Analytic Formulation Coefficients: beta_0 = 1.2364, beta_1 = 1.1697
SSE: 5.6242, R-squared: 0.9525


## Gradient Descent (Full-Batch)

In [32]:
import numpy as np

# Data
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

# Derive the sample size and the total sum of squares in this cell instead of
# reading values another cell left in the kernel, so the cell runs on its own.
n = len(x)
y_mean = np.mean(y)
SS_total = np.sum((y - y_mean) ** 2)

# Gradient Descent (Full-Batch)
alpha = 0.01  # Learning rate
iterations = 1000  # Number of passes over the full dataset
beta_0, beta_1 = 0.0, 0.0  # Initial coefficients

for _ in range(iterations):
    y_pred = beta_0 + beta_1 * x
    error = y - y_pred
    # Step against the gradient of (1 / 2n) * sum(error ** 2), averaged over all points
    beta_0 += (alpha / n) * np.sum(error)
    beta_1 += (alpha / n) * np.sum(error * x)

# Predictions
y_pred_gd = beta_0 + beta_1 * x

# Calculate SSE and R-squared
SSE_gd = np.sum((y - y_pred_gd) ** 2)
r_squared_gd = 1 - (SSE_gd / SS_total)

# Print results
print(f"Gradient Descent (Full-Batch) Coefficients: beta_0 = {beta_0:.4f}, beta_1 = {beta_1:.4f}")
print(f"SSE: {SSE_gd:.4f}, R-squared: {r_squared_gd:.4f}")


Gradient Descent (Full-Batch) Coefficients: beta_0 = 1.1758, beta_1 = 1.1794
SSE: 5.6349, R-squared: 0.9524


## Gradient Descent (Stochastic)

In [33]:
import numpy as np

# Data
x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

# Parameters
alpha = 0.01  # Learning rate
iterations = 1000  # Number of epochs
beta_0, beta_1 = 0.0, 0.0  # Initial coefficients
n = len(x)

# Seeded generator: the sampled indices, and therefore the fitted coefficients,
# are the same on every run of this cell.
rng = np.random.default_rng(42)

# Stochastic Gradient Descent: n single-point updates per epoch
for epoch in range(iterations):
    for _ in range(n):
        # Randomly select one data point (sampling with replacement)
        random_index = rng.integers(n)
        x_i = x[random_index]
        y_i = y[random_index]

        # Prediction for the selected point
        y_pred_i = beta_0 + beta_1 * x_i

        # Error calculation
        error_i = y_i - y_pred_i

        # Coefficient updates from this single point's error
        beta_0 += alpha * error_i
        beta_1 += alpha * error_i * x_i

# Predictions for the entire dataset
y_pred_sgd = beta_0 + beta_1 * x

# Calculate SSE and R-squared
SSE_sgd = np.sum((y - y_pred_sgd) ** 2)
y_mean = np.mean(y)
SS_total = np.sum((y - y_mean) ** 2)
r_squared_sgd = 1 - (SSE_sgd / SS_total)

# Print results
print(f"SGD Coefficients: beta_0 = {beta_0:.4f}, beta_1 = {beta_1:.4f}")
print(f"SSE: {SSE_sgd:.4f}, R-squared: {r_squared_sgd:.4f}")


SGD Coefficients: beta_0 = 1.1780, beta_1 = 1.2311
SSE: 6.4109, R-squared: 0.9459


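# Question 2 - Boston Housing Dataset

Import the necessary libraries and load the data. Note: the Boston CSV is only inspected here; the regression below is run on scikit-learn's California housing dataset, which is used in place of the removed Boston dataset.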

In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score

# Load the Boston housing CSV. The path is hard-coded to /content; the file must
# already exist there for this cell to run.
dataset = pd.read_csv('/content/BostonHousing.csv')

# Inspect the dataset. `dataset` is only inspected in this cell; the cells that
# follow in this notebook work on `df` built below.
print(dataset.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
dataset.info()

# Boston housing prices dataset has been removed. Use the California housing dataset instead.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()

# Create DataFrame: one column per feature plus the regression target as 'PRICE'
df = pd.DataFrame(housing.data, columns=housing.feature_names)
df['PRICE'] = housing.target

      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float6

## Correlation analysis

In [35]:

# Correlation analysis: pairwise correlations between every column of df
correlations = df.corr()
print("Correlation with Price:\n", correlations['PRICE'])

# Find the feature most strongly correlated with PRICE (excluding 'PRICE' itself).
# Rank by absolute value so that a strong negative relationship is not passed
# over in favour of a weaker positive one.
best_correlated_feature = correlations['PRICE'].drop('PRICE').abs().idxmax()
print(f"Best correlated feature with PRICE: {best_correlated_feature}")


Correlation with Price:
 MedInc        0.688075
HouseAge      0.105623
AveRooms      0.151948
AveBedrms    -0.046701
Population   -0.024650
AveOccup     -0.023737
Latitude     -0.144160
Longitude    -0.045967
PRICE         1.000000
Name: PRICE, dtype: float64
Best correlated feature with PRICE: MedInc



## Implement the Analytic Formulation

In [36]:
# The attribute with the highest correlation is selected for regression.
# Use the feature chosen by the correlation analysis rather than a hard-coded
# column name, so the two cells cannot drift apart.
X = df[[best_correlated_feature]].values  # 2-D: one column for the single feature
y = df['PRICE'].values.reshape(-1, 1)  # column vector of targets

# Split the data into train and test sets (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Adding a bias column (intercept) to the training data
X_b_train = np.c_[np.ones((X_train.shape[0], 1)), X_train]

# Normal Equation (Analytic Formulation): solve (X^T X) theta = X^T y.
# Solving the linear system directly avoids forming an explicit matrix inverse,
# which is the numerically safer way to get the same coefficients.
theta_analytic = np.linalg.solve(X_b_train.T.dot(X_b_train), X_b_train.T.dot(y_train))
print("Analytic solution coefficients:", theta_analytic)



Analytic solution coefficients: [[0.44459729]
 [0.41933849]]


## Implement Gradient Descent (Full-batch)

In [37]:
# Full-batch gradient descent for linear regression
def gradient_descent(X, y, lr=0.01, epochs=1000, seed=None):
    """Fit linear-regression coefficients by full-batch gradient descent on the MSE.

    Parameters
    ----------
    X : array-like
        Feature values, one row per sample; a column of ones is prepended for
        the intercept.
    y : array-like
        Targets, one per sample. Accepted as a 1-D vector or a single column;
        it is reshaped to a column internally.
    lr : float
        Learning rate (step size) applied to each gradient step.
    epochs : int
        Number of gradient steps; every step uses all samples.
    seed : int or None
        Seed for the random initial coefficients. None (the default) draws from
        NumPy's global random state, as the function did before this parameter
        existed.

    Returns
    -------
    numpy.ndarray
        Column vector of coefficients, intercept first.

    Raises
    ------
    ValueError
        If `y` is empty.
    """
    # Force a column vector: with a 1-D y the residual below would broadcast
    # to an (m, m) matrix instead of staying (m, 1).
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    m = len(y)
    if m == 0:
        raise ValueError("gradient_descent requires at least one sample")
    X_b = np.c_[np.ones((m, 1)), X]  # Add bias column
    # Both the numpy.random module and RandomState expose randn, so one name covers both cases
    rng = np.random if seed is None else np.random.RandomState(seed)
    theta = rng.randn(X_b.shape[1], 1)  # Random initialization of coefficients
    for _ in range(epochs):
        # Gradient of the mean squared error with respect to theta
        gradients = 2 / m * X_b.T.dot(X_b.dot(theta) - y)
        theta -= lr * gradients
    return theta

# Seed NumPy's global generator so the random initial coefficients, and hence
# the printed result, are the same on every run.
np.random.seed(42)
theta_gd_full = gradient_descent(X_train, y_train)
print("Full-batch Gradient Descent coefficients:", theta_gd_full)




Full-batch Gradient Descent coefficients: [[0.47189966]
 [0.41361139]]


## Implement Stochastic Gradient Descent

In [38]:
# Stochastic Gradient Descent (SGD)
def stochastic_gradient_descent(X, y, lr=0.01, epochs=50, seed=None):
    """Fit linear-regression coefficients by single-sample stochastic gradient descent.

    Each epoch performs m updates (m = number of samples). Every update uses
    one sample drawn uniformly at random with replacement, so a given sample
    may be seen several times, or not at all, within an epoch.

    Parameters
    ----------
    X : array-like
        Feature values, one row per sample; a column of ones is prepended for
        the intercept.
    y : array-like
        Targets, one per sample. Accepted as a 1-D vector or a single column;
        it is reshaped to a column internally.
    lr : float
        Learning rate (step size) applied to each single-sample gradient.
    epochs : int
        Number of epochs, each consisting of m single-sample updates.
    seed : int or None
        Seed for the initial coefficients and the sample draws. None (the
        default) uses NumPy's global random state, as the function did before
        this parameter existed.

    Returns
    -------
    numpy.ndarray
        Column vector of coefficients, intercept first.

    Raises
    ------
    ValueError
        If `y` is empty (otherwise the untouched random initial coefficients
        would be returned as if they were a fit).
    """
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    m = len(y)
    if m == 0:
        raise ValueError("stochastic_gradient_descent requires at least one sample")
    X_b = np.c_[np.ones((m, 1)), X]  # Add bias column
    # Both the numpy.random module and RandomState expose randn and randint
    rng = np.random if seed is None else np.random.RandomState(seed)
    theta = rng.randn(X_b.shape[1], 1)  # Random initialization
    for _ in range(epochs):
        for _ in range(m):
            random_index = rng.randint(m)
            # Slice (rather than index) so the sample keeps its 2-D row shape
            xi = X_b[random_index:random_index + 1]
            yi = y[random_index:random_index + 1]
            # Gradient of this one sample's squared error
            gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
            theta -= lr * gradients
    return theta

# Seed NumPy's global generator: both the initial coefficients and the sampled
# indices are random, so without a seed the printed result changes on every run.
np.random.seed(42)
theta_sgd = stochastic_gradient_descent(X_train, y_train)
print("Stochastic Gradient Descent coefficients:", theta_sgd)

Stochastic Gradient Descent coefficients: [[0.51044168]
 [0.95843231]]


## Predicting

In [39]:
# Evaluate the three fitted coefficient vectors on the held-out split.
# The test design matrix gets a leading column of ones to line up with the
# intercept term in each coefficient vector.
n_test = X_test.shape[0]
X_b_test = np.hstack([np.ones((n_test, 1)), X_test])

y_pred_analytic = X_b_test @ theta_analytic
y_pred_gd_full = X_b_test @ theta_gd_full
y_pred_sgd = X_b_test @ theta_sgd

## Compare the results

In [40]:

# Score each model on the held-out test set. The R^2 values are computed in this
# cell so that it does not rely on variables no visible cell defines.
r2_analytic = r2_score(y_test, y_pred_analytic)
r2_gd_full = r2_score(y_test, y_pred_gd_full)
r2_sgd = r2_score(y_test, y_pred_sgd)

# The printed numbers are R^2 scores, not coefficients, so label them as such
print("Analytic solution R^2:", r2_analytic)
print("Full-batch Gradient Descent R^2:", r2_gd_full)
print("Stochastic Gradient Descent R^2:", r2_sgd)

Analytic solution coefficients: 0.45885918903846656
Full-batch Gradient Descent coefficients: 0.45883106850758815
 Stochastic Gradient Descent coefficients: 0.4359871568434257
