<a href="https://colab.research.google.com/github/prashannachauhankshetri99/Worksheet1/blob/main/Worksheet5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from google.colab import drive

# Mounting Google Drive to access the dataset
drive.mount('/content/drive')

# Loading the dataset from Google Drive
dataset = pd.read_csv('/content/drive/MyDrive/5CS037 - Cohort 11+12 - 2024 - Materials/Week -5- Introduction to Linear Models for Regression/student.csv')

# Displaying the first 5 rows of the dataset
print("Displaying the first 5 rows of the dataset:")
print(dataset.head())

# Displaying the last 5 rows of the dataset
print("\nDisplaying the last 5 rows of the dataset:")
print(dataset.tail())

# Displaying basic info of the dataset
print("\nDataset Info:")
print(dataset.info())

# Showing basic statistics of the dataset
print("\nDescriptive Statistics:")
print(dataset.describe())

# Separating features and target variable
features = dataset[['Math', 'Reading']].values
target = dataset['Writing'].values

# Displaying the shapes of features and target
print("\nShape of Features:", features.shape)
print("Shape of Target:", target.shape)

# Transposing features matrix for calculations
features = features.T  # Now shape is (2, n)

# Initializing weight matrix with zeros
weights = np.zeros((features.shape[0], 1))  # Shape of weights: (2, 1)

# Predicting the target variable using the formula: target = weights^T * features
predictions = np.dot(weights.T, features)
predictions = predictions.T  # Converting to column vector

# Calculating Mean Squared Error (MSE)
mse_loss = np.mean((target - predictions) ** 2)
print("\nMean Squared Error (MSE) of the model:", mse_loss)

# Displaying initial weight values
print("\nInitial Weights:")
print(weights)

# Splitting the data into training and test sets (80% training, 20% testing)
X_train, X_test, Y_train, Y_test = train_test_split(features.T, target, test_size=0.2, random_state=42)

# Displaying the shapes of the training and testing sets
print("\nShape of Training Features:", X_train.shape)
print("Shape of Testing Features:", X_test.shape)
print("Shape of Training Target:", Y_train.shape)
print("Shape of Testing Target:", Y_test.shape)

# Gradient Descent function for optimizing the model
def cost_function(features, target, weights):
    predicted_values = np.dot(features, weights)  # Calculating predictions
    errors = target - predicted_values  # Calculating errors
    cost = np.mean(errors ** 2)  # Calculating MSE cost
    return cost

def gradient_descent(features, target, weights, learning_rate, iterations):
    cost_history = []  # To store the cost values at each iteration
    num_samples = len(target)
    for iteration in range(iterations):
        predictions = np.dot(features, weights)  # Calculating predictions
        loss = predictions - target  # Calculating the loss
        weight_gradient = (2 / num_samples) * np.dot(features.T, loss)  # Gradient calculation
        weights = weights - learning_rate * weight_gradient  # Updating weights
        cost = cost_function(features, target, weights)  # Calculating the cost
        cost_history.append(cost)  # Storing the cost
    return weights, cost_history

# Defining learning rate and number of iterations for gradient descent
learning_rate = 0.01
iterations = 1000
initial_weights = np.zeros(X_train.shape[1])  # Initializing weights as zeros

# Running gradient descent to optimize the weights
optimized_weights, cost_history = gradient_descent(X_train, Y_train, initial_weights, learning_rate, iterations)

# Displaying the optimized weights after gradient descent
print("\nOptimized Weights:")
print(optimized_weights)

# Function to calculate Root Mean Squared Error (RMSE)
def rmse(actual, predicted):
    squared_diff = (actual - predicted) ** 2
    mean_squared_error = np.mean(squared_diff)
    return np.sqrt(mean_squared_error)

# Function to calculate R-squared (R2) score
def r2(actual, predicted):
    total_variance = np.sum((actual - np.mean(actual)) ** 2)
    residual_variance = np.sum((actual - predicted) ** 2)
    return 1 - (residual_variance / total_variance)

# Making predictions using the optimized weights on the test set
predicted_test_values = np.dot(X_test, optimized_weights)

# Evaluating the model performance using RMSE and R2
model_rmse = rmse(Y_test, predicted_test_values)
model_r2 = r2(Y_test, predicted_test_values)

# Displaying the model evaluation metrics
print("\nModel Evaluation Metrics:")
print("RMSE on Test Set:", model_rmse)
print("R-Squared on Test Set:", model_r2)

# Main function to run the entire process
def main():
    dataset = pd.read_csv('/content/drive/MyDrive/5CS037 - Cohort 11+12 - 2024 - Materials/Week -5- Introduction to Linear Models for Regression/student.csv')
    features = dataset[['Math', 'Reading']].values
    target = dataset['Writing'].values
    X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    initial_weights = np.zeros(X_train.shape[1])
    learning_rate = 0.01
    iterations = 1000

    optimized_weights, cost_history = gradient_descent(X_train, Y_train, initial_weights, learning_rate, iterations)

    predicted_values = np.dot(X_test, optimized_weights)

    model_rmse = rmse(Y_test, predicted_values)
    model_r2 = r2(Y_test, predicted_values)

    print("Final Optimized Weights:", optimized_weights)
    print("RMSE on Test Set:", model_rmse)
    print("R-Squared on Test Set:", model_r2)

# Running the main function
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Displaying the first 5 rows of the dataset:
   Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62

Displaying the last 5 rows of the dataset:
     Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB
None

Descriptive Statistics:
              Math      Reading      Writing
count  1000.000000  1000.000000  1000.000000


  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  cost = np.mean(errors ** 2)  # Calculating MSE cost
  weights = weights - learning_rate * weight_gradient  # Updating weights
  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  cost = np.mean(errors ** 2)  # Calculating MSE cost
  weights = weights - learning_rate * weight_gradient  # Updating weights
