In [1]:
import numpy as np
import pandas as pd

In [2]:
# writing train test split function
def train_test_split(
    X: np.ndarray, y: np.ndarray, test_size: float = 0.3, random_seed: int = 42
):
    """Separates the dataset into train and test sets
    Args:
        X (np.ndarray): Feature matrix
        y (np.ndarray): Label Vector
        test_size (float, optional):
          Proportion of dataset to include in the test split
          Defaults to 0.3.
        random_seed (int, optional):
          Seed for reproduceability
          Defaults to 41.
    Returns:
      X_train, X_test, y_train, y_test : np.ndarray
        Training and testing splits of features and target.
    """
    np.random.seed(random_seed)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    test_split_size = int(len(X) * test_size)
    test_indices = indices[:test_split_size]
    train_indices = indices[test_split_size:]
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    return X_train, X_test, y_train, y_test

In [3]:
# Define the cost function
def cost_function(X, Y, W):
    """
    Parameters:
      This function finds the Mean Square Error.
    Input parameters:
      X: Feature Matrix
      Y: Target Matrix
      W: Weight Matrix
    Output Parameters:
      cost: accumulated mean square error.
    """
    m = len(Y)
    predictions = X.dot(W)
    errors = Y - predictions
    squared_errors = errors**2
    cost = np.mean(squared_errors)
    return cost

In [4]:
# Test case
X_test = np.array([[1, 2], [3, 4], [5, 6]])
Y_test = np.array([3, 7, 11])
W_test = np.array([1, 1])
cost = cost_function(X_test, Y_test, W_test)
if cost == 0:
  print("Proceed Further")
else:
  print("something went wrong: Reimplement a cost function")
  print("Cost function output:", cost_function(X_test, Y_test, W_test))

Proceed Further


In [5]:
def gradient_descent(X, Y, W, alpha, iterations):
    """
    Perform gradient descent to optimize the parameters of a linear regression model.
    Parameters:
      X (numpy.ndarray): Feature matrix (m x n).
      Y (numpy.ndarray): Target vector (m x 1).
      W (numpy.ndarray): Initial guess for parameters (n x 1).
      alpha (float): Learning rate.
      iterations (int): Number of iterations for gradient descent.
    Returns:
      tuple: A tuple containing the final optimized parameters (W_update) and the history of cost values.
      W_update (numpy.ndarray): Updated parameters (n x 1).
      cost_history (list): History of cost values over iterations.
    """
    # Initialize cost history
    cost_history = [0] * iterations
    W_START = 0
    # Number of samples
    m = len(Y)
    for iteration in range(iterations):
        # Step 1: Hypothesis Values
        Y_pred = X.dot(W)
        # Step 2: Difference between Hypothesis and Actual Y
        loss = Y_pred - Y
        # Step 3: Gradient Calculation
        dw = (2 / m) * X.T.dot(loss)
        # Step 4: Updating Values of W using Gradient
        W_update = W_START - alpha * dw
        # Step 5: New Cost Value
        cost = cost_function(X, Y, W_update)
        cost_history[iteration] = cost
    return W_update, cost_history

In [6]:
# Generate random test data
np.random.seed(0) # For reproducibility
X = np.random.rand(100, 3) # 100 samples, 3 features
Y = np.random.rand(100)
W = np.random.rand(3) # Initial guess for parameters
# Set hyperparameters
alpha = 0.01
iterations = 1000
# Test the gradient_descent function
final_params, cost_history = gradient_descent(X, Y, W, alpha, iterations)
# Print the final parameters and cost history
print("Final Parameters:", final_params)
print("Cost History:", cost_history)

Final Parameters: [-0.00321979 -0.00367639 -0.0026994 ]
Cost History: [0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.30907277044736753, 0.309

In [7]:
# Model Evaluation - RMSE
def rmse(Y, Y_pred):
    """
    This Function calculates the Root Mean Squres.
    Input Arguments:
      Y: Array of actual(Target) Dependent Varaibles.
      Y_pred: Array of predeicted Dependent Varaibles.
    Output Arguments:
      rmse: Root Mean Square.
    """
    errors = Y - Y_pred
    squared_errors = errors**2
    mse = np.mean(squared_errors)
    rmse = np.sqrt(mse)
    return rmse

In [8]:
# Model Evaluation - R2
def r2(Y, Y_pred):
    """
    This Function calculates the R Squared Error.
    Input Arguments:
      Y: Array of actual(Target) Dependent Varaibles.
      Y_pred: Array of predeicted Dependent Varaibles.
    Output Arguments:
      rsquared: R Squared Error.
    """
    mean_y = np.mean(Y)
    ss_tot = np.sum((Y - mean_y) ** 2)
    ss_res = np.sum((Y - Y_pred) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2

In [9]:
# Main Function
def main():
  # Step 1: Load the dataset
  data = pd.read_csv('../datasets/student.csv')

  # Step 2: Split the data into features (X) and target (Y)
  X = data[['Math', 'Reading']].values # Features: Math and Reading marks
  Y = data['Writing'].values # Target: Writing marks

  # Step 3: Split the data into training and test sets (80% train, 20% test)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_seed=42)

  # Step 4: Initialize weights (W) to zeros, learning rate and number of iterations
  W = np.zeros(X_train.shape[1]) # Initialize weights
  alpha = 0.00001 # Learning rate
  iterations = 1000 # Number of iterations for gradient descent

  # Step 5: Perform Gradient Descent
  W_optimal, cost_history = gradient_descent(X_train, Y_train, W, alpha, iterations)

  # Step 6: Make predictions on the test set
  Y_pred = np.dot(X_test, W_optimal)

  # Step 7: Evaluate the model using RMSE and R-Squared
  model_rmse = rmse(Y_test, Y_pred)
  model_r2 = r2(Y_test, Y_pred)

  # Step 8: Output the results
  print("Final Weights:", W_optimal)
  print("Cost History (First 10 iterations):", cost_history[:10])
  print("RMSE on Test Set:", model_rmse)
  print("R-Squared on Test Set:", model_r2)

if __name__ == "__main__":
  main()

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/student.csv'