In [1]:
import pandas as pd
import numpy as np

In [3]:
#toDo1
# Read and observe the dataset.
# pandas abbreviates long frames when printing, so this shows only the first
# and last few rows plus the overall dimensions.
data = pd.read_csv('student.csv')
print(data)

# Print the top (5) and bottom (5) rows of the dataset.
print(data.head())
print(data.tail())

# Print the information of the dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
data.info()

# Print the descriptive statistics of the dataset.
print(data.describe())

# Split the data into features (X) and label (Y).
# Assumes the last column is the label and every other column is a feature.
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]
print("X shape:", X.shape)
print("Y shape:", Y.shape)


     Math  Reading  Writing
0      48       68       63
1      62       81       72
2      79       80       78
3      76       83       79
4      59       64       62
..    ...      ...      ...
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72

[1000 rows x 3 columns]
   Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62
     Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage:

In [5]:
#toDo-3
'''Split the dataset into training and test sets.
You can use an 80-20 or 70-30 split, with 80% (or 70%) of the data used for training and the rest
for testing.'''

# Function to split data into training and testing sets
def train_test_split(X, y, test_size=0.3, random_state=42):
    """Randomly split features and labels into training and testing subsets.

    Parameters:
    X: Feature data with one row per sample. NumPy arrays and pandas
       DataFrames/Series are sliced directly; any other sequence is first
       converted with np.asarray.
    y: Labels, with exactly one entry per row of X.
    test_size: Fraction of the samples (between 0 and 1) placed in the test
       set. The resulting count is truncated towards zero.
    random_state: Seed for the shuffle. The same seed always yields the same
       split.

    Returns:
    X_train, X_test, y_train, y_test (in that order).

    Raises:
    ValueError: if X and y differ in length, or test_size is outside [0, 1].
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            "X and y must have the same number of samples "
            f"(got {n_samples} and {len(y)})"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1 (got {test_size})")

    # Use a private generator instead of np.random.seed so the caller's global
    # random state is left untouched. RandomState(seed) produces the same
    # permutation that np.random.seed(seed) + np.random.shuffle produced.
    rng = np.random.RandomState(random_state)

    # Shuffle the row positions, then cut the shuffled order in two.
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # Find how many samples go into the test set
    test_size_count = int(n_samples * test_size)

    # First part -> test data, remaining -> training data
    test_indices = indices[:test_size_count]
    train_indices = indices[test_size_count:]

    def _take_rows(values, rows):
        # Select rows by position regardless of the container type.
        if hasattr(values, "iloc"):
            # Plain [] on a DataFrame selects columns, and on a Series it looks
            # up index labels; .iloc is always positional.
            return values.iloc[rows]
        if isinstance(values, np.ndarray):
            return values[rows]
        return np.asarray(values)[rows]

    X_train = _take_rows(X, train_indices)
    X_test = _take_rows(X, test_indices)
    y_train = _take_rows(y, train_indices)
    y_test = _take_rows(y, test_indices)

    # Return training and testing data
    return X_train, X_test, y_train, y_test


In [11]:
# Step -2- Build a Cost Function:
def cost_function(X, Y, W):

  """
  This function finds the (halved) Mean Square Error of a linear model:
  L(W) = 1/(2n) * sum((X @ W - Y)^2), where n is the number of data points.
  Input parameters:
  X: Feature Matrix, one row per data point.
  Y: Target values, either 1-D (n,) or a column vector (n, 1).
  W: Weight Matrix, either 1-D (n_features,) or a column (n_features, 1).
  Output Parameters:
  cost: accumulated mean square error (a scalar).
  Raises:
  ValueError: if Y does not hold exactly one target per prediction.
  """
  # Predict output using y_pred = X @ W Matrix Multiplication
  y_pred = X @ W

  # A column-vector Y against 1-D predictions (or the reverse) would silently
  # broadcast to an (n, n) error matrix and inflate the cost, so bring Y to the
  # prediction's shape first. Inputs whose shapes already agree are untouched.
  if np.shape(Y) != np.shape(y_pred):
    if np.size(Y) != np.size(y_pred):
      raise ValueError(
        f"Y has shape {np.shape(Y)} but the predictions have shape {np.shape(y_pred)}"
      )
    Y = np.asarray(Y).reshape(np.shape(y_pred))

  n = np.shape(Y)[0]        # Number of data points

  # Calculate error (difference between predicted and actual)
  error = y_pred - Y

  # Calculate Mean Squared Error cost: L(W) = 1/(2n) * sum((y_pred - Y)^2)
  cost = np.sum(error**2) / (2 * n)

  return cost

In [17]:
def gradient_descent(X, Y, W, alpha, iterations):


  """
  Perform gradient descent to optimize the parameters of a linear regression model.
  Parameters:
  X (numpy.ndarray): Feature matrix (m x n).
  Y (numpy.ndarray): Target vector, 1-D (m,) or a column (m x 1).
  W (numpy.ndarray): Initial guess for parameters, 1-D (n,) or a column (n x 1).
      It is copied, so the caller's array is not modified.
  alpha (float): Learning rate.
  iterations (int): Number of iterations for gradient descent.
  Returns tuple: A tuple containing the final optimized parameters (W_update) and the history of cost values.
  W_update (numpy.ndarray): Updated parameters, same shape as W.
  cost_history (list): Cost after each update, one entry per iteration.
  Raises:
  ValueError: if Y does not hold exactly one target per prediction.
  """
  W_update = W.copy()

  # A column-vector Y combined with 1-D weights (or the reverse) would make
  # `Y_pred - Y` broadcast to an (m, m) matrix and silently turn the weights
  # into an (n, m) array. Bring Y to the prediction's shape once, up front.
  # Inputs whose shapes already agree are left untouched.
  pred_shape = np.shape(X @ W_update)
  if np.shape(Y) != pred_shape:
    if np.size(Y) != int(np.prod(pred_shape)):
      raise ValueError(
        f"Y has shape {np.shape(Y)} but the predictions have shape {pred_shape}"
      )
    Y = np.asarray(Y).reshape(pred_shape)

  # Initialize cost history
  cost_history = [0] * iterations
  # Number of samples
  m = len(Y)
  for iteration in range(iterations):
    # Step 1: Hypothesis Values
    Y_pred = X @ W_update

    # Step 2: Difference between Hypothesis and Actual Y
    loss = Y_pred - Y

    # Step 3: Gradient Calculation
    dw = (1/m) * X.T.dot(loss)

    # Step 4: Updating Values of W using Gradient
    W_update = W_update - alpha * dw

    # Step 5: New Cost Value (evaluated with the weights just updated)
    cost_history[iteration] = cost_function(X, Y, W_update)
  return W_update, cost_history



In [18]:
# Model Evaluation - RMSE
def rmse(Y, Y_pred):
  """
  This Function calculates the Root Mean Squared Error.
  Input Arguments:
  Y: Array of actual (Target) Dependent Variables.
  Y_pred: Array of predicted Dependent Variables.
  Output Arguments:
  rmse: Root Mean Squared Error. A scalar for 1-D inputs; for 2-D inputs the
        reduction runs over the rows, giving one value per column.
  Raises:
  ValueError: if the inputs are empty or hold a different number of values.
  """
  # A column-vector Y against 1-D predictions (or the reverse) would broadcast
  # to an (n, n) matrix, so bring Y to the prediction's shape first. Inputs
  # whose shapes already agree are left untouched.
  if np.shape(Y) != np.shape(Y_pred):
    if np.size(Y) != np.size(Y_pred):
      raise ValueError(
        f"Y has shape {np.shape(Y)} but Y_pred has shape {np.shape(Y_pred)}"
      )
    Y = np.asarray(Y).reshape(np.shape(Y_pred))

  n_samples = len(Y)
  if n_samples == 0:
    raise ValueError("rmse requires at least one sample")

  squared_error = (Y - Y_pred)**2
  # np.sum(..., axis=0) reduces over rows like the builtin sum() would, but
  # vectorised instead of iterating element by element in Python.
  return np.sqrt(np.sum(squared_error, axis=0) / n_samples)

In [20]:
# Model Evaluation - R2
def r2(Y, Y_pred):
  """
  This Function calculates the R Squared score (coefficient of determination):
  1 - sum((Y - Y_pred)^2) / sum((Y - mean(Y))^2).
  Input Arguments:
  Y: Array of actual (Target) Dependent Variables.
  Y_pred: Array of predicted Dependent Variables.
  Output Arguments:
  rsquared: R Squared score. A scalar for 1-D inputs; for 2-D inputs the sums
            run over the rows, giving one value per column.
  Raises:
  ValueError: if the inputs hold a different number of values.
  Note: when every value of Y is identical the total sum of squares is zero,
  so the division is by zero (NumPy floats yield nan or -inf with a warning).
  """
  # A column-vector Y against 1-D predictions (or the reverse) would broadcast
  # to an (n, n) matrix, so bring Y to the prediction's shape first. Inputs
  # whose shapes already agree are left untouched.
  if np.shape(Y) != np.shape(Y_pred):
    if np.size(Y) != np.size(Y_pred):
      raise ValueError(
        f"Y has shape {np.shape(Y)} but Y_pred has shape {np.shape(Y_pred)}"
      )
    Y = np.asarray(Y).reshape(np.shape(Y_pred))

  mean_y = np.mean(Y)
  # np.sum(..., axis=0) reduces over rows like the builtin sum() would, but
  # vectorised instead of iterating element by element in Python.
  ss_tot = np.sum((Y - mean_y)**2, axis=0)   # total variation around the mean
  ss_res = np.sum((Y - Y_pred)**2, axis=0)   # variation left unexplained
  return 1 - ss_res / ss_tot

In [21]:
def main(data_path='student.csv', test_size=0.2, alpha=0.0001, iterations=1000, random_state=42):
  """
  Train and evaluate a linear regression model that predicts Writing marks
  from Math and Reading marks, then print the results.

  The model has no intercept term: predictions are X @ W only.

  Parameters:
  data_path (str): CSV file with 'Math', 'Reading' and 'Writing' columns.
  test_size (float): Fraction of the rows held out for testing.
  alpha (float): Learning rate for gradient descent.
  iterations (int): Number of gradient descent iterations.
  random_state (int): Seed for the train/test shuffle.

  Returns:
  None. The weights, first cost values, RMSE and R-Squared are printed.
  """
  # Step 1: Load the dataset
  data = pd.read_csv(data_path)

  # Step 2: Split the data into features (X) and target (Y)
  X = data[['Math', 'Reading']].values # Features: Math and Reading marks
  Y = data['Writing'].values # Target: Writing marks

  # Step 3: Split the data into training and test sets (80% train, 20% test by default)
  X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=random_state
  )

  # Step 4: Initialize weights (W) to zeros, one weight per feature column
  W = np.zeros(X_train.shape[1])

  # Step 5: Perform Gradient Descent
  W_optimal, cost_history = gradient_descent(X_train, Y_train, W, alpha, iterations)

  # Step 6: Make predictions on the test set
  Y_pred = np.dot(X_test, W_optimal)

  # Step 7: Evaluate the model using RMSE and R-Squared
  model_rmse = rmse(Y_test, Y_pred)
  model_r2 = r2(Y_test, Y_pred)

  # Step 8: Output the results
  print("Final Weights:", W_optimal)
  # Convert to plain floats so the list prints as bare numbers instead of
  # np.float64(...) wrappers under NumPy 2.
  print("Cost History (First 10 iterations):", [float(cost) for cost in cost_history[:10]])
  print("RMSE on Test Set:", model_rmse)
  print("R-Squared on Test Set:", model_r2)

# Execute the main function
if __name__ == "__main__":
  main()

Final Weights: [0.0894932  0.89504864]
Cost History (First 10 iterations): [np.float64(17.813797177522098), np.float64(16.9831490248783), np.float64(16.925140245010397), np.float64(16.867870818076216), np.float64(16.811093513105355), np.float64(16.754804026075387), np.float64(16.69899816573971), np.float64(16.64367177688582), np.float64(16.588820740001896), np.float64(16.53444097097003)]
RMSE on Test Set: 4.792607360540953
R-Squared on Test Set: 0.908240340333986
