In [2]:
#Implementation from Scratch
#1
import pandas as pd
data = pd.read_csv("/content/drive/MyDrive/Week5/student.csv")

print("The dataset has been successfully loaded.\n")

print("Displaying the first 5 rows of the dataset:\n", data.head())
print("\nDisplaying the last 5 rows of the dataset:\n", data.tail())

print("\nDetailed information about the dataset:")
data.info()

print("\nStatistical summary of numerical columns in the dataset:\n", data.describe())

X = data[["Reading", "Writing"]]
Y = data[["Math"]]
print("\nFeature set (X) containing 'Reading' and 'Writing' scores:\n", X.head())
print("\nTarget variable (Y) containing 'Math' scores:\n", Y.head())


The dataset has been successfully loaded.

Displaying the first 5 rows of the dataset:
    Math  Reading  Writing
0    48       68       63
1    62       81       72
2    79       80       78
3    76       83       79
4    59       64       62

Displaying the last 5 rows of the dataset:
      Math  Reading  Writing
995    72       74       70
996    73       86       90
997    89       87       94
998    83       82       78
999    66       66       72

Detailed information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Math     1000 non-null   int64
 1   Reading  1000 non-null   int64
 2   Writing  1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB

Statistical summary of numerical columns in the dataset:
               Math      Reading      Writing
count  1000.000000  1000.000000  1000.000000
mean     67.290000    69.8720

In [3]:
#2
import numpy as np
import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/Week5/student.csv")

X = data[["Math"]].to_numpy().T
d = X.shape[0]

W = np.random.rand(d, 1)

Y = np.dot(W.T, X)

print("Feature Matrix (X) - Transposed 'Math' scores:")
print(X)

print("\nWeight Matrix (W) - Randomly initialized weights:")
print(W)

print("\nOutput Matrix (Y) - Result of dot product (W.T * X):")
print(Y)


Feature Matrix (X) - Transposed 'Math' scores:
[[ 48  62  79  76  59  69  70  46  61  86  62  72  56  81  61  49  60  45
   71  75  66  57  67  63  68  68  62  56  82  64  71  69  64  76  61  47
   50  75  69  42  73  78  65  51  61  69  51  64  90  58 100  73  62  45
   47  53  62  49  77  70  60  82 100  72  62  87  56  96  68  66  68  59
   52  60  60  46  62  30  83  69  82  64  57  71  37  69  72  73  70  75
   54  71  60  81  58  54  49  64  84  84  33  51  68  32  43  73  82  47
   74  36  58  68  92  75  43  68  49  69  91  83  65  79  74  81  41  48
   31  53  87  94  77  57  64  55  61  78  94  88  77  90  55  83  68  66
   56  65  84  90  80  47  67  79  68  62  62  44  74  67  46  71  73  49
   42  55  73  75  96  66  56 100  53  68  72  55  59  76  84  68  61  53
   73  61  73  63  51  70  69  61  88  90  60 100  68  85  41  70  91  68
   63  81  89  66  59  80  87  30  63  43  73  49  65  72  83  46  65 100
   40  73  58  87  61  71  72  81  67  69  53  70  56  84  44  63

In [4]:
#3
import numpy as np
import pandas as pd

data = pd.read_csv("/content/drive/MyDrive/Week5/student.csv")

train_ratio = 0.8
train_size = int(len(data) * train_ratio)

train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

X_train = train_data.drop('Math', axis=1)
y_train = train_data['Math']

X_test = test_data.drop('Math', axis=1)
y_test = test_data['Math']

print(f"Training data size (X_train): {len(X_train)} rows")
print(f"Test data size (X_test): {len(X_test)} rows")


Training data size (X_train): 800 rows
Test data size (X_test): 200 rows


In [5]:
import numpy as np
import pandas as pd

def gradient_descent(X, y, W, learning_rate, num_iterations):
    m = len(y)
    for i in range(num_iterations):
        predictions = X.dot(W)
        errors = predictions - y
        gradient = (1/m) * X.T.dot(errors)
        W -= learning_rate * gradient
        if (i+1) % 100 == 0:
            loss = np.mean(errors**2)
            print(f"Iteration {i+1}: Loss = {loss}")
    return W

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def r2_score_custom(y_true, y_pred):
    total_variance = np.sum((y_true - np.mean(y_true)) ** 2)
    unexplained_variance = np.sum((y_true - y_pred) ** 2)
    return 1 - (unexplained_variance / total_variance)

def train_and_evaluate_model(data, target_column, learning_rate=0.01, num_iterations=1000, train_ratio=0.8):
    X = data.drop(target_column, axis=1).values
    y = data[target_column].values.reshape(-1, 1)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    X = np.c_[np.ones((X.shape[0], 1)), X]
    train_size = int(len(data) * train_ratio)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    W = np.zeros((X_train.shape[1], 1))
    print("Starting Gradient Descent...")
    W = gradient_descent(X_train, y_train, W, learning_rate, num_iterations)
    print("Gradient Descent Completed.\n")
    y_pred = X_test.dot(W)
    rmse_value = rmse(y_test, y_pred)
    r2_value = r2_score_custom(y_test, y_pred)
    print(f"Model Evaluation Results:")
    print(f"RMSE (Root Mean Squared Error): {rmse_value:.4f}")
    print(f"R² (Coefficient of Determination): {r2_value:.4f}\n")
    return W

data = pd.read_csv("/content/drive/MyDrive/Week5/student.csv")
W = train_and_evaluate_model(data, target_column='Math', learning_rate=0.01, num_iterations=1000)


Starting Gradient Descent...
Iteration 100: Loss = 699.2962963635672
Iteration 200: Loss = 162.49871532444774
Iteration 300: Loss = 90.90732975530241
Iteration 400: Loss = 81.3166534173156
Iteration 500: Loss = 80.02741601616349
Iteration 600: Loss = 79.85066715182437
Iteration 700: Loss = 79.82332612061171
Iteration 800: Loss = 79.81633321119472
Iteration 900: Loss = 79.81236775342418
Iteration 1000: Loss = 79.80908195225282
Gradient Descent Completed.

Model Evaluation Results:
RMSE (Root Mean Squared Error): 8.0630
R² (Coefficient of Determination): 0.6688



In [6]:
import numpy as np
import pandas as pd

def gradient_descent(X, y, W, learning_rate, num_iterations):
    m = len(y)
    for i in range(num_iterations):
        predictions = X.dot(W)
        errors = predictions - y
        gradient = (1/m) * X.T.dot(errors)
        W -= learning_rate * gradient
        if (i+1) % 100 == 0:
            loss = np.mean(errors**2)
            print(f"Iteration {i+1}: Loss = {loss}")
    return W

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

def r2_score_custom(y_true, y_pred):
    total_variance = np.sum((y_true - np.mean(y_true)) ** 2)
    unexplained_variance = np.sum((y_true - y_pred) ** 2)
    return 1 - (unexplained_variance / total_variance)

def train_and_evaluate_model(data, target_column, learning_rate=0.01, num_iterations=1000, train_ratio=0.8):
    X = data.drop(target_column, axis=1).values
    y = data[target_column].values.reshape(-1, 1)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    X = np.c_[np.ones((X.shape[0], 1)), X]
    train_size = int(len(data) * train_ratio)
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    W = np.zeros((X_train.shape[1], 1))
    print("Starting Gradient Descent...")
    W = gradient_descent(X_train, y_train, W, learning_rate, num_iterations)
    print("Gradient Descent Completed.\n")
    y_pred = X_test.dot(W)
    rmse_value = rmse(y_test, y_pred)
    r2_value = r2_score_custom(y_test, y_pred)
    print(f"Model Evaluation Results:")
    print(f"Learning Rate: {learning_rate}")
    print(f"RMSE (Root Mean Squared Error): {rmse_value:.4f}")
    print(f"R² (Coefficient of Determination): {r2_value:.4f}\n")
    return W

data = pd.read_csv("/content/drive/MyDrive/Week5/student.csv")

learning_rates = [0.001, 0.01, 0.1, 0.5, 1]

for lr in learning_rates:
    print(f"Evaluating model with learning rate: {lr}")
    W = train_and_evaluate_model(data, target_column='Math', learning_rate=lr, num_iterations=1000)
    print("-" * 50)


Evaluating model with learning rate: 0.001
Starting Gradient Descent...
Iteration 100: Loss = 3889.0670099762856
Iteration 200: Loss = 3181.745091095462
Iteration 300: Loss = 2608.1846780395454
Iteration 400: Loss = 2142.3042431056147
Iteration 500: Loss = 1763.3582670188446
Iteration 600: Loss = 1454.7681111661116
Iteration 700: Loss = 1203.2318619387745
Iteration 800: Loss = 998.0403036668321
Iteration 900: Loss = 830.546782573105
Iteration 1000: Loss = 693.7531351203352
Gradient Descent Completed.

Model Evaluation Results:
Learning Rate: 0.001
RMSE (Root Mean Squared Error): 26.4877
R² (Coefficient of Determination): -2.5742

--------------------------------------------------
Evaluating model with learning rate: 0.01
Starting Gradient Descent...
Iteration 100: Loss = 699.2962963635672
Iteration 200: Loss = 162.49871532444774
Iteration 300: Loss = 90.90732975530241
Iteration 400: Loss = 81.3166534173156
Iteration 500: Loss = 80.02741601616349
Iteration 600: Loss = 79.85066715182437


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
