In [None]:
from google.colab import drive
# Attach Google Drive to the Colab VM; the dataset is later read from a path
# underneath this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Location of the dataset on the mounted Google Drive.
file_path = '/content/drive/MyDrive/5CS037/medical_students_dataset.csv'
df = pd.read_csv(file_path)

print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nLast 5 rows of the dataset:")
print(df.tail())

print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\nDescriptive Statistics of the Dataset:")
print(df.describe())

# Separate the label column from the remaining columns and convert both to
# NumPy arrays.
target_column = 'Diabetes'
X = df.drop(columns=[target_column]).values
y = df[target_column].values

print(f"\nFeature matrix (X) shape: {X.shape}")
print(f"Label vector (y) shape: {y.shape}")



First 5 rows of the dataset:
   Student ID   Age  Gender      Height     Weight Blood Type        BMI  \
0         1.0  18.0  Female  161.777924  72.354947          O  27.645835   
1         2.0   NaN    Male  152.069157  47.630941          B        NaN   
2         3.0  32.0  Female  182.537664  55.741083          A  16.729017   
3         NaN  30.0    Male  182.112867  63.332207          B  19.096042   
4         5.0  23.0  Female         NaN  46.234173          O        NaN   

   Temperature  Heart Rate  Blood Pressure  Cholesterol Diabetes Smoking  
0          NaN        95.0           109.0        203.0       No     NaN  
1    98.714977        93.0           104.0        163.0       No      No  
2    98.260293        76.0           130.0        216.0      Yes      No  
3    98.839605        99.0           112.0        141.0       No     Yes  
4    98.480008        95.0             NaN        231.0       No      No  

Last 5 rows of the dataset:
        Student ID   Age  Gender  

In [None]:
import numpy as np

# Toy problem size: d features and n samples.
d, n = 3, 5

# Features are drawn before the weights; both come from NumPy's global RNG.
X = np.random.rand(d, n)
W = np.random.rand(d, 1)

# (1 x d) @ (d x n) gives a (1 x n) row of outputs.
Y = W.T @ X

print("Feature Matrix X (d x n):", X, sep="\n")

print("\nWeight Matrix W (d x 1):", W, sep="\n")

# Transposed so the outputs are shown as an (n x 1) column.
print("\nOutput Matrix Y (n x 1):", Y.T, sep="\n")


Feature Matrix X (d x n):
[[0.32344972 0.50834122 0.86015971 0.12385641 0.58814689]
 [0.87250475 0.05568095 0.344246   0.70687875 0.16119419]
 [0.36991907 0.44254806 0.09335767 0.97689939 0.83037758]]

Weight Matrix W (d x 1):
[[0.24031402]
 [0.02264753]
 [0.91607238]]

Output Matrix Y (n x 1):
[[0.43636222]
 [0.52882861]
 [0.30002714]
 [0.94068403]
 [0.90567656]]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Use 'Gender' as the label and every other column of `df` as a feature, then
# hold out 20% of the rows as a test set (random_state fixed at 42 so the
# split is repeatable).
target_column = 'Gender'
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 160000
Test set size: 40000


In [None]:
import numpy as np

def cost_function(X, Y, W):
    """
    Compute the halved Mean Squared Error of a linear model.

    The value returned is (1 / (2 * n)) * sum((X @ W - Y) ** 2), with n = len(Y).

    Input parameters:
    X: Feature matrix (n x d) where n is the number of examples and d is the number of features.
    Y: Target values, the actual values we are trying to predict. May be a flat
       vector of length n or a column (n x 1).
    W: Weights of the model. May be a flat vector of length d or a column (d x 1).

    Output parameters:
    cost: The halved Mean Squared Error over all examples.
    """
    n_samples = len(Y)

    # Flatten predictions and targets before subtracting. Without this, a
    # column-shaped prediction (n x 1) minus a flat target (n,) -- or the
    # reverse -- broadcasts to an (n x n) matrix and silently inflates the cost.
    Y_pred = np.reshape(np.dot(X, W), -1)
    Y_true = np.reshape(Y, -1)

    squared_error = np.square(Y_pred - Y_true)

    cost = (1 / (2 * n_samples)) * np.sum(squared_error)

    return cost

# Sanity check: every target below equals the sum of its row's two features,
# so unit weights should reproduce the targets and give a cost of zero.
X_test = np.array([[1, 2], [3, 4], [5, 6]])
W_test = np.array([1, 1])
Y_test = np.array([3, 7, 11])

cost = cost_function(X_test, Y_test, W_test)

print(
    "Proceed Further"
    if cost == 0
    else "Something went wrong: Reimplement the cost function"
)

print("Cost function output:", cost)


Proceed Further
Cost function output: 0.0


In [None]:
import numpy as np

def cost_function(X, Y, W):
    """
    Compute the halved Mean Squared Error of a linear model.

    The value returned is (1 / (2 * n)) * sum((X @ W - Y) ** 2), with n = len(Y).

    Input parameters:
    X: Feature matrix (n x d) where n is the number of examples and d is the number of features.
    Y: Target values, the actual values we are trying to predict. May be a flat
       vector of length n or a column (n x 1).
    W: Weights of the model. May be a flat vector of length d or a column (d x 1).

    Output parameters:
    cost: The halved Mean Squared Error over all examples.
    """
    n_samples = len(Y)

    # Compare predictions and targets as flat vectors: subtracting a flat (n,)
    # array from a column (n x 1) array would otherwise broadcast to (n x n)
    # and produce a wrong, much larger cost without raising any error.
    Y_pred = np.reshape(np.dot(X, W), -1)
    Y_true = np.reshape(Y, -1)

    squared_error = np.square(Y_pred - Y_true)

    cost = (1 / (2 * n_samples)) * np.sum(squared_error)

    return cost

def gradient_descent(X, Y, W, alpha, iterations):
    """
    Perform batch gradient descent to optimize the parameters of a linear regression model.

    Every iteration computes the predictions X @ W, forms the gradient
    (1 / m) * X.T @ (X @ W - Y) with m = len(Y), moves W by -alpha times that
    gradient, and records the cost of the updated parameters.

    Parameters:
    X (numpy.ndarray): Feature matrix (m x n).
    Y (numpy.ndarray): Target values, flat (m,) or column (m x 1).
    W (numpy.ndarray): Initial guess for parameters, flat (n,) or column (n x 1).
    alpha (float): Learning rate.
    iterations (int): Number of iterations for gradient descent.

    Returns:
    tuple: The final parameters (same shape as the initial W) and the list of
    cost values, one per iteration.

    Raises:
    ValueError: If Y cannot be reshaped to the shape of the predictions X @ W.
    """
    m = len(Y)

    # Give the targets the same shape as the predictions. Mixing a flat (m,)
    # array with a column (m x 1) array would otherwise broadcast the loss to
    # (m x m), corrupting both the gradient and the shape of W.
    Y = np.reshape(Y, np.shape(np.dot(X, W)))

    cost_history = []

    for _ in range(iterations):
        loss = np.dot(X, W) - Y

        dw = (1 / m) * np.dot(X.T, loss)

        W = W - alpha * dw

        # Cost is measured after the update, so entry i reflects iteration i's result.
        cost_history.append(cost_function(X, Y, W))

    return W, cost_history


# Seed NumPy's global RNG so the synthetic data below is the same on every run.
np.random.seed(0)
# The arrays are drawn in this order (X, then Y, then W); reordering the three
# calls would change the generated values.
X = np.random.rand(100, 3)
Y = np.random.rand(100)
W = np.random.rand(3)


# Learning rate and number of update steps passed to gradient_descent.
alpha = 0.01
iterations = 1000

final_params, cost_history = gradient_descent(X, Y, W, alpha, iterations)

print("Final Parameters:", final_params)
# Only the last ten recorded cost values are shown.
print("Cost History:", cost_history[-10:])


Final Parameters: [0.20551667 0.54295081 0.10388027]
Cost History: [0.054383665685533336, 0.0543804494134437, 0.054377238812615865, 0.05437403387293539, 0.054370834584306166, 0.05436764093665037, 0.054364452919908414, 0.05436127052403898, 0.05435809373901896, 0.05435492255484332]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def cost_function(X, Y, W):
    """
    Return the halved Mean Squared Error, (1 / (2 * m)) * sum((X @ W - Y) ** 2).

    X is the (m x d) feature matrix; Y holds the m targets and W the d weights.
    Y and W may each be flat vectors or single-column matrices.
    """
    m = len(Y)
    # Flatten both sides so a column (m x 1) and a flat (m,) array are compared
    # element by element instead of broadcasting to an (m x m) matrix.
    Y_pred = np.reshape(np.dot(X, W), -1)
    Y_true = np.reshape(Y, -1)
    cost = (1/(2*m)) * np.sum((Y_pred - Y_true) ** 2)
    return cost

def gradient_descent(X, Y, W, alpha, iterations):
    """
    Run batch gradient descent for linear regression.

    Parameters:
    X: Feature matrix (m x n).
    Y: Target values, flat (m,) or column (m x 1).
    W: Initial weights, flat (n,) or column (n x 1).
    alpha: Learning rate.
    iterations: Number of update steps.

    Returns:
    (W, cost_history): the updated weights, in the same shape as the initial W,
    and the cost recorded after each update.

    Raises:
    ValueError: If Y cannot be reshaped to the shape of the predictions X @ W.
    """
    m = len(Y)
    # Align the targets with the prediction shape so that mixing flat and
    # column arrays cannot broadcast the loss to an (m x m) matrix.
    Y = np.reshape(Y, np.shape(np.dot(X, W)))
    cost_history = []
    for _ in range(iterations):
        loss = np.dot(X, W) - Y
        dw = (1/m) * np.dot(X.T, loss)
        W = W - alpha * dw
        cost_history.append(cost_function(X, Y, W))
    return W, cost_history

def rmse(Y, Y_pred):
    """Return the root of the mean squared difference between Y and Y_pred."""
    residuals = Y - Y_pred
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

def r2(Y, Y_pred):
    """
    Return the coefficient of determination, 1 - SS_res / SS_tot.

    SS_tot is the sum of squared deviations of Y from its mean and SS_res is
    the sum of squared differences between Y and Y_pred.

    When every value in Y is identical, SS_tot is zero and the ratio is
    undefined. In that case the function returns 1.0 if the predictions match
    Y exactly and 0.0 otherwise, instead of dividing by zero.
    """
    mean_y = np.mean(Y)
    ss_tot = np.sum((Y - mean_y) ** 2)
    ss_res = np.sum((Y - Y_pred) ** 2)
    if ss_tot == 0:
        # Constant targets: avoid 0/0 (nan) or x/0 (-inf). The finite fallback
        # values mirror the default behaviour of scikit-learn's r2_score.
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)


def main():
    """
    Fit Height ~ Gender + Age with gradient descent and report test metrics.

    Reads the dataset from the mounted Google Drive, keeps the rows where
    Gender, Age and Height are all present, splits them 80/20 into training and
    test sets, trains a linear model with an intercept, and prints the final
    weights, the first ten cost values, and the RMSE and R-squared on the test
    set.
    """

    # Absolute path: the relative form 'content/drive/...' is resolved against
    # the current working directory and raises FileNotFoundError.
    data = pd.read_csv('/content/drive/MyDrive/5CS037/medical_students_dataset.csv')

    # Rows with a missing value in any used column are dropped; a single NaN
    # would otherwise propagate through np.dot and turn every weight into NaN.
    subset = data[['Gender', 'Age', 'Height']].dropna()

    # Gender is stored as text, which cannot be multiplied by float weights.
    # Encode it as 1.0 for 'Male' and 0.0 otherwise.
    # NOTE(review): assumes the column only holds 'Male' / 'Female' -- confirm.
    gender_is_male = (subset['Gender'] == 'Male').astype(float).values

    X = np.c_[gender_is_male, subset['Age'].values]
    Y = subset['Height'].values

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Prepend a column of ones so the first weight acts as the intercept.
    X_train = np.c_[np.ones(X_train.shape[0]), X_train]
    X_test = np.c_[np.ones(X_test.shape[0]), X_test]

    W = np.zeros(X_train.shape[1])
    # NOTE(review): features are unscaled and the learning rate is very small,
    # so 1000 iterations may stop short of convergence -- confirm these values.
    alpha = 0.00001
    iterations = 1000

    W_optimal, cost_history = gradient_descent(X_train, Y_train, W, alpha, iterations)

    Y_pred = np.dot(X_test, W_optimal)

    model_rmse = rmse(Y_test, Y_pred)
    model_r2 = r2(Y_test, Y_pred)

    print("Final Weights:", W_optimal)
    print("Cost History (First 10 iterations):", cost_history[:10])
    print("RMSE on Test Set:", model_rmse)
    print("R-Squared on Test Set:", model_r2)

if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'content/drive/MyDrive/5CS037/medical_students_dataset.csv'