<a href="https://colab.research.google.com/github/saigowtham627/Fundementals/blob/main/PyTorch_4_Training_PipeLine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Code Flow
*1. Load the dataset*

*2. Basic Preprocessing*

*3. Training Process*

    *a. Create the model*
    *b. Forward pass*
    *c. Loss computation*
    *d. Back Propagagation*
    *e. Parameters updation*
*4. Model Evaluation*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
#Breast cancer dataset
df = pd.read_csv('https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')
df.sample(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
152,8710441,B,9.731,15.34,63.78,300.2,0.1072,0.1599,0.4108,0.07857,...,19.49,71.04,380.5,0.1292,0.2772,0.8216,0.1571,0.3108,0.1259,
194,87556202,M,14.86,23.21,100.4,671.4,0.1044,0.198,0.1697,0.08878,...,27.78,118.6,784.7,0.1316,0.4648,0.4589,0.1727,0.3,0.08701,
430,907914,M,14.9,22.53,102.1,685.0,0.09947,0.2225,0.2733,0.09711,...,27.57,125.4,832.7,0.1419,0.709,0.9019,0.2475,0.2866,0.1155,
505,915276,B,9.676,13.14,64.12,272.5,0.1255,0.2204,0.1188,0.07038,...,18.04,69.47,328.1,0.2006,0.3663,0.2913,0.1075,0.2848,0.1364,
233,88206102,M,20.51,27.81,134.4,1319.0,0.09159,0.1074,0.1554,0.0834,...,37.38,162.7,1872.0,0.1223,0.2761,0.4146,0.1563,0.2437,0.08328,


In [3]:
df.shape #569 rows and 33 columns

(569, 33)

In [4]:
df.isna().sum()

Unnamed: 0,0
id,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0


In [5]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [6]:
#id and unnamed32 are not required
df = df.drop(columns = ['id', 'Unnamed: 32'], axis = 0) #We have specify as columns not, labels

#Train test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2)

#Scaling

In [8]:
scaler = StandardScaler() #Initializing the Standard scaler
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [9]:
X_train

array([[-7.10506637e-01, -7.76811484e-01, -6.75707803e-01, ...,
        -7.35919918e-02, -1.92151002e-01, -5.29017213e-01],
       [-3.10110003e-01, -1.43994981e-03, -3.87560194e-01, ...,
        -1.58605439e+00, -7.46950230e-01, -1.14675235e+00],
       [-3.51817986e-01, -8.00379007e-01, -3.42020782e-01, ...,
        -2.15114281e-01,  1.02266800e+00, -1.53924961e-01],
       ...,
       [-1.22991005e+00, -8.59297817e-01, -1.23830090e+00, ...,
        -9.70051999e-01, -1.06061467e-01, -1.22983922e-01],
       [-8.76674295e-02, -9.74778684e-01, -1.30443867e-01, ...,
        -2.34440443e-01,  7.56831080e-02, -8.08572219e-01],
       [ 1.59799934e-01,  1.43853576e+00,  9.96712166e-02, ...,
        -8.88094423e-02, -2.95777294e-01, -7.84687908e-01]])

In [10]:
y_train

Unnamed: 0,diagnosis
422,B
58,B
143,B
427,B
461,M
...,...
184,M
40,M
273,B
279,B


*Target features are alphabetical, M and B. So we have to encode them into numeric form*

In [11]:
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

In [12]:
y_train

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,

*X_train, Y_train, X_test, Y_test are NumPy arrays, we need to convert them into PyTorch tensors*

In [13]:
X_train_tensor = torch.from_numpy(X_train)
X_test_tensor = torch.from_numpy(X_test)
y_train_tensor = torch.from_numpy(y_train)
y_test_tensor = torch.from_numpy(y_test)

#Defining the Model

In [24]:
import math

In [41]:
class MySimpleNN():

    def __init__(self, X):
        # Define weights and biases
        # Since we have 30 features, and we have only 1 neuron, We need 30 weights and 1 bias...As each neuron will have a bias
        self.weights = torch.rand(X.shape[1], 1, dtype=torch.float64, requires_grad=True) # requires_grad is set to True, because we are calculating gradients of Loss functions wrt to Weights
        self.bias = torch.rand(1, dtype=torch.float64, requires_grad=True)

        # We need 30 weights, each one for each feature ..So weight vector dimension would be (30, 1) vector. 30 is X.shape[1]
        # We need only 1 bias, i.e; for the single neuron, Since it is a scalar, We kept it as a single number

    def forward_pass(self, X):  # Step 1: Z = W.X+b......Step 2: Sigmoid(Z)
        # Calculate Z
        Z = torch.matmul(X, self.weights) + self.bias
        # Calculate y_pred
        y_pred = torch.sigmoid(Z)

        return y_pred

    def loss_function(self, y_pred, y):
        # Clamp predictions to avoid log(0)
        epsilon = 1e-7
        y_pred = torch.clamp(y_pred, epsilon, 1 - epsilon)

        # Calculate loss
        loss = -(y * torch.log(y_pred) + (1 - y) * torch.log(1 - y_pred)).mean()
        return loss

#Defining parameters

In [49]:
learning_rate = 0.1
epochs = 5000

#Training Pipeline

In [None]:
# Ensuring they are tensors without requiring gradient computation
X_train_tensor = X_train_tensor.clone().detach().type(torch.float64)
y_train_tensor = y_train_tensor.clone().detach().type(torch.float64)

# To create the training pipeline, we need a model
# So, we will initialize a model
model = MySimpleNN(X_train_tensor)

# We do the 4 things in a loop. How many times? Ans: No of epochs
for epoch in range(epochs):

    # Forward pass
    y_pred = model.forward_pass(X_train_tensor)

    # Loss computation
    # We have Original target(label) and y_pred, So we can calculate the loss
    loss = model.loss_function(y_pred, y_train_tensor)

    # Backward pass
    loss.backward()

    # Parameter update
    with torch.no_grad():
        model.weights -= learning_rate * model.weights.grad
        model.bias -= learning_rate * model.bias.grad

    # To avoid gradient accumulation
    model.weights.grad.zero_()
    model.bias.grad.zero_()

    # Print loss in each epoch
    print(f'Epoch : {epoch + 1}. Loss: {loss.item()}')


In [52]:
model.weights
model.bias

tensor([-0.4701], dtype=torch.float64, requires_grad=True)

#Model Evaluation

In [63]:
from sklearn.metrics import roc_curve

# Compute ROC curve and thresholds
fpr, tpr, thresholds = roc_curve(y_test_tensor.numpy(), y_pred.numpy())

# Find the threshold closest to top-left corner
optimal_idx = (tpr - fpr).argmax()
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold}")


Optimal Threshold: 0.3856090123673645


In [64]:
from sklearn.metrics import precision_recall_curve

# Compute precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_test_tensor.numpy(), y_pred.numpy())

# Compute F1-score for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_idx = f1_scores.argmax()
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal Threshold: {optimal_threshold}")


Optimal Threshold: 0.3856090123673645


In [66]:
#Model evaluation
with torch.no_grad():
  y_pred = model.forward_pass(X_test_tensor)
  y_pred = (y_pred > 0.3856090123673645).float()

  #Accuracy calculation
  accuracy = (y_pred == y_test_tensor).float().mean()
  print(f'Accuracy: {accuracy.item()}')

Accuracy: 0.5430901646614075
