In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset,DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
torch.manual_seed(42)

<torch._C.Generator at 0x7a0bd8355810>

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [5]:
data=pd.read_csv("/content/risk_factors_cervical_cancer.csv")
data.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Biopsy
0,18,4,15,1,0,0,0,0,0,0,...,0,0,0,?,?,0,0,0,0,0
1,15,1,14,1,0,0,0,0,0,0,...,0,0,0,?,?,0,0,0,0,0
2,34,1,?,1,0,0,0,0,0,0,...,0,0,0,?,?,0,0,0,0,0
3,52,5,16,4,1,37,37,1,3,0,...,0,0,0,?,?,1,0,1,0,0
4,46,3,21,4,0,0,0,1,15,0,...,0,0,0,?,?,0,0,0,0,0


In [6]:
data.isnull().sum()

Unnamed: 0,0
Age,0
Number of sexual partners,0
First sexual intercourse,0
Num of pregnancies,0
Smokes,0
Smokes (years),0
Smokes (packs/year),0
Hormonal Contraceptives,0
Hormonal Contraceptives (years),0
IUD,0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 33 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Age                                 858 non-null    int64 
 1   Number of sexual partners           858 non-null    object
 2   First sexual intercourse            858 non-null    object
 3   Num of pregnancies                  858 non-null    object
 4   Smokes                              858 non-null    object
 5   Smokes (years)                      858 non-null    object
 6   Smokes (packs/year)                 858 non-null    object
 7   Hormonal Contraceptives             858 non-null    object
 8   Hormonal Contraceptives (years)     858 non-null    object
 9   IUD                                 858 non-null    object
 10  IUD (years)                         858 non-null    object
 11  STDs                                858 non-null    object

In [8]:
X=data.drop("Biopsy",axis=1)
y=data["Biopsy"]

In [9]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [23]:
#creating custom class
class customDataset(Dataset):
    def __init__(self, features, labels):
        # Find columns with question marks
        question_mark_columns = features.apply(lambda col: col.astype(str).str.contains('\?').any()).pipe(lambda series: series[series].index)

        # Replace '?' based on column type
        for column in question_mark_columns:
            # Check if column is numerical or categorical
            if pd.api.types.is_numeric_dtype(features[column]):
                # Numerical column: replace with median
                median = features[column].median()
                features[column] = features[column].replace('?', median)
                features[column] = pd.to_numeric(features[column], errors='coerce').fillna(median) # Convert to numeric and handle remaining NaNs
            else:
                # Categorical column: replace with mode
                mode = features[column].mode()[0]
                features[column] = features[column].replace('?', mode)


        self.features = torch.tensor(features.apply(pd.to_numeric, errors='coerce').fillna(0).values, dtype=torch.float32)
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


In [24]:
#create train and test dataset object
train_dataset=customDataset(X_train,y_train)
test_dataset=customDataset(X_test,y_test)

In [25]:
#creating train and test data loader
train_loader=DataLoader(train_dataset,batch_size=32,shuffle=True,pin_memory=True)
test_loader=DataLoader(test_dataset,batch_size=32,shuffle=False,pin_memory=True)

In [26]:
len(train_loader)

22

In [27]:
len(test_loader)

6

In [39]:
#custom nn model
class MyNN(nn.Module):
  def __init__(self,num_features):
    super().__init__()
    self.model=nn.Sequential(
        nn.Linear(num_features,128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, 2),

    )

  def forward(self,x):
    return self.model(x)


In [40]:
learning_rate=0.01
epochs=25

In [41]:
#instatiate the model
model=MyNN(X_train.shape[1])
model=model.to(device)
#loss
criterion=nn.CrossEntropyLoss()
#optimizer
optimizer=torch.optim.Adam(model.parameters(),learning_rate)

In [43]:
#training loop
for epoch in range(epochs):
  total_epoch_loss=0

  for batch_features,batch_labels in train_loader:
    batch_features=batch_features.to(device)
    batch_labels=batch_labels.to(device)

    outputs=model(batch_features)


    loss = criterion(outputs, batch_labels)

    #optimizer
    optimizer.zero_grad()
    loss.backward()

    #update weights
    optimizer.step()

    total_epoch_loss+=loss.item()

  print(f"Epoch {epoch+1}/{epochs}, Loss: {total_epoch_loss/len(train_loader)}")

Epoch 1/25, Loss: 0.2954622886397622
Epoch 2/25, Loss: 0.26079014858061617
Epoch 3/25, Loss: 0.24801747822626072
Epoch 4/25, Loss: 0.2477071676403284
Epoch 5/25, Loss: 0.22312088767913255
Epoch 6/25, Loss: 0.22221509333361278
Epoch 7/25, Loss: 0.2254055142402649
Epoch 8/25, Loss: 0.23966787255961786
Epoch 9/25, Loss: 0.2195035915144465
Epoch 10/25, Loss: 0.23116497018120505
Epoch 11/25, Loss: 0.24515801888297906
Epoch 12/25, Loss: 0.2237139114771377
Epoch 13/25, Loss: 0.21200894530523906
Epoch 14/25, Loss: 0.20114448209377853
Epoch 15/25, Loss: 0.20940049526027657
Epoch 16/25, Loss: 0.21388281475413928
Epoch 17/25, Loss: 0.20876412533900954
Epoch 18/25, Loss: 0.2023693783716722
Epoch 19/25, Loss: 0.20038405162366954
Epoch 20/25, Loss: 0.2028498790142211
Epoch 21/25, Loss: 0.23149664056572047
Epoch 22/25, Loss: 0.20216295156966557
Epoch 23/25, Loss: 0.2016214897686785
Epoch 24/25, Loss: 0.19954790699888358
Epoch 25/25, Loss: 0.20366551200012592


In [44]:
#evaluation
model.eval()

MyNN(
  (model): Sequential(
    (0): Linear(in_features=32, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [45]:
#evaluation
total=0
correct=0

with torch.no_grad():
  for batch_features,batch_labels in test_loader:
    batch_features=batch_features.to(device)
    batch_labels=batch_labels.to(device)

    outputs=model(batch_features)

    _,predicted=torch.max(outputs,1)

    total+=batch_labels.size(0)

    correct+=(predicted==batch_labels).sum().item()

  print(f"Accuracy on test data: {100*correct/total}%")

Accuracy on test data: 93.6046511627907%


In [48]:
#evaluation

total=0
correct=0
with torch.no_grad():
  for batch_features,batch_labels in train_loader:
    batch_features=batch_features.to(device)
    batch_labels=batch_labels.to(device)

    outputs=model(batch_features)

    _,predicted=torch.max(outputs,1)

    total+=batch_labels.size(0)

    correct+=(predicted==batch_labels).sum().item()

  print(f"Accuracy on train data : {100*correct/total}%")

Accuracy on train data : 93.73177842565597%
