In [21]:
import pandas as pd

df = pd.read_csv("breast.csv", low_memory=False)


In [22]:
df = df.dropna(thresh=len(df) * 0.5, axis=1)

In [23]:
# 1. Identify object (string) columns
object_cols = df.select_dtypes(include=['object']).columns.tolist()
object_cols

['SITEO2V', 'ICDOT10V', 'PLC_BRTH_CNTRY', 'PLC_BRTH_STATE']

In [24]:
# 2. One-hot encode them
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [25]:
# 1. Categorical columns
categorical_cols = ['MAR_STAT', 'RACE', 'ORIGIN', 'DX_CONF', 'RAC_RECA', 'RAC_RECY', 'IHS']
for col in categorical_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)

# 2. Numeric columns
numeric_cols = ['AGE_DX', 'YR_BRTH', 'AGE_REC', 'ADJTM_6VALUE', 'ADJNM_6VALUE', 'ADJM_6VALUE', 'ADJAJCCSTG']
for col in numeric_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

# 3. Binary flags
binary_cols = ['INTPRIM', 'ERSTATUS', 'PRSTATUS']
for col in binary_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [26]:
df.drop(columns=['EOD10_PN', 'EOD10_NE', 'EODCODE', 'SS_SURG', 'ICCC3WHO', 'ICCC3XWHO'], inplace=True)

In [27]:
df = df.dropna()

In [28]:
df['is_malignant'] = (df['BEHO3V'] == 3).astype(int)

In [29]:
df.drop(columns=['BEHO3V'], inplace=True)

In [30]:
import numpy as np

In [31]:
X = df.drop(columns=['is_malignant']).values.astype(np.float32)
y = df['is_malignant'].values.astype(np.float32)

In [32]:
from sklearn.preprocessing import StandardScaler
# Normalize features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [33]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split

In [34]:
X = np.expand_dims(X, axis=1)  # (711653, 1, 326)

In [35]:
X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y)

In [36]:
dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [37]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return self.sigmoid(out)

In [38]:
X.shape

(711653, 1, 326)

In [39]:
input_dim = X.shape[2]
model = LSTMClassifier(input_dim=input_dim, hidden_dim=64)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [40]:
# 6. Training Loop
for epoch in range(10):  # you can increase epochs
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        output = model(xb)
        loss = criterion(output, yb.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 109.8362
Epoch 2, Loss: 32.5974
Epoch 3, Loss: 28.8876
Epoch 4, Loss: 27.5986
Epoch 5, Loss: 26.1458
Epoch 6, Loss: 25.1880
Epoch 7, Loss: 24.9745
Epoch 8, Loss: 24.8540
Epoch 9, Loss: 24.3200
Epoch 10, Loss: 23.4456


In [41]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for xb, yb in val_loader:
        preds = model(xb)
        predicted = (preds > 0.5).float()
        correct += (predicted == yb).sum().item()
        total += yb.size(0)
        accuracy = correct / total
        print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 26.56
Validation Accuracy: 24.41
Validation Accuracy: 25.69
Validation Accuracy: 26.33
Validation Accuracy: 25.77
Validation Accuracy: 24.99
Validation Accuracy: 25.21
Validation Accuracy: 24.56
Validation Accuracy: 24.78
Validation Accuracy: 24.81
Validation Accuracy: 24.28
Validation Accuracy: 24.17
Validation Accuracy: 24.23
Validation Accuracy: 23.80
Validation Accuracy: 23.79
Validation Accuracy: 23.69
Validation Accuracy: 23.68
Validation Accuracy: 23.48
Validation Accuracy: 23.56
Validation Accuracy: 23.71
Validation Accuracy: 23.53
Validation Accuracy: 23.60
Validation Accuracy: 23.66
Validation Accuracy: 23.60
Validation Accuracy: 23.42
Validation Accuracy: 23.38
Validation Accuracy: 23.56
Validation Accuracy: 23.56
Validation Accuracy: 23.61
Validation Accuracy: 23.65
Validation Accuracy: 23.75
Validation Accuracy: 23.94
Validation Accuracy: 23.93
Validation Accuracy: 23.96
Validation Accuracy: 23.95
Validation Accuracy: 23.90
Validation Accuracy: 23.86
V

# GRU

In [42]:
class GRUClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(GRUClassifier, self).__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        _, hn = self.gru(x)
        out = self.fc(hn[-1])
        return self.sigmoid(out)


In [43]:
input_dim = X.shape[2]
model = GRUClassifier(input_dim=input_dim, hidden_dim=64)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [44]:
# 6. Training Loop
for epoch in range(10):  # you can increase epochs
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        output = model(xb)
        loss = criterion(output, yb.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 69.5563
Epoch 2, Loss: 2.6504
Epoch 3, Loss: 1.5572
Epoch 4, Loss: 1.2549
Epoch 5, Loss: 0.7700
Epoch 6, Loss: 0.7454
Epoch 7, Loss: 0.4044
Epoch 8, Loss: 0.3192
Epoch 9, Loss: 0.2211


KeyboardInterrupt: 

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for xb, yb in val_loader:
        preds = model(xb)
        predicted = (preds > 0.5).float()
        correct += (predicted == yb).sum().item()
        total += yb.size(0)
        accuracy = correct / total
        print(f"Validation Accuracy: {accuracy:.2f}")

In [None]:
accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")

Validation Accuracy: 24.10


# Transformers

In [None]:
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, nhead, hidden_dim, num_layers):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x shape: (batch, seq_len, input_dim)
        x = self.embedding(x)  # shape: (batch, seq_len, hidden_dim)
        x = x.permute(1, 0, 2)  # shape: (seq_len, batch, hidden_dim)
        x = self.transformer_encoder(x)
        out = x.mean(dim=0)  # Global average pooling
        out = self.fc(out)
        return self.sigmoid(out)


In [None]:
input_dim = X.shape[2]
model = TransformerClassifier(input_dim=input_dim, nhead=8,hidden_dim=64, num_layers=3)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [None]:
# 6. Training Loop
for epoch in range(10):  # you can increase epochs
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        output = model(xb)
        loss = criterion(output, yb.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 113.2884
Epoch 2, Loss: 45.6670
Epoch 3, Loss: 30.8676
Epoch 4, Loss: 16.3740
Epoch 5, Loss: 22.0249
Epoch 6, Loss: 16.3265
Epoch 7, Loss: 13.1858
Epoch 8, Loss: 23.8719
Epoch 9, Loss: 13.1485
Epoch 10, Loss: 9.6496


In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for xb, yb in val_loader:
        preds = model(xb)
        predicted = (preds > 0.5).float()
        correct += (predicted == yb).sum().item()
        total += yb.size(0)
        accuracy = correct / total
        print(f"Validation Accuracy: {accuracy:.2f}")

In [None]:
accuracy = correct / total
print(f"Validation Accuracy: {accuracy:.2f}")