In [2]:
import pandas as pd

df = pd.read_csv("breast.csv", low_memory=False)

In [3]:
df = df.dropna(thresh=len(df) * 0.5, axis=1)

In [4]:
object_cols = df.select_dtypes(include=['object']).columns.tolist()
object_cols

['SITEO2V', 'ICDOT10V', 'PLC_BRTH_CNTRY', 'PLC_BRTH_STATE']

In [5]:
# One-hot encode
df = pd.get_dummies(df, columns=object_cols, drop_first=True)

In [6]:
# 1. Categorical columns
categorical_cols = ['MAR_STAT', 'RACE', 'ORIGIN', 'DX_CONF', 'RAC_RECA', 'RAC_RECY', 'IHS']
for col in categorical_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)

# 2. Numeric columns
numeric_cols = ['AGE_DX', 'YR_BRTH', 'AGE_REC', 'ADJTM_6VALUE', 'ADJNM_6VALUE', 'ADJM_6VALUE', 'ADJAJCCSTG']
for col in numeric_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].median(), inplace=True)

# 3. Binary flags
binary_cols = ['INTPRIM', 'ERSTATUS', 'PRSTATUS']
for col in binary_cols:
    if col in df.columns and df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

In [7]:
df.drop(columns=['EOD10_PN', 'EOD10_NE', 'EODCODE', 'SS_SURG', 'ICCC3WHO', 'ICCC3XWHO'], inplace=True)

In [8]:
df = df.dropna()

In [9]:
df['is_malignant'] = (df['BEHO3V'] == 3).astype(int)

In [10]:
df.drop(columns=['BEHO3V'], inplace=True)

In [11]:
X = df.drop(columns=['is_malignant'])
y = df['is_malignant']

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_train_preds = rf.predict_proba(X_train)
rf_test_preds = rf.predict_proba(X_test)

In [16]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(SimpleNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.model(x)

In [17]:
input_dim = X_train.shape[1]
output_dim = len(np.unique(y))
model = SimpleNN(input_dim, output_dim)

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [19]:
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.LongTensor(y_train)
X_test_tensor = torch.FloatTensor(X_test)

In [20]:
for epoch in range(10):
    model.train()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [21]:
model.eval()

SimpleNN(
  (model): Sequential(
    (0): Linear(in_features=326, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): ReLU()
    (4): Linear(in_features=32, out_features=2, bias=True)
    (5): Softmax(dim=1)
  )
)

In [22]:
with torch.no_grad():
    nn_train_preds = model(X_train_tensor).numpy()
    nn_test_preds = model(X_test_tensor).numpy()

In [23]:
meta_train = np.hstack([rf_train_preds, nn_train_preds])
meta_test = np.hstack([rf_test_preds, nn_test_preds])

In [24]:
meta_model = LogisticRegression()
meta_model.fit(meta_train, y_train)
final_preds = meta_model.predict(meta_test)

In [25]:
accuracy = accuracy_score(y_test, final_preds)
print("Stacked Ensemble Accuracy:", accuracy)

Stacked Ensemble Accuracy: 1.0


# LSTM

In [26]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        return self.sigmoid(out)

In [27]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_train_preds = rf.predict_proba(X_train)[:, 1].reshape(-1, 1)
rf_test_preds = rf.predict_proba(X_test)[:, 1].reshape(-1, 1)

In [28]:
input_dim = X.shape[1]
hidden_dim = 64

lstm_model = LSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)

In [None]:
X_train_seq = torch.FloatTensor(X_train).unsqueeze(1)  # shape: (batch, seq_len=1, features)
X_test_seq = torch.FloatTensor(X_test).unsqueeze(1)
y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1)  # shape: (batch, 1)


: 

In [None]:
for epoch in range(3):
    lstm_model.train()
    output = lstm_model(X_train_seq)
    loss = criterion(output, y_train_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

In [None]:
lstm_model.eval()


LSTMClassifier(
  (lstm): LSTM(326, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:

with torch.no_grad():
    lstm_train_preds = lstm_model(X_train_seq).numpy()
    lstm_test_preds = lstm_model(X_test_seq).numpy()

In [None]:
meta_train = np.hstack([rf_train_preds, lstm_train_preds])
meta_test = np.hstack([rf_test_preds, lstm_test_preds])


In [None]:
meta_model = LogisticRegression()
meta_model.fit(meta_train, y_train)
final_preds = meta_model.predict(meta_test)

In [None]:
print("Stacked Ensemble Accuracy:", accuracy_score(y_test, final_preds))


Stacked Ensemble Accuracy: 1.0


# Transformer

In [None]:
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, nhead, hidden_dim, num_layers):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x shape: (batch, seq_len, input_dim)
        x = self.embedding(x)  # shape: (batch, seq_len, hidden_dim)
        x = x.permute(1, 0, 2)  # shape: (seq_len, batch, hidden_dim)
        x = self.transformer_encoder(x)
        out = x.mean(dim=0)  # Global average pooling
        out = self.fc(out)
        return self.sigmoid(out)


In [None]:
input_dim = X.shape[1]
hidden_dim = 64
nhead = 4
num_layers = 2

transformer_model = TransformerClassifier(input_dim=input_dim, nhead=nhead, hidden_dim=hidden_dim, num_layers=num_layers)
criterion = nn.BCELoss()
optimizer = optim.Adam(transformer_model.parameters(), lr=0.001)




In [None]:
for epoch in range(3):
    transformer_model.train()
    output = transformer_model(X_train_seq)
    loss = criterion(output, y_train_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")


NameError: name 'transformer_model' is not defined

In [None]:
transformer_model.eval()

In [None]:
with torch.no_grad():
    transformer_train_preds = transformer_model(X_train_seq).numpy()
    transformer_test_preds = transformer_model(X_test_seq).numpy()


In [None]:
meta_train = np.hstack([rf_train_preds, transformer_train_preds])
meta_test = np.hstack([rf_test_preds, transformer_test_preds])

In [None]:
meta_model = LogisticRegression()
meta_model.fit(meta_train, y_train)
final_preds = meta_model.predict(meta_test)

In [None]:
print("Stacked Ensemble Accuracy:", accuracy_score(y_test, final_preds))
