In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from torch.utils.data import TensorDataset, DataLoader

# Load the data
df1 = pd.read_csv('datasets/bcsc_risk_factors_expanded1.csv')
df2 = pd.read_csv('datasets/bcsc_risk_factors_expanded2.csv')
df3 = pd.read_csv('datasets/bcsc_risk_factors_expanded3.csv')
#slight cleaning
df = pd.concat([df1, df2, df3])
data = df[df.ne(9).all(1)]  #drop unknowns (9s)
data = data.drop('year', axis=1)

# Prepare the features and target
X = data.drop('breast_cancer_history', axis=1)
y = data['breast_cancer_history']

# Handle missing values (you may want to use more sophisticated imputation methods)
X = X.fillna(X.mean())

# Convert categorical variables to one-hot encoding
#X = pd.get_dummies(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the dataset using SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_balanced)
y_train_tensor = torch.FloatTensor(y_train_balanced.values).unsqueeze(1)
X_test_tensor = torch.FloatTensor(X_test_scaled)
y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Define the neural network
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        return x

# Initialize the model
model = Net(X_train_tensor.shape[1])

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    #if (epoch + 1) % 10 == 0:
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train_tensor)
        train_accuracy = ((train_pred.round() == y_train_tensor).float().mean())
        test_pred = model(X_test_tensor)
        test_accuracy = ((test_pred.round() == y_test_tensor).float().mean())
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Train Accuracy: {train_accuracy.item():.4f}, Test Accuracy: {test_accuracy.item():.4f}')

# Final evaluation
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    accuracy = ((y_pred.round() == y_test_tensor).float().mean())
    print(f'Final Test Accuracy: {accuracy.item():.4f}')

Epoch [1/10], Loss: 0.6704, Train Accuracy: 0.5000, Test Accuracy: 0.0578
Epoch [2/10], Loss: 0.6983, Train Accuracy: 0.5000, Test Accuracy: 0.0578
Epoch [3/10], Loss: 0.6892, Train Accuracy: 0.5000, Test Accuracy: 0.0578
Epoch [4/10], Loss: 0.6836, Train Accuracy: 0.5000, Test Accuracy: 0.0578
Epoch [5/10], Loss: 0.7113, Train Accuracy: 0.5000, Test Accuracy: 0.9422
Epoch [6/10], Loss: 0.6961, Train Accuracy: 0.5000, Test Accuracy: 0.0578
Epoch [7/10], Loss: 0.6920, Train Accuracy: 0.5000, Test Accuracy: 0.9422
Epoch [8/10], Loss: 0.6818, Train Accuracy: 0.5000, Test Accuracy: 0.0578
Epoch [9/10], Loss: 0.6944, Train Accuracy: 0.5000, Test Accuracy: 0.9422
Epoch [10/10], Loss: 0.6854, Train Accuracy: 0.5000, Test Accuracy: 0.9422
Final Test Accuracy: 0.9422
