In [1]:
import numpy as np
import pandas as pd
import torch as T
import torch.nn as nn
import copy
import torch.optim as optim
import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold, train_test_split
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=10_000)

In [2]:
# Read the three BCSC risk-factor extracts. Each part keeps its own name
# (df1, df2, df3) because the cleaning cell concatenates them.
df1, df2, df3 = (
    pd.read_csv('datasets/bcsc_risk_factors_expanded1.csv'),
    pd.read_csv('datasets/bcsc_risk_factors_expanded2.csv'),
    pd.read_csv('datasets/bcsc_risk_factors_expanded3.csv'),
)

In [3]:
# Light cleaning: stack the three extracts, drop rows with unknown values, drop `year`.
# ignore_index / reset_index give the result a unique 0..n-1 index, so index labels
# and row positions agree (later cells look rows up through both .index and .iloc).
combined = pd.concat([df1, df2, df3], ignore_index=True)

# Keep only the rows in which no column equals 9 (the "unknown" code).
# NOTE(review): every column is tested, so a row is also dropped when 9 is a
# legitimate value in some column -- confirm against the data dictionary.
known_mask = combined.ne(9).all(axis=1)

# Build a new frame instead of dropping in place on a filtered slice; this keeps
# the cell idempotent and never mutates the concatenated data.
df = combined.loc[known_mask].drop(columns=['year']).reset_index(drop=True)

In [4]:
# Fraction of rows whose target (breast_cancer_history) equals 1.
history_counts = df['breast_cancer_history'].value_counts()
history_counts.loc[1] / history_counts.sum()

## About 5.7% of the rows are positives (`breast_cancer_history == 1`), so this is a rare-event (class-imbalanced) problem

# PyTorch NN, Confusion Matrix, and Predicted Probabilities

In [5]:
# Features are every column except the last one; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [6]:
from imblearn.over_sampling import SMOTE

# Hold out 30% of the rows for evaluation before any resampling takes place.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale the features. The scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Balance the training split with SMOTE.
# The test split is deliberately NOT resampled: synthetic minority rows in the
# evaluation data would make the reported metrics describe a class mix that does
# not exist in the real data. Evaluation uses X_test_scaled / y_test as they are.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

In [7]:
# Class counts of the training labels after SMOTE resampling.
y_df = pd.DataFrame(y_train_balanced)
y_df['breast_cancer_history'].value_counts()

In [8]:
# Convert the prepared arrays to float32 PyTorch tensors.
# No device is requested here, so nothing is moved to a GPU by this cell.
# Training data comes from the SMOTE-balanced split; test data is the scaled,
# un-resampled split. Note that X_train / y_train / X_test / y_test are rebound
# from pandas objects to tensors by this cell.
X_train, y_train = (
    T.tensor(X_train_balanced, dtype=T.float32),
    T.tensor(y_train_balanced.values, dtype=T.float32),
)
X_test, y_test = (
    T.tensor(X_test_scaled, dtype=T.float32),
    T.tensor(y_test.values, dtype=T.float32),
)

In [9]:
# Inspect the test-label tensor created in the previous cell.
y_test

In [10]:
class NeuralNet(nn.Module):
  """MLP with two hidden layers and a sigmoid output.

  Layer widths: input_size -> hidden_size -> input_size -> 1. Both hidden
  layers use ReLU. The output passes through a sigmoid, so forward() returns
  values in (0, 1) with shape (batch, 1).
  """

  def __init__(self, input_size, hidden_size):
    super().__init__()
    # Layers are created in a fixed order so seeded initialisation is reproducible.
    self.hidden1 = nn.Linear(in_features=input_size, out_features=hidden_size)
    self.relu1 = nn.ReLU()
    self.hidden2 = nn.Linear(in_features=hidden_size, out_features=input_size)
    self.relu2 = nn.ReLU()
    self.output = nn.Linear(in_features=input_size, out_features=1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return the sigmoid-activated score for each row of x."""
    first = self.relu1(self.hidden1(x))
    second = self.relu2(self.hidden2(first))
    score = self.output(second)
    return self.sigmoid(score)

In [11]:
class Deep(nn.Module):
    """MLP with three hidden layers and a sigmoid output.

    Layer widths: input_size -> hidden_size -> input_size -> hidden_size -> 1.
    Every hidden layer uses ReLU. forward() returns sigmoid-activated values
    with shape (batch, 1).
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Creation order is kept fixed so seeded initialisation is reproducible.
        self.layer1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(in_features=hidden_size, out_features=input_size)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated score for each row of x."""
        hidden = x
        stages = (
            (self.layer1, self.act1),
            (self.layer2, self.act2),
            (self.layer3, self.act3),
        )
        for linear, activation in stages:
            hidden = activation(linear(hidden))
        return self.sigmoid(self.output(hidden))

In [12]:
class BinaryClassification(nn.Module):
    """Two-hidden-layer classifier with batch norm and dropout; returns raw scores.

    forward() ends at the final linear layer with no sigmoid, so the output is an
    unbounded score per row (shape (batch, 1)). Pair it with a loss that accepts
    raw scores (e.g. nn.BCEWithLogitsLoss) rather than nn.BCELoss.

    :param input_size: number of input features. Defaults to 10, the width that
        was previously hard-coded, so ``BinaryClassification()`` is unchanged.
    """

    def __init__(self, input_size=10):
        super(BinaryClassification, self).__init__()
        self.layer_1 = nn.Linear(input_size, 64)
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(64)
        self.batchnorm2 = nn.BatchNorm1d(64)

    def forward(self, inputs):
        """Return one raw (pre-sigmoid) score per input row."""
        # Batch norm is applied after each ReLU; dropout only before the output layer.
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [13]:
class Net(nn.Module):
  """Small tanh MLP: input_size-(8-8)-1 with a sigmoid output.

  Weights use Xavier-uniform initialisation and biases start at zero.
  forward() returns sigmoid-activated values with shape (batch, 1), which is
  the form nn.BCELoss expects.

  :param input_size: number of input features. Defaults to 10, the width that
      was previously hard-coded, so ``Net()`` is unchanged.
  """
  def __init__(self, input_size=10):
    super(Net, self).__init__()
    self.hid1 = nn.Linear(input_size, 8)  # input_size-(8-8)-1
    self.hid2 = nn.Linear(8, 8)
    self.oupt = nn.Linear(8, 1)
    nn.init.xavier_uniform_(self.hid1.weight)
    nn.init.zeros_(self.hid1.bias)
    nn.init.xavier_uniform_(self.hid2.weight)
    nn.init.zeros_(self.hid2.bias)
    nn.init.xavier_uniform_(self.oupt.weight)
    nn.init.zeros_(self.oupt.bias)
  def forward(self, x):
    """Return the sigmoid-activated score for each row of x."""
    z = T.tanh(self.hid1(x))
    z = T.tanh(self.hid2(z))
    z = T.sigmoid(self.oupt(z))  # output must be in (0, 1) for BCELoss
    return z

In [14]:
class Batcher:
  """Re-usable iterator over shuffled mini-batches of row indices.

  Each pass yields consecutive slices of length batch_size from a shuffled
  index array. A trailing partial batch is not yielded. When a pass ends, the
  indices are reshuffled and the cursor is rewound, so the same object can
  drive the next epoch's for-loop.
  """
  def __init__(self, num_items, batch_size, seed=0):
    self.indices = np.arange(num_items)
    self.num_items = num_items
    self.batch_size = batch_size
    self.rnd = np.random.RandomState(seed)
    self.rnd.shuffle(self.indices)  # initial order for the first pass
    self.ptr = 0

  def __iter__(self):
    return self

  def __next__(self):
    stop = self.ptr + self.batch_size
    if stop <= self.num_items:
      # A full batch is still available: hand out the slice and advance.
      batch = self.indices[self.ptr:stop]
      self.ptr = stop
      return batch
    # Not enough items left for a full batch: prepare the next pass, then
    # signal the end of this one to the calling for-loop.
    self.rnd.shuffle(self.indices)
    self.ptr = 0
    raise StopIteration

In [15]:
def akkuracy(model, data_x, data_y):
  """Return the percentage of rows that `model` classifies correctly.

  :param model: callable returning one score per row; a row is predicted
      positive when its score is >= 0.5
  :param data_x: tensor of input rows passed straight to `model`
  :param data_y: tensor of 0/1 labels, shaped (N,) or (N, 1)
  :return: accuracy as a float in [0, 100]

  Predictions and labels are both flattened before comparison. Comparing an
  (N,) label tensor with an (N, 1) prediction tensor would broadcast to an
  N x N matrix, which yields a wrong count and quadratic memory use.
  """
  with T.no_grad():  # evaluation only; no autograd graph is needed
    oupt = model(data_x)
  pred_y = (oupt >= 0.5).reshape(-1)
  true_y = data_y.reshape(-1)
  num_correct = T.sum(true_y == pred_y.to(true_y.dtype))
  acc = (num_correct.item() * 100.0 / len(data_y))  # scalar
  return acc

In [16]:
# Hyperparameters for model construction and training.
input_size = X_train.size(1)  # number of feature columns in the training tensor
hidden_size = 164             # only referenced by the commented-out NeuralNet / Deep constructors
learning_rate = 1e-3          # Adam step size
num_epochs = 100              # passes over the training data
bat_size = 16                 # rows per mini-batch

In [17]:
# Instantiate the network to train. No device transfer happens in this cell.
# The commented-out lines are the alternative architectures defined in earlier cells.
#model = NeuralNet(input_size, hidden_size)
#model = Deep(input_size, hidden_size)
#model = BinaryClassification()
model = Net()

In [18]:
# Binary cross-entropy on probabilities: the active model (Net) ends in a sigmoid,
# so its outputs are already in (0, 1).
criterion = nn.BCELoss()
# Alternatives kept for reference. BCEWithLogitsLoss is the matching choice for a
# model whose forward() returns raw scores, such as BinaryClassification.
#criterion = nn.CrossEntropyLoss()
#criterion = nn.BCEWithLogitsLoss()
# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
losses = []          # one entry per training batch; plotted in a later cell
running_loss = 0.0   # cumulative sum of every batch loss

model.train()
n_items = len(X_train)
batcher = Batcher(n_items, bat_size)  # reshuffles itself at the end of each pass

for epoch in range(num_epochs):
    # Every second epoch, report the last batch loss of the previous epoch and
    # the accuracy over the whole training set.
    if epoch > 0 and epoch % 2 == 0:
        print("epoch = %6d" % epoch, end="")
        print("  batch loss = %7.4f" % loss.item(), end="")
        # Score in eval mode so layers such as dropout / batch norm behave
        # deterministically, then switch back to training mode.
        model.eval()
        acc = akkuracy(model, X_train, y_train)
        model.train()
        print("  accuracy = %0.2f%%" % acc)
    for curr_batch in batcher:
        optimizer.zero_grad()
        outputs = model(X_train[curr_batch])
        # Targets are reshaped to (batch, 1) to match the model output.
        loss = criterion(outputs, y_train[curr_batch].unsqueeze(1))

        batch_loss = loss.item()
        losses.append(batch_loss)
        loss.backward()
        optimizer.step()
        running_loss += batch_loss

print("Training complete \n")
# 4. evaluate model
net = model.eval()  # set eval mode
# The message reports test accuracy, so score the held-out test tensors
# (this line previously scored the training tensors).
acc = akkuracy(net, X_test, y_test)
print("Accuracy on test data = %0.2f%%" % acc)

        # calculate accuracy
  #  with T.no_grad():
  #      predicted = outputs.round()
  #      T.set_printoptions(threshold=10_000)
  #          #print(predicted.abs().sum().item() == 0)
  #          #print(predicted)
  #      correct = (predicted == y_train[curr_batch].view(-1,1)).float().sum()
  #      accuracy = correct/y_train[curr_batch].size(0)
  #  net = net.eval()  # set eval mode
  #  acc = akkuracy(net, test_x, test_y)  
  #  #print(model.predict([1.0]))
  #  print(f'Epoch [{epoch+1}/{num_epochs}], Loss : {loss.item():.4f}, Accuracy: {accuracy.item() * 100:.2f}%')

In [1908]:
# `losses` receives one value per training batch (it is appended to inside the
# batch loop), so the x-axis counts batches, not epochs. The output file name is
# kept unchanged so anything that reads it keeps working.
plt.plot(losses)
plt.title('loss vs training batches')
plt.xlabel('training batch')
plt.ylabel('loss')
plt.savefig('loss_vs_epochs.png')


In [1910]:
# Accuracy on the training tensors; predictions are the rounded model outputs.
model.eval()
with T.no_grad():
    outputs = model(X_train)
# outputs carries no gradient history, so the remaining arithmetic can sit
# outside the no_grad block.
predicted = outputs.round()
correct = (predicted == y_train.reshape(-1, 1)).float().sum()
accuracy = correct / y_train.size(0)
print(f'Accuracy on training data: {accuracy.item() * 100:.2f}%')

In [1911]:

# Accuracy on the held-out test tensors; predictions are the rounded model outputs.
model.eval()
with T.no_grad():
    outputs = model(X_test)
# outputs carries no gradient history, so the remaining arithmetic can sit
# outside the no_grad block.
predicted = outputs.round()
correct = (predicted == y_test.reshape(-1, 1)).float().sum()
accuracy = correct / y_test.size(0)
print(f'Accuracy on test data: {accuracy.item() * 100:.2f}%')

In [1912]:
from sklearn.metrics import confusion_matrix
import seaborn as sns


def plot_confusion_matrix(model_f, X_test_tensor_f, y_test_tensor_f):
    """Save a confusion-matrix heatmap and print a classification report.

    :param model_f: trained model whose output is rounded to obtain the 0/1 class
    :param X_test_tensor_f: feature tensor passed to the model
    :param y_test_tensor_f: tensor of true 0/1 labels
    :return: None; writes 'Confusion_Matrix.png' and prints the report
    """
    from sklearn.metrics import classification_report

    # Set the model to evaluation mode
    model_f.eval()

    # Get predictions. PyTorch is imported as `T` at the top of the notebook;
    # the name `torch` is not bound until a much later cell, so using it here
    # fails on a fresh kernel.
    with T.no_grad():
        y_pred_class = model_f(X_test_tensor_f).round()

    # Convert to flat numpy arrays so labels and predictions have the same
    # 1-D shape regardless of whether the model returns (N, 1) or (N,).
    y_true = y_test_tensor_f.detach().cpu().numpy().ravel()
    y_pred = y_pred_class.detach().cpu().numpy().ravel()

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Plot the confusion matrix
    plt.figure(figsize=(10,7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    #plt.show()
    plt.savefig('Confusion_Matrix.png')

    # Print classification report
    print(classification_report(y_true, y_pred))

In [1913]:
# Confusion matrix and classification report for the trained model on the test tensors.
plot_confusion_matrix(model, X_test, y_test)


In [1686]:
def predict_proba(model, features, from_logits=False):
    """Return class probabilities [P(class 0), P(class 1)] for each input row.

    :param model: trained PyTorch model producing one value per row
    :param features: already-scaled features as a tensor, numpy array,
        pandas DataFrame or pandas Series; a 1-D input is treated as one row
    :param from_logits: set True when the model returns raw scores (no final
        sigmoid, e.g. BinaryClassification). The default False is for models
        whose forward() already ends in a sigmoid (Net, NeuralNet, Deep);
        applying a second sigmoid to those squeezes every probability into
        roughly [0.5, 0.73].
    :return: float tensor of shape (n_rows, 2)
    """
    model.eval()  # Set the model to evaluation mode
    # PyTorch is imported as `T` at the top of the notebook; `torch` is not
    # bound until a later cell.
    with T.no_grad():
        # Convert features to a float32 PyTorch tensor
        if isinstance(features, (pd.DataFrame, pd.Series)):
            features = features.to_numpy()
        if isinstance(features, np.ndarray):
            features = T.as_tensor(features, dtype=T.float32)

        # Ensure features is 2D
        if features.dim() == 1:
            features = features.unsqueeze(0)

        output = model(features)

        # Only raw scores need the sigmoid; see `from_logits` above.
        probs = T.sigmoid(output) if from_logits else output
        probs = probs.reshape(-1, 1)

        # Return probabilities for both classes
        return T.cat([1 - probs, probs], dim=1)

In [1693]:
# For a single row
single_row = X.iloc[0]  # Get the first row of your features
scaled_row = scaler.transform(single_row.values.reshape(1, -1))
probabilities = predict_proba(model, scaled_row)
print(f"Probabilities for single row: {probabilities.numpy()}")


In [1702]:
# Wrap the full target Series in a one-column frame. This rebinds y_df, which an
# earlier cell used for the SMOTE-balanced training labels.
y_df = pd.DataFrame(y, columns=['breast_cancer_history']) 

In [1703]:
# List the index labels of every row whose target is 1.
# Note: these are index *labels*; they match .iloc positions only when the
# frame's index is a clean 0..n-1 range.
y_df.index[y_df['breast_cancer_history'] == 1].tolist()

In [1706]:
# Score one specific row (by position) and compare the prediction with its label.
row_position = 71960
single_row = X.iloc[row_position]
print(single_row)
scaled_row = scaler.transform(single_row.values.reshape(1, -1))
probabilities = predict_proba(model, scaled_row)

# Column 0 is P(no history), column 1 is P(history).
prob_no_history = probabilities[0, 0].item()
prob_history = probabilities[0, 1].item()

print("Probabilities for single row:")
print(f"Class 0 (No breast cancer history): {prob_no_history:.4f}")
print(f"Class 1 (Breast cancer history): {prob_history:.4f}")

# Determine the predicted class
predicted_class = probabilities.argmax(dim=1).item()
print(f"\nPredicted class: {predicted_class}")

# Compare to actual target
actual_class = y.iloc[row_position]
print(f"Actual class: {actual_class}")

# Print interpretation
print("\nInterpretation:")
if predicted_class != 1:
    print(f"The model predicts a {prob_no_history*100:.2f}% chance of no breast cancer history.")
else:
    print(f"The model predicts a {prob_history*100:.2f}% chance of breast cancer history.")

In [1710]:
import torch
import pandas as pd
import numpy as np

def create_feature_prediction_df(model, X_row, y_actual, scaler, from_logits=False):
    """
    Create a one-row DataFrame containing the features, predicted probabilities, and actual value.

    :param model: Trained PyTorch model producing a single value for the row
    :param X_row: Single row of unscaled features (pandas Series or one-row DataFrame)
    :param y_actual: Actual target value
    :param scaler: Fitted scaler (object with a ``transform`` method) used to preprocess the data
    :param from_logits: set True when the model returns raw scores (no final
        sigmoid). The default False is for models whose forward() already ends
        in a sigmoid; applying a second sigmoid to those compresses every
        probability into roughly [0.5, 0.73].
    :return: pandas DataFrame with the feature columns followed by
        'Prob_No_History', 'Prob_History' and 'Actual_Value'
    """
    # Ensure X_row is a DataFrame
    if isinstance(X_row, pd.Series):
        X_row = X_row.to_frame().T

    # Scale the features
    X_scaled = scaler.transform(X_row)

    # Get predictions
    model.eval()
    with torch.no_grad():
        X_tensor = torch.as_tensor(np.asarray(X_scaled), dtype=torch.float32)
        output = model(X_tensor)
        # Only raw scores need the sigmoid; see `from_logits` above.
        probs = torch.sigmoid(output) if from_logits else output

    # Features first (original, unscaled values), in column order
    data = {col: X_row[col].values[0] for col in X_row.columns}

    # Add predicted probabilities
    data['Prob_No_History'] = (1 - probs).item()
    data['Prob_History'] = probs.item()

    # Add actual value
    data['Actual_Value'] = y_actual

    # Named `result` rather than `df` so the notebook-level dataset name is never shadowed
    result = pd.DataFrame([data])

    return result


In [1712]:
# Score the first rows of the dataset one at a time and collect the results.
# The per-row frame is never bound to `df`, so the cleaned dataset that earlier
# cells rely on is no longer overwritten when this cell runs.
n_preview_rows = min(500, len(X))  # do not index past the end of a short frame
result_dfs = [
    create_feature_prediction_df(model, X.iloc[i], y.iloc[i], scaler)
    for i in range(n_preview_rows)
]

# Combine all results into a single DataFrame
all_results = pd.concat(result_dfs, ignore_index=True)


In [1713]:
# Display the combined per-row prediction table built in the previous cell.
all_results