In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the heart-disease training and test tables
data_train = pd.read_csv('../data/heart_disease/data_train.csv')
data_test = pd.read_csv('../data/heart_disease/data_test.csv')

# Simulation parameters
sample_sizes = np.linspace(50, 200, 4, dtype=int)  # Training sizes 50, 100, 150, 200
num_tests = 180  # Number of test instances
num_simulations = 1  # Perform 1 simulation for each size

# Initialize Random Forest model.
# random_state fixes the forest's internal randomness so that repeated runs of
# this notebook report the same accuracies for the same sampled data.
random_forest = RandomForestClassifier(random_state=0)

# Function to train and predict
def train_and_predict(model, X_train, y_train, X_test):
    """Fit ``model`` on the training data and return its predictions for ``X_test``.

    The model is fitted in place through ``model.fit(X_train, y_train)``; the
    returned value is whatever ``model.predict(X_test)`` produces.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

# Main simulation loop: one Random Forest fit per (training size, seed) pair
results = []

# These slices do not depend on the loop variables, so build them once
train_features = data_train.drop('num', axis=1)
train_target = data_train['num']
test_features = data_test.drop('num', axis=1)

for size in sample_sizes:
    for sim in range(num_simulations):
        # Draw `size` training rows; the simulation index doubles as the seed
        X_train_sample, _, y_train_sample, _ = train_test_split(
            train_features,
            train_target,
            train_size=size,
            random_state=sim,
        )

        # Draw a fixed-size evaluation subset, then look up its targets by index
        X_test_sample = test_features.sample(n=num_tests, random_state=sim)
        y_test_sample = data_test.loc[X_test_sample.index, 'num']

        # Fit on the sampled rows and score the predictions with accuracy
        rf_predictions = train_and_predict(
            random_forest, X_train_sample, y_train_sample, X_test_sample
        )
        rf_accuracy = accuracy_score(y_test_sample, rf_predictions)

        # One record per run
        results.append(
            dict(train_size=size, simulation=sim, rf_accuracy=rf_accuracy)
        )

# Collect the per-run records into a table and show it
results_df = pd.DataFrame(results)
print(results_df)


   train_size  simulation  rf_accuracy
0          50           0     0.877778
1         100           0     0.883333
2         150           0     0.872222
3         200           0     0.905556


In [None]:
from transformers import BertTokenizer

# Initialize the tokenizer from the pretrained 'bert-base-uncased' checkpoint name.
# It is the `tokenizer` argument handed to tokenize_row for every data row below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize each row individually
def tokenize_row(row, tokenizer):
    """Serialize one table row to text and run it through ``tokenizer``.

    Every value in ``row.values`` is converted with ``str`` and the pieces are
    joined with single spaces. That string is passed to ``tokenizer`` with
    ``padding='max_length'``, ``truncation=True`` and ``return_tensors="pt"``,
    and the tokenizer's return value is handed back unchanged.
    """
    text = " ".join(str(value) for value in row.values)
    return tokenizer(text, padding='max_length', truncation=True, return_tensors="pt")

# Tokenize the training and testing data.
# The target column 'num' is dropped first: tokenize_row serializes every value
# in the row, so leaving it in would write the label into the model's input text.
train_inputs = [tokenize_row(row, tokenizer) for _, row in data_train.drop('num', axis=1).iterrows()]
test_inputs = [tokenize_row(row, tokenizer) for _, row in data_test.drop('num', axis=1).iterrows()]


import torch
from torch.utils.data import Dataset, DataLoader

class HeartDiseaseDataset(Dataset):
    """Dataset pairing per-row tokenizer outputs with their labels.

    ``inputs`` is indexed by position and each element is indexed with the keys
    ``'input_ids'`` and ``'attention_mask'``; ``labels`` is read with ``.iloc``.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The label container defines the dataset size
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for position ``idx``."""
        encoded = self.inputs[idx]
        # squeeze(0) drops the size-1 leading (batch) dimension of each tensor
        return (
            encoded['input_ids'].squeeze(0),
            encoded['attention_mask'].squeeze(0),
            torch.tensor(self.labels.iloc[idx]),
        )

# Create dataset and dataloader.
# Labels are taken from the 'num' column of the same frames whose rows were
# tokenized into train_inputs / test_inputs, so inputs and labels line up by
# position (HeartDiseaseDataset reads labels with .iloc).
train_dataset = HeartDiseaseDataset(train_inputs, data_train['num'])
test_dataset = HeartDiseaseDataset(test_inputs, data_test['num'])

# Training batches are reshuffled on every pass; test batches keep row order
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)




In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
from sklearn.model_selection import train_test_split

# Tokenize function
def tokenize_data(data, tokenizer):
    """Tokenize every row of ``data`` and return the results as a list.

    Rows are visited in ``data.iterrows()`` order and each one is passed to
    ``tokenize_row`` together with ``tokenizer``.
    """
    encoded_rows = []
    for _, row in data.iterrows():
        encoded_rows.append(tokenize_row(row, tokenizer))
    return encoded_rows

def tokenize_row(row, tokenizer):
    """Build a space-separated string from ``row`` and tokenize it.

    Each entry of ``row.values`` is stringified with ``str``; the joined text is
    given to ``tokenizer`` with ``padding='max_length'``, ``truncation=True`` and
    ``return_tensors="pt"``. Whatever the tokenizer returns is returned as-is.
    """
    pieces = [str(value) for value in row.values]
    return tokenizer(" ".join(pieces), padding='max_length', truncation=True, return_tensors="pt")

# Dataset class
class HeartDiseaseDataset(Dataset):
    """Positional dataset over tokenizer outputs and their labels.

    Element ``i`` of ``inputs`` must support lookup of ``'input_ids'`` and
    ``'attention_mask'``; ``labels`` must support ``len`` and ``.iloc``.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # Size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask, label)`` triple at ``idx``."""
        sample = self.inputs[idx]
        # squeeze(0) drops the size-1 leading (batch) dimension of each tensor
        token_ids = sample['input_ids'].squeeze(0)
        mask = sample['attention_mask'].squeeze(0)
        target = torch.tensor(self.labels.iloc[idx])
        return token_ids, mask, target

# Training function with gradient accumulation
def train(model, train_loader, optimizer, device, accumulation_steps=4):
    """Run one pass over ``train_loader`` with gradient accumulation.

    Gradients from ``accumulation_steps`` consecutive batches are accumulated
    before each ``optimizer.step()``. Each batch loss is divided by
    ``accumulation_steps`` before ``backward()`` so a full group contributes the
    average, not the sum, of its per-batch gradients. When the number of batches
    is not a multiple of ``accumulation_steps``, the gradients of the trailing
    batches are applied with one final step instead of being discarded.

    Args:
        model: Called as ``model(input_ids=..., attention_mask=..., labels=...)``;
            the result must expose a ``.loss`` tensor.
        train_loader: Sized iterable of ``(input_ids, attention_mask, labels)``
            batches whose elements support ``.to(device)``.
        optimizer: Object providing ``step()`` and ``zero_grad()``.
        device: Target passed to ``.to(...)`` for every batch element.
        accumulation_steps: Number of batches per optimizer step (>= 1).

    Returns:
        Mean of the unscaled per-batch losses.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.
        ZeroDivisionError: If ``train_loader`` has length 0.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    model.train()
    total_loss = 0
    pending = 0  # batches back-propagated since the last optimizer step
    optimizer.zero_grad()
    for batch in train_loader:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()  # report the unscaled loss
        (loss / accumulation_steps).backward()
        pending += 1

        if pending == accumulation_steps:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

    # Apply the gradients of a trailing, incomplete accumulation group
    if pending:
        optimizer.step()
        optimizer.zero_grad()
    return total_loss / len(train_loader)

# Compute device: CUDA when torch reports it as available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Simulation grid: four evenly spaced integer training sizes from 50 to 200,
# with four simulations configured per size
sample_sizes = np.linspace(50, 200, 4, dtype=int)
num_simulations = 4

# Tokenizer and two-label sequence classifier, both loaded from 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# The optimizer is built over the model's parameters, then the model is moved to the device
optimizer = AdamW(model.parameters(), lr=5e-5)
model.to(device)

# Target column. The Random Forest cell earlier in this notebook separates
# features and target with 'num', so the same column is used here.
label_column = 'num'


def _predict_labels(model, loader, device):
    """Return the arg-max class index for every example in ``loader`` as a list of ints."""
    model.eval()
    predicted = []
    with torch.no_grad():
        for input_ids, attention_mask, _ in loader:
            logits = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            ).logits
            predicted.extend(logits.argmax(dim=-1).tolist())
    return predicted


# Snapshot the weights as they are before any run so every run can start from them
initial_state = {name: tensor.clone() for name, tensor in model.state_dict().items()}

results = []
for size in sample_sizes:
    for sim in range(num_simulations):
        # Start each run from the same weights with a fresh optimizer; otherwise a
        # run would continue training the model left behind by the previous run
        model.load_state_dict(initial_state)
        optimizer = AdamW(model.parameters(), lr=5e-5)
        torch.manual_seed(sim)  # seeds batch shuffling and dropout for this run

        # Sample training rows; the label is split off before tokenizing so it
        # never appears in the input text
        train_data, _ = train_test_split(data_train, train_size=size, random_state=sim)
        train_labels = train_data[label_column]
        train_inputs = tokenize_data(train_data.drop(label_column, axis=1), tokenizer)

        # Create train loader
        train_dataset = HeartDiseaseDataset(train_inputs, train_labels)
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

        # Training BERT (a single pass over the sampled rows)
        avg_loss = train(model, train_loader, optimizer, device)

        # Evaluate on a fixed-size test sample; labels come from the sampled rows
        # themselves so predictions and labels refer to the same examples
        test_data = data_test.sample(30, random_state=sim)
        test_labels = test_data[label_column]
        test_inputs = tokenize_data(test_data.drop(label_column, axis=1), tokenizer)
        test_dataset = HeartDiseaseDataset(test_inputs, test_labels)
        test_loader = DataLoader(test_dataset, batch_size=16)  # unshuffled: keeps row order
        test_predictions = _predict_labels(model, test_loader, device)

        # Fraction of sampled test rows whose predicted class equals the label
        accuracy = float((np.asarray(test_predictions) == test_labels.to_numpy()).mean())

        # Store results
        results.append({
            'train_size': size,
            'simulation': sim,
            'loss': avg_loss,
            'accuracy': accuracy,
            'predictions': test_predictions
        })

# Convert results to DataFrame for analysis
results_df = pd.DataFrame(results)
print(results_df)
