In [25]:
# Load necessary libraries
import os
from collections import namedtuple

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the heart_disease_uci train/test splits.
# Forward slashes are used because they resolve on Windows, Linux and macOS,
# whereas '..\data\...' only works on Windows and relies on invalid escape
# sequences ('\d', '\h') that newer Python versions warn about.
train_data = pd.read_csv('../data/heart_disease/data_train.csv')
test_data = pd.read_csv('../data/heart_disease/data_test.csv')
train_data.head()


In [None]:
# Separate predictors from the label: every column but the last is a feature,
# and the last column is the target.
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

X_train.head()

In [28]:
# Standardize the features: the scaler statistics are learned from the training
# split only and then applied unchanged to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [29]:
# Wrap the scaled arrays back into DataFrames and re-attach the label column
# under the name 'target'.
train_data = pd.DataFrame(X_train).assign(target=y_train.values)
test_data = pd.DataFrame(X_test).assign(target=y_test.values)



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM

# Load the Mistral model and tokenizer from Hugging Face
model_name = "mistralai/Mistral-7B-v0.1"

# SECURITY: never hard-code access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset, `None` is passed to
# `from_pretrained` instead.
# NOTE(review): any token that was previously committed in this cell must be
# revoked in the Hugging Face account settings.
token = os.environ.get("HF_TOKEN")

# The tokenizer has no weights to place, so `device_map` is only given to the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
# The causal-LM backbone has no classification head, so `num_labels` is not
# passed here; the 2-class head is added separately on top of the backbone.
model = AutoModelForCausalLM.from_pretrained(model_name, token=token, device_map="auto")

# Ensure pad_token_id is set (fall back to the EOS token when none is defined)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [11]:
# Modify the model for sequence classification

# Return type of MistralForClassification.forward. It is a tuple subclass, so it
# supports both `loss, logits = out` and `out.loss` / `out.logits`.
ClassifierOutput = namedtuple("ClassifierOutput", ["loss", "logits"])


class MistralForClassification(torch.nn.Module):
    """Two-class classifier on top of a causal language model backbone.

    The backbone's final-layer hidden state of the last non-padding token of
    each sequence is fed to a linear layer that produces two class logits.

    Args:
        model: backbone module. It must expose ``config.hidden_size`` and, when
            called with ``output_hidden_states=True``, return an object with a
            ``hidden_states`` sequence whose last entry is indexed as
            ``(batch, seq_len, hidden_size)``.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        # Linear head for classification (2 classes for heart disease)
        self.classifier = torch.nn.Linear(model.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute class logits and, when labels are given, the cross-entropy loss.

        Args:
            input_ids: token ids, indexed as ``(batch, seq_len)``.
            attention_mask: optional mask with 1 for real tokens and 0 for
                padding, same layout as ``input_ids``.
            labels: optional class indices, one per sequence.

        Returns:
            ClassifierOutput: ``(loss, logits)`` where ``loss`` is ``None`` when
            ``labels`` is ``None`` and ``logits`` has two columns.
        """
        # The head expects hidden_size features, so the hidden states are needed;
        # the backbone's `logits` are vocabulary-sized and do not fit the head.
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        hidden = outputs.hidden_states[-1]

        if attention_mask is None:
            pooled = hidden[:, -1, :]
        else:
            # Index of the last position whose mask is 1. This works for both
            # left- and right-padded batches and avoids pooling a pad token.
            positions = torch.arange(hidden.size(1), device=attention_mask.device)
            last_idx = (attention_mask.long() * positions).argmax(dim=1)
            batch_idx = torch.arange(hidden.size(0), device=hidden.device)
            pooled = hidden[batch_idx, last_idx.to(hidden.device)]

        # The backbone may live on another device or use another dtype than the head.
        head_weight = self.classifier.weight
        logits = self.classifier(pooled.to(device=head_weight.device, dtype=head_weight.dtype))

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1).to(logits.device))

        return ClassifierOutput(loss=loss, logits=logits)

# Wrap the loaded backbone with the classification head
classification_model = MistralForClassification(model=model)

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom dataset class for PyTorch
class HeartDiseaseDataset(Dataset):
    """Dataset that renders each feature row as text and tokenizes it.

    Args:
        features: row-indexable collection of feature rows. Objects exposing
            ``to_numpy()`` (e.g. a pandas DataFrame) are converted to an array
            so that ``features[idx]`` is always a positional row lookup.
        targets: collection of integer class labels, one per row. Objects
            exposing ``to_numpy()`` (e.g. a pandas Series) are converted so
            that lookup is positional rather than by index label.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            and returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        max_length: optional value forwarded to the tokenizer as ``max_length``;
            ``None`` (the default) is forwarded as-is.
    """

    def __init__(self, features, targets, tokenizer, max_length=None):
        # Positional access only: `series[idx]` on a pandas Series is label-based
        # and raises KeyError when the index is not 0..n-1.
        self.features = features.to_numpy() if hasattr(features, "to_numpy") else features
        self.targets = targets.to_numpy() if hasattr(targets, "to_numpy") else targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (one per target)."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the row at position ``idx``."""
        # Convert row of features to a space-separated string and tokenize it
        row_str = " ".join(map(str, self.features[idx]))
        inputs = self.tokenizer(
            row_str,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        label = torch.tensor(self.targets[idx], dtype=torch.long)
        # Drop the leading dimension that return_tensors="pt" adds
        return inputs["input_ids"].squeeze(0), inputs["attention_mask"].squeeze(0), label


# Build one tokenizing dataset per split
train_dataset = HeartDiseaseDataset(features=X_train, targets=y_train, tokenizer=tokenizer)
test_dataset = HeartDiseaseDataset(features=X_test, targets=y_test, tokenizer=tokenizer)



In [13]:
# defining a custom collate function to pad the sequences
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge ``(input_ids, attention_mask, label)`` samples into batch tensors.

    Sequences are right-padded to the longest one in the batch: token ids with
    the module-level ``tokenizer.pad_token_id`` and attention masks with 0.

    Returns:
        tuple: ``(padded_input_ids, padded_attention_masks, labels)``.
    """
    ids_list, mask_list, label_list = [], [], []
    for sample in batch:
        ids_list.append(sample[0])
        mask_list.append(sample[1])
        label_list.append(sample[2])

    padded_ids = pad_sequence(ids_list, batch_first=True, padding_value=tokenizer.pad_token_id)
    padded_masks = pad_sequence(mask_list, batch_first=True, padding_value=0)

    return padded_ids, padded_masks, torch.tensor(label_list)

In [14]:
from torch.utils.data import DataLoader

# Batches of two samples; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, collate_fn=collate_fn, shuffle=True, batch_size=2)
test_loader = DataLoader(dataset=test_dataset, collate_fn=collate_fn, batch_size=2)


In [None]:
# `transformers.AdamW` is deprecated in favour of the PyTorch implementation,
# so the same name is imported from torch.optim instead.
from torch.optim import AdamW
import torch.nn.functional as F

# Set up the optimizer. betas/eps/weight_decay are pinned to the defaults of the
# former transformers implementation (torch's own defaults differ: eps=1e-8,
# weight_decay=0.01), so the optimizer settings stay the same.
optimizer = AdamW(
    classification_model.parameters(),
    lr=5e-5,
    betas=(0.9, 0.999),
    eps=1e-6,
    weight_decay=0.0,
)

# Training function with gradient accumulation
def train(model, train_loader, optimizer, epochs=3, accumulation_steps=4):
    """Train ``model`` with gradient accumulation and print the mean loss per epoch.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=..., labels=...)``.
            Its output must expose the loss either as a ``.loss`` attribute or as
            the first element of a tuple.
        train_loader: sized iterable of ``(input_ids, attention_mask, labels)`` batches.
        optimizer: optimizer over the model's parameters.
        epochs: number of passes over ``train_loader``.
        accumulation_steps: number of batches whose gradients are accumulated
            before each optimizer step.

    Raises:
        ValueError: if ``accumulation_steps`` is smaller than 1.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be >= 1")

    num_batches = len(train_loader)
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        optimizer.zero_grad()
        for i, (input_ids, attention_mask, labels) in enumerate(train_loader):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Accept both attribute-style outputs and plain (loss, logits) tuples
            loss = outputs.loss if hasattr(outputs, "loss") else outputs[0]
            total_loss += loss.item()
            # Scale so the accumulated gradient is an average over the window
            # rather than a sum that grows with accumulation_steps.
            (loss / accumulation_steps).backward()

            # Also step on the final batch so gradients of a trailing partial
            # window are applied instead of being discarded by the next zero_grad().
            window_full = (i + 1) % accumulation_steps == 0
            last_batch = (i + 1) == num_batches
            if window_full or last_batch:
                optimizer.step()
                optimizer.zero_grad()

        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {avg_loss:.4f}")

# Run training with the default number of epochs and accumulation steps
train(model=classification_model, train_loader=train_loader, optimizer=optimizer)

In [None]:
# Function to get predictions
def get_predictions(model, data_loader):
    """Return the predicted class index for every sample in ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
            Its output must expose the logits either as a ``.logits`` attribute
            or as the second element of a tuple.
        data_loader: iterable of ``(input_ids, attention_mask, labels)`` batches;
            the labels are ignored.

    Returns:
        list[int]: argmax over dimension 1 of the logits, in loader order.
    """
    model.eval()
    predictions = []
    # Inference only: skip autograd bookkeeping to save memory
    with torch.no_grad():
        for input_ids, attention_mask, _labels in data_loader:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Accept both attribute-style outputs and plain (loss, logits) tuples
            logits = outputs.logits if hasattr(outputs, "logits") else outputs[1]
            predictions.extend(logits.argmax(dim=1).tolist())
    return predictions

# Get predictions on test set
test_predictions = get_predictions(classification_model, test_loader)

# Save predictions to results/mistral.csv; the folder is created first because
# to_csv does not create missing parent directories.
os.makedirs('results', exist_ok=True)
pd.DataFrame(test_predictions).to_csv('results/mistral.csv', index=False)



In [None]:
def evaluate(model, test_loader):
    """Compute accuracy, precision, recall and F1 of ``model`` on ``test_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
            Its output must expose the logits either as a ``.logits`` attribute
            or as the second element of a tuple.
        test_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``; the last three come from
        the scikit-learn metric functions called with their default settings.
    """
    model.eval()
    total_correct = 0
    total_examples = 0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Accept both attribute-style outputs and plain (loss, logits) tuples
            logits = outputs.logits if hasattr(outputs, "logits") else outputs[1]
            # Compare on the CPU so predictions and labels are on the same device
            predictions = torch.argmax(logits, dim=-1).cpu()
            labels = labels.cpu()
            all_labels.extend(labels.tolist())
            all_preds.extend(predictions.tolist())
            total_correct += (predictions == labels).sum().item()
            total_examples += labels.size(0)
    accuracy = total_correct / total_examples
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return accuracy, precision, recall, f1

# Evaluate the Mistral model (the wrapped classifier defined earlier in the notebook)
mistral_accuracy, mistral_precision, mistral_recall, mistral_f1 = evaluate(classification_model, test_loader)
print(f'Mistral Model - Accuracy: {mistral_accuracy:.4f}, Precision: {mistral_precision:.4f}, Recall: {mistral_recall:.4f}, F1-Score: {mistral_f1:.4f}')
