In [1]:
# Load necessary libraries
import os
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the pre-split breast cancer train and test CSV files
# (the paths are relative to the notebook's working directory).
train_data = pd.read_csv('../../data/breast_cancer/breast_cancer_data_train.csv')
test_data = pd.read_csv('../../data/breast_cancer/breast_cancer_data_test.csv')
# Preview the first rows of the training frame as the cell output.
train_data.head()



In [None]:
# Separate predictors from the label: the last column of each frame is the
# target, every column before it is a feature.
X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

# Preview the training features as the cell output.
X_train.head()


In [4]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted scaler is then applied to the test split.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# The scaler returns plain arrays, so wrap them back into DataFrames that carry
# the original feature names (every column except the last one).
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=train_data.columns[:-1],
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=test_data.columns[:-1],
)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM

# Checkpoint to load from the Hugging Face Hub
model_name = "mistralai/Mistral-7B-v0.1"

# Read the Hub access token from the environment so that no credential is stored
# in the notebook. `token` used to be referenced without being defined anywhere,
# which raised NameError on a fresh kernel. When HF_TOKEN is not set this is None
# and the libraries fall back to their own default credential lookup.
token = os.environ.get("HF_TOKEN")

# The tokenizer holds no weights, so it takes no device placement argument.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# device_map="auto" delegates weight placement to the library.
# `num_labels` is not passed: the model is used for text generation here, and the
# label is parsed from the generated text rather than read from a classifier head.
model = AutoModelForCausalLM.from_pretrained(model_name, token=token, device_map="auto")

# generate() needs a padding id; reuse the end-of-sequence id when the tokenizer
# does not define one.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
# Function to create the prompt for in-context learning
# Create a new function for constructing prompts
def create_prompt(X_train, y_train, test_row, max_examples=None):
    """
    Build the in-context-learning prompt for a single test instance.

    The prompt starts with the test instance, rendered as
    "Input: <col>=<value>, ... -> Output: ? ", followed by one line per training
    example rendered as "Input: <col>=<value>, ... -> Output: <label>".

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Its columns decide which fields are rendered and in
        which order, for the training rows and for the test row alike.
    y_train : pandas.Series
        Training labels, matched to the rows of X_train by position.
    test_row : pandas.Series or mapping
        Feature values of the instance to classify, indexable by the column
        names of X_train.
    max_examples : int, optional
        Non-negative upper bound on the number of training rows included.
        None (the default) includes every row.

    Returns
    -------
    str
        The assembled prompt; every line ends with a newline character.
    """
    columns = list(X_train.columns)

    def render(row):
        # "col=value" pairs in column order, comma separated
        return ", ".join(f"{col}={row[col]}" for col in columns)

    # The test instance goes first, with a placeholder where its label would be.
    lines = [f"Input: {render(test_row)} -> Output: ? \n"]

    examples = X_train if max_examples is None else X_train.iloc[:max_examples]

    # iterrows() yields index *labels*, which are only valid positions when the
    # frame has a default 0..n-1 index. Count positions explicitly so the label
    # lookup stays correct for any index.
    for position, (_, train_row) in enumerate(examples.iterrows()):
        label = y_train.iloc[position]
        lines.append(f"Input: {render(train_row)} -> Output: {label}\n")

    return "".join(lines)

# Demo: build the prompt for the first row of the test set and display it
prompt = create_prompt(X_train, y_train, test_row=X_test.iloc[0])
print(prompt)



In [None]:
from transformers import set_seed
# Set the seed for reproducibility (generation below samples tokens)
set_seed(42)

# Tokenize the prompt, truncated to at most 300 tokens, and move the tensors to
# the device the model lives on. The model was loaded with device_map="auto",
# so it is not necessarily on the CPU where the tokenizer creates its tensors.
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=300).to(model.device)

# Number of prompt tokens actually fed to the model after truncation
input_length = inputs['input_ids'].shape[1]
print(input_length)

# Sample a continuation of up to 400 new tokens
output = model.generate(
    **inputs,
    min_length=input_length,
    max_new_tokens=400,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    pad_token_id=tokenizer.pad_token_id,
)

# Decode the full returned sequence and display it
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)




In [None]:

def extract_prediction(generated_text, prompt):
    """
    Extract the predicted label from text produced by the model.

    The first "Output: <number>" that appears *after* the echoed prompt is taken
    as the prediction. The prompt itself contains "Output: <label>" for every
    in-context example, so searching the whole text would return the label of
    the first training example instead of anything the model generated.

    Parameters
    ----------
    generated_text : str
        Decoded model output. It may start with a (possibly truncated) copy of
        the prompt, or contain only the newly generated text.
    prompt : str
        The prompt given to the model. The leading part of generated_text that
        is identical to the prompt is skipped. Pass "" or None to search the
        whole text.

    Returns
    -------
    int or None
        The matched number rounded to the nearest integer, or None when nothing
        matches, when the matched number carries a sign, or when generated_text
        is not a string.
    """
    if not isinstance(generated_text, str):
        return None

    # os.path.commonprefix compares character by character, so it works on
    # arbitrary strings and copes with an echo that is only a truncated prefix
    # of the full prompt.
    echoed = os.path.commonprefix([generated_text, prompt]) if isinstance(prompt, str) else ""
    completion = generated_text[len(echoed):]

    match = re.search(r'Output\s*:\s*([-+]?\d*\.\d+|\d+)', completion)
    if match is None:
        return None

    prediction = match.group(1)
    # Only unsigned numbers are accepted as labels; a signed match is rejected.
    if not prediction.replace('.', '', 1).isdigit():
        return None
    return int(round(float(prediction)))

In [None]:
from transformers import set_seed
# Function to generate predictions for a batch of test data
def in_context_learning(X_train, y_train, X_test, model, tokenizer,
                        max_prompt_tokens=300, max_new_tokens=200):
    """
    Generate one prediction per row of X_test using in-context learning.

    For every test row a prompt is built with create_prompt, a continuation is
    sampled from the model, and the label is parsed with extract_prediction.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used as in-context examples.
    X_test : pandas.DataFrame
        Rows to classify.
    model, tokenizer
        The generative model and its tokenizer.
    max_prompt_tokens : int, default 300
        The tokenized prompt is truncated to this many tokens.
    max_new_tokens : int, default 200
        Maximum number of tokens sampled per test row.

    Returns
    -------
    (list, list)
        The parsed predictions (int, or None when no label could be parsed) and
        the full decoded text of every generation (prompt tokens included), both
        in the row order of X_test.
    """
    predictions = []
    generated_texts = []

    for _, test_row in X_test.iterrows():
        prompt = create_prompt(X_train, y_train, test_row)

        # Tokenize and move the tensors to the model's device; the tokenizer
        # creates them on the CPU, which is not necessarily where the model is.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_prompt_tokens,
        ).to(model.device)
        input_length = inputs['input_ids'].shape[1]

        output = model.generate(
            **inputs,
            min_length=input_length,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            pad_token_id=tokenizer.pad_token_id,
        )

        # Keep the full decoded sequence for later inspection.
        generated_texts.append(tokenizer.decode(output[0], skip_special_tokens=True))

        # Parse the label from the newly generated tokens only. The returned
        # sequence begins with the prompt tokens, and the prompt already holds an
        # "Output: <label>" for each in-context example, which would otherwise be
        # picked up as the prediction.
        completion = tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
        predictions.append(extract_prediction(completion, prompt))

    return predictions, generated_texts




# Generate predictions for the test set, a chunk of rows at a time
batch_size = 8  # Number of test rows handled between two checkpoints
predictions = []
temp = []

# The result files live in results/; create the folder if it is missing so the
# first write cannot fail on a fresh checkout.
os.makedirs('results', exist_ok=True)

# Start from empty result files so that a re-run does not append to old output
pd.DataFrame(predictions).to_csv('results/mistralmain.csv', index=False)
pd.DataFrame(temp).to_csv('results/textmistral.csv', index=False)

for i in range(0, len(X_test), batch_size):
    # Explicit positional slice of the next chunk of test rows
    X_batch = X_test.iloc[i:i + batch_size]
    batch_predictions, textsp = in_context_learning(X_train, y_train, X_batch, model, tokenizer)
    predictions.extend(batch_predictions)
    temp.extend(textsp)

    # Append this chunk right away: generation is slow, and writing only after
    # the loop would lose everything if a later chunk fails.
    # dtype=object keeps integers written as integers even when a chunk contains
    # None for an unparsed prediction.
    pd.DataFrame(batch_predictions, dtype=object).to_csv('results/mistralmain.csv', mode='a', header=False, index=False)
    pd.DataFrame(textsp).to_csv('results/textmistral.csv', mode='a', header=False, index=False)






## The evaluation below may raise an error if some generations do not match the expected "Output: <label>" format. In that case the predictions have to be extracted manually from the generated CSV files.

In [None]:
# Evaluate the Mistral model with confusion matrix and classification report
from sklearn.metrics import confusion_matrix, classification_report

# Every test row must have exactly one prediction slot, parsed or not.
assert len(predictions) == len(y_test), "predictions and y_test differ in length"

# A prediction is None when no label could be parsed from the generation, and the
# sklearn metrics cannot handle None. Score only the parsed rows and state
# explicitly how many rows were left out, so the numbers are not misread.
scored_pairs = [(truth, pred) for truth, pred in zip(y_test, predictions) if pred is not None]
n_unparsed = len(predictions) - len(scored_pairs)
if n_unparsed:
    print(f"Warning: {n_unparsed} of {len(predictions)} predictions could not be parsed and are excluded from the metrics below.")
y_true_scored = [truth for truth, _ in scored_pairs]
y_pred_scored = [pred for _, pred in scored_pairs]

# Calculate confusion matrix
confusion = confusion_matrix(y_true_scored, y_pred_scored)
print("Confusion Matrix:")

# Display confusion matrix
print(confusion)

# Calculate classification report
report = classification_report(y_true_scored, y_pred_scored)
print("\nClassification Report:")
# Display classification report
print(report)

# Calculate accuracy, precision, recall, and F1 score
accuracy = accuracy_score(y_true_scored, y_pred_scored)
precision = precision_score(y_true_scored, y_pred_scored)
recall = recall_score(y_true_scored, y_pred_scored)
f1 = f1_score(y_true_scored, y_pred_scored)

# Display metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
