In [None]:
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from transformers import  BertModel, BertForSequenceClassification, DistilBertModel, GPT2Model, GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification
from sklearn.model_selection import StratifiedKFold
from lime.lime_text import LimeTextExplainer
from sklearn.metrics import classification_report
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from sklearn.metrics import precision_recall_fscore_support, matthews_corrcoef, balanced_accuracy_score


from SupCL_Seq import SupCsTrainer


warnings.filterwarnings('ignore')

# Dataset selection: exactly one of the cleaned CSV files is active at a time.
# file_path = 'Hall_2012_cleaned.csv'
file_path = 'Jeyaraman_2020_cleaned.csv'
# file_path = 'Radjenovic_2013_cleaned.csv'
# file_path = 'Smid_2020_cleaned.csv'

# Load the dataset and drop every row that has a missing value in any column.
df = pd.read_csv(file_path, delimiter=',')
df = df.dropna(axis=0)

# Shuffle the rows with a fixed seed so the preview below is reproducible
# across kernel restarts (sample() already returns a new frame, so no copy is needed).
df_sample = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Class balance of the raw (non-augmented) data.
class_counts = df_sample['label_included'].value_counts()
print(class_counts)


In [None]:
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into a dictionary of evaluation scores.

    The predicted class of each row is the arg-max over the last axis of the
    logits. Precision, recall and F1 are computed with ``average='binary'``.

    Args:
        eval_pred: two-element iterable ``(logits, labels)``.

    Returns:
        dict with the keys ``'mcc'``, ``'balanced_accuracy'``, ``'f1'``,
        ``'precision'`` and ``'recall'``.
    """
    scores, y_true = eval_pred
    y_pred = np.argmax(scores, axis=-1)

    # Index 3 of the result (support) is not needed here.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return {
        'mcc': matthews_corrcoef(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# Data augmentation methods

# Synonym replacement

In [None]:
from nltk.corpus import wordnet, words
import random
import pandas as pd

def get_synonyms(word):
    """Collect the WordNet lemma names of ``word``, excluding ``word`` itself.

    Underscores in lemma names are replaced by spaces, so a returned synonym
    may consist of several words.

    Args:
        word: the term to look up in WordNet.

    Returns:
        list of distinct synonym strings (empty when WordNet yields no synsets).
    """
    candidates = {
        lemma.name().replace('_', ' ')
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    # The comparison is exact, so differently-cased variants of `word` are kept.
    candidates.discard(word)
    return list(candidates)

def synonym_replacement(sentence, n):
    """Replace up to ``n`` distinct alphabetic words of ``sentence`` with a synonym.

    Candidate words are visited in random order. A word counts towards ``n``
    only when ``get_synonyms`` returns at least one synonym for it; every
    occurrence of that word in the sentence is then replaced by the same
    randomly chosen synonym.

    Args:
        sentence: whitespace-separated text.
        n: maximum number of distinct words to replace. Values <= 0 leave the
            words untouched.

    Returns:
        The words re-joined with single spaces (whitespace is normalised even
        when nothing was replaced).
    """
    tokens = sentence.split()
    replaced = list(tokens)

    # Only purely alphabetic tokens are eligible; de-duplicate before shuffling.
    candidates = list({token for token in tokens if token.isalpha()})
    random.shuffle(candidates)

    swaps_done = 0
    for target in candidates:
        # Check the budget *before* replacing so that n <= 0 replaces nothing
        # (previously one word was replaced even for n == 0).
        if swaps_done >= n:
            break
        options = get_synonyms(target)
        if options:
            replacement = random.choice(options)
            replaced = [replacement if token == target else token for token in replaced]
            swaps_done += 1
    return ' '.join(replaced)

def random_insertion(sentence, n):
    """Insert synonyms of randomly chosen words at random positions.

    The following is attempted ``n`` times: pick a random word of the current
    word list, look up its synonyms with ``get_synonyms`` and, if there are
    any, insert one of them at a random position. An attempt whose word has no
    synonym inserts nothing, so fewer than ``n`` insertions may happen.

    Args:
        sentence: whitespace-separated text.
        n: number of insertion attempts.

    Returns:
        The resulting words joined with single spaces. A sentence without any
        words (empty or whitespace only) is returned unchanged.
    """
    words = sentence.split()
    if not words:
        # random.choice() raises IndexError on an empty sequence.
        return sentence

    for _ in range(n):
        random_word = random.choice(words)
        synonyms = get_synonyms(random_word)
        if synonyms:
            new_synonym = random.choice(synonyms)
            # randint is inclusive on both ends, so appending at the end is possible.
            insert_position = random.randint(0, len(words))
            words.insert(insert_position, new_synonym)
    return ' '.join(words)

def random_deletion(sentence, p):
    """Drop each word of ``sentence`` independently with probability ``p``.

    Args:
        sentence: whitespace-separated text.
        p: deletion probability per word; a word is kept when a uniform draw
            from [0, 1] is greater than ``p``.

    Returns:
        The kept words joined with single spaces. If every word was dropped,
        one randomly chosen original word is returned instead. A sentence with
        zero or one word is returned unchanged.
    """
    words = sentence.split()
    # Nothing sensible can be deleted from zero or one word. The empty case
    # previously fell through to random.choice([]) and raised IndexError.
    if len(words) <= 1:
        return sentence

    kept = [word for word in words if random.uniform(0, 1) > p]
    if not kept:
        return random.choice(words)
    return ' '.join(kept)

def augment_text(df, minority_class, augment_by):
    """Create augmented copies of the minority-class texts.

    ``int(n_minority * augment_by)`` new texts are generated. Each one starts
    from a randomly drawn minority-class text (with replacement) and is
    transformed by one randomly chosen technique: ``synonym_replacement``
    (n=1), ``random_insertion`` (n=1) or ``random_deletion`` (p=0.25).

    Args:
        df: DataFrame with a ``'Corpus'`` text column and a
            ``'label_included'`` label column.
        minority_class: label value whose rows are augmented.
        augment_by: number of augmented samples as a fraction of the
            minority-class size.

    Returns:
        DataFrame with the columns ``'Corpus'`` (augmented texts) and
        ``'label_included'`` (always ``minority_class``). The original rows are
        not included.
    """
    # Build the pool of source texts once instead of on every iteration.
    minority_texts = df.loc[df['label_included'] == minority_class, 'Corpus'].tolist()
    n_augmentations = int(len(minority_texts) * augment_by)

    # Dispatch tuple; the lambdas defer the lookup of the helper functions
    # until an augmentation is actually applied.
    augmenters = (
        lambda text: synonym_replacement(text, n=1),
        lambda text: random_insertion(text, n=1),
        lambda text: random_deletion(text, p=0.25),
    )

    augmented_texts = []
    for _ in range(n_augmentations):
        original_text = random.choice(minority_texts)
        augment = random.choice(augmenters)
        augmented_texts.append(augment(original_text))

    augmented_df = pd.DataFrame(augmented_texts, columns=['Corpus'])
    augmented_df['label_included'] = minority_class
    return augmented_df


# Sentence shuffling

In [None]:
# import random
# def shuffle_sentence(sentence):

    
#     """
#     Shuffles the words in a sentence, preserving the meaning but altering the structure.
#     """
#     words = sentence.split()
#     random.shuffle(words)
#     return ' '.join(words)

# def augment_text_with_shuffling(df, minority_class, augment_by=0.5):
#     """
#     Augments the given dataframe by shuffling sentences of the minority class.
#     """
#     minority_df = df[df['label_included'] == minority_class]
#     n_augmentations = int(len(minority_df) * augment_by)
    
#     augmented_texts = []
#     for _, row in minority_df.sample(n_augmentations, replace=True).iterrows():
#         original_text = row['Corpus']
#         augmented_text = shuffle_sentence(original_text)
#         augmented_texts.append(augmented_text)
    
#     augmented_df = pd.DataFrame(augmented_texts, columns=['Corpus'])
#     augmented_df['label_included'] = minority_class
#     return augmented_df


# EDA

In [None]:
# from nltk.corpus import wordnet
# import random

# def get_synonyms(word):
#     synonyms = set()
#     for syn in wordnet.synsets(word):
#         for lemma in syn.lemmas():
#             synonym = lemma.name().replace('_', ' ')
#             synonyms.add(synonym)
#     if word in synonyms:
#         synonyms.remove(word)
#     return list(synonyms)

# def synonym_replacement(sentence, n):
#     words = sentence.split()
#     new_words = words.copy()
#     random_word_list = list(set([word for word in words if word.isalpha()]))
#     random.shuffle(random_word_list)
#     num_replaced = 0
#     for random_word in random_word_list:
#         synonyms = get_synonyms(random_word)
#         if len(synonyms) >= 1:
#             synonym = random.choice(list(synonyms))
#             new_words = [synonym if word == random_word else word for word in new_words]
#             num_replaced += 1
#         if num_replaced >= n: # only replace up to n words
#             break

#     sentence = ' '.join(new_words)
#     return sentence

# def augment_text(df, minority_class, augment_by):
#     minority_df = df[df['label_included'] == minority_class]
    
#     n_minority = len(minority_df)
#     n_augmentations = int(n_minority * augment_by)
    
#     augmented_texts = []
#     for _ in range(n_augmentations):
#         original_text = random.choice(minority_df['Corpus'].tolist())
#         augmented_text = synonym_replacement(original_text, n=1) # You can adjust n for more replacements
#         augmented_texts.append(augmented_text)
    
#     # Create a DataFrame for augmented minority samples
#     augmented_df = pd.DataFrame(augmented_texts, columns=['Corpus'])
#     augmented_df['label_included'] = minority_class
    
#     return augmented_df


# Text summarization

In [None]:
# from transformers import pipeline
# import pandas as pd
# from sklearn.model_selection import train_test_split

# # Initialize the summarization pipeline with a pre-trained model
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# def summarize_text(text):
#     """
#     Summarizes the given text using a pre-trained summarization model.
#     """
#     summary_list = summarizer(text, max_length=50, min_length=25, do_sample=False)
#     summary_text = summary_list[0]['summary_text']
#     return summary_text

# def augment_text_with_summarization(df, minority_class, augment_by=0.5):
#     """
#     Augments the given dataframe by summarizing texts of the minority class.
#     """
#     minority_df = df[df['label_included'] == minority_class]
#     n_augmentations = int(len(minority_df) * augment_by)
    
#     augmented_texts = []
#     for _, row in minority_df.sample(n_augmentations, replace=True).iterrows():
#         original_text = row['Corpus']
#         try:
#             augmented_text = summarize_text(original_text)
#             augmented_texts.append(augmented_text)
#         except Exception as e:
#             print(f"An error occurred during summarization: {e}")
    
#     augmented_df = pd.DataFrame(augmented_texts, columns=['Corpus'])
#     augmented_df['label_included'] = minority_class
#     return augmented_df




In [None]:
# Seed Python's RNG so the random augmentation choices are reproducible.
random.seed(42)

# Oversample class 1 with augmented texts (50% of its original size) and append
# them to the original data.
# NOTE(review): augmentation happens before the cross-validation split, so an
# augmented variant of a text can land in a validation fold while its source is
# in the training fold (or vice versa). Consider augmenting only the training
# portion of each fold to avoid optimistic validation scores.
df_augmented = augment_text(df, minority_class=1, augment_by=0.5)
df_sample = pd.concat([df, df_augmented], ignore_index=True)

# Shuffle with a fixed seed so the row order is reproducible.
df_sample = df_sample.sample(frac=1, random_state=42).reset_index(drop=True)

# Class balance after augmentation.
class_counts = df_sample['label_included'].value_counts()
print(class_counts)

In [None]:

from collections import Counter

# Documents and their labels as plain Python lists, in df_sample row order.
texts = df_sample['Corpus'].tolist()
labels = df_sample['label_included'].tolist()

# Length (in whitespace-separated words) of the longest document.
max_sequence_length = max(map(len, map(str.split, texts)))
num_classes = 1
input_shape = (max_sequence_length,)

# Frequency of every whitespace-separated word; its size is the vocabulary size.
token_counts = Counter()
token_counts.update(word for document in texts for word in document.split())
vocab_size = len(token_counts)

input_shape


In [None]:

# Checkpoint name used for the tokenizer here and for the models in the
# cross-validation loop.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)


In [None]:
# model_name = "bert-base-uncased"  # Changed to use BERT base model
# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from field name (e.g. ``'input_ids'``) to a
            sequence indexable by sample position. The values may be Python
            lists or tensors.
        labels: one label per sample (list or tensor).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # as_tensor builds a tensor from a list and passes an existing tensor
        # through without the copy-construct warning of torch.tensor().
        self.labels = torch.as_tensor(labels)

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(tensor) emits a UserWarning; this is the recommended
            # equivalent and still yields a copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field of sample ``idx`` plus ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.labels)

In [None]:
from transformers import TrainerCallback



class MetricsLogger(TrainerCallback):
    """Trainer callback that collects logged losses and accuracy in lists.

    Attributes:
        training_loss: every logged ``'loss'`` value, in log order.
        validation_loss: every logged ``'eval_loss'`` value, in log order.
        accuracy: every logged ``'eval_accuracy'`` value, in log order.
    """

    # (key in the log dict, attribute that stores its history)
    _TRACKED = (
        ('loss', 'training_loss'),
        ('eval_loss', 'validation_loss'),
        ('eval_accuracy', 'accuracy'),
    )

    def __init__(self):
        super().__init__()
        self.training_loss = []
        self.validation_loss = []
        # NOTE(review): the compute_metrics function in this notebook does not
        # return an 'accuracy' entry, so this list presumably stays empty.
        self.accuracy = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the tracked values found in ``logs`` to their histories."""
        # `logs` defaults to None; membership tests on None raise TypeError.
        if not logs:
            return
        for key, attribute in self._TRACKED:
            if key in logs:
                getattr(self, attribute).append(logs[key])

In [None]:
def extract_embeddings(model, dataloader, device=None):
    """Compute the first-token embedding of every sample in ``dataloader``.

    The model is switched to eval mode and moved to ``device`` (this persists
    after the call). For every batch, all entries except ``'labels'`` are
    passed to the model as keyword arguments and position 0 of
    ``outputs.last_hidden_state`` is collected.

    Args:
        model: module whose forward output exposes ``last_hidden_state``.
        dataloader: iterable of dict batches mapping field names to tensors.
        device: target device. Defaults to ``'cuda'`` when CUDA is available,
            otherwise ``'cpu'``.

    Returns:
        numpy array with one row per sample, in dataloader order.

    Raises:
        ValueError: from ``np.vstack`` when the dataloader yields no batches.
    """
    if device is None:
        # Fall back to the CPU instead of failing on machines without a GPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model.eval()
    model.to(device)

    embeddings = []
    for batch in dataloader:
        inputs = {key: val.to(device) for key, val in batch.items() if key != 'labels'}
        with torch.no_grad():
            outputs = model(**inputs)
        # First position of the last hidden state (the [CLS]-style token).
        first_token = outputs.last_hidden_state[:, 0, :]
        embeddings.append(first_token.cpu().numpy())

    # Stack the per-batch arrays into a single (n_samples, hidden) array.
    return np.vstack(embeddings)

In [None]:
# Stratified k-fold evaluation. Per fold:
#   1. supervised contrastive training of the encoder (SupCsTrainer),
#   2. training of a classification head on top of the frozen encoder,
#   3. evaluation on the held-out fold.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_metrics = {
    'mcc': [],
    'balanced_accuracy': [],
    'f1': [],
    'precision': [],
    'recall': []
}

# Loop-invariant setup: convert once so the folds can be selected by index arrays.
texts_array = np.array(texts)
labels_array = np.array(labels)

# Only tokenizers without a padding token (e.g. GPT-2) need the EOS fallback;
# overwriting an existing pad token would disagree with the model's pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels), 1):
    print(f"Fold {fold}/{n_splits}")

    # Splitting the data
    train_texts, val_texts = texts_array[train_idx], texts_array[val_idx]
    train_labels, val_labels = labels_array[train_idx], labels_array[val_idx]

    # Tokenization
    train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, return_tensors='pt')
    val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, return_tensors='pt')

    # Dataset preparation
    train_dataset = CustomDataset(train_encodings, train_labels.tolist())
    val_dataset = CustomDataset(val_encodings, val_labels.tolist())

    # Unshuffled loader so the embedding rows stay aligned with train_labels.
    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)

    # Fresh pre-trained encoder for every fold. The Auto class resolves the
    # architecture from model_name, so changing the checkpoint is enough to
    # switch model families.
    model = AutoModel.from_pretrained(model_name)

    # Embeddings of the training fold before any training.
    pre_train_embeddings = extract_embeddings(model, train_dataloader)

    # TrainingArguments for SupCsTrainer
    CL_args = TrainingArguments(
        output_dir=f'./results/fold_{fold}',
        save_total_limit=1,
        num_train_epochs=5,
        per_device_train_batch_size=8,
        evaluation_strategy='no',
        logging_strategy='epoch',
        learning_rate=5e-05,
        eval_steps=500,
        warmup_steps=50,
        report_to='tensorboard',
        weight_decay=0.01,
        logging_dir=f'./logs/fold_{fold}',
    )

    # Initialize and train SupCsTrainer
    SupCL_trainer = SupCsTrainer.SupCsTrainer(
        w_drop_out=[0.0, 0.05],
        temperature=0.05,
        def_drop_out=0.1,
        pooling_strategy='mean',
        model=model,
        args=CL_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    SupCL_trainer.train()
    SupCL_trainer.save_model(f'./cs_baseline/fold_{fold}')

    # Load the contrastively trained encoder into a 2-class sequence classifier.
    fine_tune_model = AutoModelForSequenceClassification.from_pretrained(
        f'./cs_baseline/fold_{fold}', num_labels=2
    )
    # base_model is the encoder underneath the classification head,
    # independent of the architecture-specific attribute name.
    fine_tuned_base_model = fine_tune_model.base_model

    # Freeze the base model's parameters so only the classification head is trained.
    for param in fine_tuned_base_model.parameters():
        param.requires_grad = False

    # Embeddings of the training fold after contrastive training.
    post_train_embeddings = extract_embeddings(fine_tuned_base_model, train_dataloader)

    # TrainingArguments for fine-tuning
    fine_tune_args = TrainingArguments(
        output_dir=f'./fine_tuned/fold_{fold}',
        save_total_limit=1,
        num_train_epochs=10,
        per_device_train_batch_size=30,
        per_device_eval_batch_size=10,
        evaluation_strategy='epoch',
        eval_steps=500,
        learning_rate=1e-03,
        logging_strategy='epoch',
        report_to='tensorboard',
        weight_decay=0.01,
        logging_dir=f'./logs/fine_tuned/fold_{fold}',
    )
    metrics_logger = MetricsLogger()

    # Initialize Trainer for fine-tuning
    fine_tune_trainer = Trainer(
        model=fine_tune_model,
        args=fine_tune_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[metrics_logger],
        compute_metrics=compute_metrics,
    )
    fine_tune_trainer.train()

    # Final evaluation on the held-out fold; metric keys carry an 'eval_' prefix.
    eval_metrics = fine_tune_trainer.evaluate()

    # Store the metrics
    for metric_name in fold_metrics:
        fold_metrics[metric_name].append(eval_metrics[f'eval_{metric_name}'])


In [None]:
# Calculate aggregate metrics across all folds
# Mean of every metric over the folds (keys keep the order of fold_metrics).
aggregate_metrics = dict(zip(fold_metrics, map(np.mean, fold_metrics.values())))

print("Aggregate Metrics Across All Folds:")
for metric, value in aggregate_metrics.items():
    print(f"{metric}: {value:.4f}")

In [None]:
# Standard deviation of every metric across the folds, in a fixed report order.
std_dev_metrics = {
    name: np.std(fold_metrics[name])
    for name in ('precision', 'recall', 'f1', 'mcc', 'balanced_accuracy')
}

# Report the spread of each metric.
for metric, std_dev in std_dev_metrics.items():
    print(f"The standard deviation for {metric} is {std_dev:.4f}")

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project both embedding sets to 2-D with t-SNE (each set is fitted separately).
# The embeddings and train_labels are the ones left over from the last fold.
tsne = TSNE(n_components=2, random_state=42)
tsne_initial_embeddings = tsne.fit_transform(pre_train_embeddings)
tsne_final_embeddings = tsne.fit_transform(post_train_embeddings)

# Side-by-side scatter plots, coloured by training label.
panels = (
    (tsne_initial_embeddings, 'Initial Embeddings'),
    (tsne_final_embeddings, 'Final Embeddings After Training'),
)

plt.figure(figsize=(16, 8))
for position, (points, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(points[:, 0], points[:, 1], c=train_labels, cmap='coolwarm')
    plt.colorbar()
    plt.title(panel_title)

plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of the label clusters before and after training,
# computed on the embeddings of the last fold's training split.
pre_silhouette, post_silhouette = (
    silhouette_score(embeddings, train_labels)
    for embeddings in (pre_train_embeddings, post_train_embeddings)
)

print(f"Silhouette score before training: {pre_silhouette}")
print(f"Silhouette score after training: {post_silhouette}")

In [None]:
import matplotlib.pyplot as plt

# Loss curves recorded by the MetricsLogger of the last fold.
# NOTE(review): the explicit evaluate() call after training presumably logs one
# more 'eval_loss', so the validation curve may have one point more than the
# training curve — confirm before reading the last point as an epoch.
loss_curves = (
    (metrics_logger.training_loss, 'Training Loss'),
    (metrics_logger.validation_loss, 'Validation Loss'),
)

plt.figure(figsize=(10, 6))
for curve, curve_label in loss_curves:
    plt.plot(curve, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss per Epoch')
plt.legend()
plt.show()


In [None]:
# (cell intentionally left empty: it previously held a lone ")" that raised a SyntaxError)