In [None]:
# Import required libraries
from transformers import BigBirdForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from torch.utils.data import Dataset
import pandas as pd
from google.colab import files
import joblib
import os
import torch.nn.functional as F

In [None]:
# Step 2: Class for defining the custom dataset
class DialogueDataset(Dataset):
    """Torch dataset of (context, response, label) rows for pair classification.

    Columns are read by position: column 0 is the context, column 1 the
    response and column 2 the class label (converted to a ``torch.long``
    tensor).

    Args:
        dataframe: pandas DataFrame holding one example per row.
        tokenizer: Hugging Face-style tokenizer, callable as
            ``tokenizer(text, text_pair, ...)``.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (examples) in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return the tensors for one training example.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``; also
            ``token_type_ids`` when the tokenizer produces them.
        """
        context = self.dataframe.iloc[idx, 0]
        response = self.dataframe.iloc[idx, 1]
        label = self.dataframe.iloc[idx, 2]

        # Encode as a genuine sentence pair (text, text_pair) instead of
        # gluing the strings together with sep_token. This is the same call
        # shape CoherenceEvaluator.tokenize_input uses at inference time, so
        # the model is trained on the input format it is later scored with.
        encoding = self.tokenizer(
            context,
            response,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch axis; squeeze(0) removes it
        # so each item is a single example.
        item = {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long),
        }
        # Forward segment ids when the tokenizer emits them, so training sees
        # the same inputs the evaluator passes to the model.
        if "token_type_ids" in encoding:
            item["token_type_ids"] = encoding["token_type_ids"].squeeze(0)
        return item

# Step 3: Class for model training
class ModelTrainer:
    """Fine-tune ``google/bigbird-roberta-base`` as a sequence classifier.

    Loads the tokenizer and the classification model, prepares the
    ``TrainingArguments`` and makes every model parameter contiguous in
    memory before training.

    Args:
        train_dataset: Dataset handed to the Hugging Face ``Trainer``.
    """

    def __init__(self, train_dataset):
        self.tokenizer = AutoTokenizer.from_pretrained('google/bigbird-roberta-base')
        self.model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base')
        self.train_dataset = train_dataset
        self.training_args = self._setup_training_args()
        self._make_parameters_contiguous()

    def _make_parameters_contiguous(self):
        """Replace non-contiguous parameter storage with contiguous copies.

        Prints each parameter that had to be fixed, then the contiguity state
        of every parameter.
        """
        # NOTE(review): presumably a workaround for checkpoint saving failing
        # on non-contiguous tensors -- confirm before removing.
        for name, param in self.model.named_parameters():
            if not param.is_contiguous():
                print(f'Making contiguous:{name}')
                param.data = param.data.contiguous()
        for name, param in self.model.named_parameters():
            print(f'Layer:{name}, Contiguous:{param.is_contiguous()}')

    def _setup_training_args(self):
        """Build the ``TrainingArguments`` for a short, single-epoch run."""
        # `evaluation_strategy="no"` is intentionally not passed: "no" is the
        # default, and the keyword was renamed to `eval_strategy` in newer
        # transformers releases, so omitting it works with old and new
        # versions alike.
        return TrainingArguments(
            output_dir='./results',
            num_train_epochs=1,  # Quick testing with 1 epoch
            per_device_train_batch_size=8,
            learning_rate=2e-5,
            # NOTE(review): the run logged in this notebook only reports
            # steps 50 and 100 before saving, so a 500-step warmup would never
            # finish there -- consider a ratio-based warmup for small datasets.
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=50,
            save_total_limit=2,
            save_steps=200,
        )

    def fine_tune_model(self):
        """Train ``self.model`` on ``self.train_dataset`` and return the model."""
        trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset
        )
        trainer.train()
        return self.model

    def save_model(self, save_path):
        """Save model and tokenizer to ``save_path`` so both reload from one place."""
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
        print(f"Model saved to {save_path}")

# Step 4: Class for coherence evaluation
class CoherenceEvaluator:
    """Score (context, response) pairs with a BigBird sequence classifier.

    Args:
        model_path: Local directory or hub id passed to ``from_pretrained``
            for both the tokenizer and the model.
        max_length: Length each encoded pair is padded/truncated to.
            Defaults to 1024, the value previously hard-coded here.
    """

    def __init__(self, model_path, max_length=1024):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = BigBirdForSequenceClassification.from_pretrained(model_path)
        # Inference only: make sure dropout and similar layers are disabled.
        self.model.eval()
        # NOTE(review): the training log in this notebook shows BigBird
        # falling back to 'original_full' attention for 256-token inputs;
        # confirm the attention type used at this length matches training.
        self.max_length = max_length

    def tokenize_input(self, context, response):
        """Encode one (context, response) pair as a padded batch of size 1."""
        return self.tokenizer(
            context,
            response,
            return_tensors='pt',
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
        )

    def compute_logits(self, inputs):
        """Run the model on ``inputs`` and return its raw logits.

        The forward pass runs under ``torch.no_grad()`` so no autograd graph
        is built while scoring.
        """
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.logits

    def apply_softmax(self, logits):
        """Return the softmax probability of class index 1 for the first row.

        NOTE(review): index 1 is presumably the "coherent" label -- confirm
        against the training data's label convention.
        """
        probabilities = F.softmax(logits, dim=1)
        return probabilities[0][1].item()

# Step 5: Main pipeline class to encapsulate the entire process
class CoherencePipeline:
    """End-to-end pipeline: optionally fine-tune, then score a dialogue.

    Args:
        dataset_path: CSV file used for training when ``train_model`` is
            true. When ``dialogue_path`` is not given, this path is also the
            file that gets scored (original behaviour).
        model_save_path: Directory the fine-tuned model is saved to / loaded
            from, or a ``google/...`` hub id when not training.
        train_model: Train and save a model before scoring when true.
        dialogue_path: Optional text file with one dialogue turn per line to
            score. Lets training data and the scored dialogue be different
            files.
    """

    def __init__(self, dataset_path, model_save_path, train_model=True, dialogue_path=None):
        self.file_path = dataset_path
        self.model_save_path = model_save_path
        self.train_model = train_model
        self.dialogue_path = dialogue_path
        self.model_trainer = None
        self.coherence_evaluator = None

    def prepare_dataset(self):
        """Read the training CSV and wrap it in a ``DialogueDataset``."""
        df = pd.read_csv(self.file_path)
        tokenizer = AutoTokenizer.from_pretrained('google/bigbird-roberta-base')
        train_dataset = DialogueDataset(df, tokenizer, max_length=256)
        return train_dataset

    def train_and_save_model(self, train_dataset):
        """Fine-tune on ``train_dataset``, save to ``model_save_path``, return the model."""
        self.model_trainer = ModelTrainer(train_dataset)
        trained_model = self.model_trainer.fine_tune_model()
        self.model_trainer.save_model(self.model_save_path)
        return trained_model

    def evaluate_coherence(self):
        """Score every pair of consecutive dialogue turns.

        Returns:
            DataFrame with one row per pair (pair number, context, response,
            coherence score) plus a final ``'Overall'`` row holding the mean
            score.

        Raises:
            ValueError: If the dialogue file has fewer than two non-empty
                lines, so no pair can be formed.
        """
        dialogue_file = self.dialogue_path or self.file_path
        with open(dialogue_file, 'r', encoding='utf-8') as file:
            stripped = (line.strip() for line in file)
            # Blank lines are not dialogue turns; keeping them would create
            # pairs with an empty context or response.
            turns = [line for line in stripped if line]

        # Validate before loading the model: fail fast, and avoid the
        # division by zero an empty score list would cause below.
        if len(turns) < 2:
            raise ValueError(
                f"Need at least two dialogue turns in {dialogue_file} to form a pair; "
                f"found {len(turns)}."
            )

        pairs = list(zip(turns, turns[1:]))
        self.coherence_evaluator = CoherenceEvaluator(self.model_save_path)

        scores = []
        for context, response in pairs:
            inputs = self.coherence_evaluator.tokenize_input(context, response)
            logits = self.coherence_evaluator.compute_logits(inputs)
            scores.append(self.coherence_evaluator.apply_softmax(logits))

        # Create DataFrame to store results
        df_results = pd.DataFrame({
            'Pair Number': [f'Pair {i+1}' for i in range(len(pairs))],
            'Context': [pair[0] for pair in pairs],
            'Response': [pair[1] for pair in pairs],
            'Coherence Score': scores
        })

        # Calculate overall coherence score
        overall_score = sum(scores) / len(scores)
        df_results.loc['Overall'] = ['', '', 'Overall Coherence Score', overall_score]

        return df_results

    def run_pipeline(self):
        """Train if requested (otherwise check the model exists), then score.

        Returns:
            The DataFrame produced by ``evaluate_coherence``.

        Raises:
            FileNotFoundError: When not training and ``model_save_path`` is
                neither a ``google/`` hub id nor an existing local path.
        """
        if self.train_model:
            # Train model if flag is set to True
            train_dataset = self.prepare_dataset()
            self.train_and_save_model(train_dataset)
            if self.dialogue_path is None:
                # Without a separate dialogue file the training CSV itself is
                # read line by line as dialogue turns, which is rarely intended.
                print(
                    "Warning: no dialogue_path given; scoring the training file "
                    f"{self.file_path} line by line."
                )
        else:
            # Check if fine-tuned model exists
            if self.model_save_path.startswith('google/'):
                print(f'Using pretrained model from Hugging Face:{self.model_save_path}')
            else:
                if not os.path.exists(self.model_save_path):
                    raise FileNotFoundError(f"No fine-tuned model found at {self.model_save_path}. Please train the model first.")
                print(f"Using existing model from {self.model_save_path}")

        # Proceed to evaluate test data
        df_results = self.evaluate_coherence()
        print(df_results)
        return df_results


In [None]:
# Step 6: fine-tune on the CSV dataset, save the model, then run the scoring step
pipeline = CoherencePipeline(
    model_save_path="./coherence_model",
    dataset_path="/content/dialogues_dataset.csv",
    train_model=True,  # True -> train first; False -> reuse the model already saved
)

# Execute every stage and keep the resulting score table
df_results = pipeline.run_pipeline()

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Making contiguous:bert.encoder.layer.0.attention.self.query.weight
Making contiguous:bert.encoder.layer.0.attention.self.key.weight
Making contiguous:bert.encoder.layer.0.attention.self.value.weight
Making contiguous:bert.encoder.layer.0.attention.output.dense.weight
Making contiguous:bert.encoder.layer.0.intermediate.dense.weight
Making contiguous:bert.encoder.layer.0.output.dense.weight
Making contiguous:bert.encoder.layer.1.attention.self.query.weight
Making contiguous:bert.encoder.layer.1.attention.self.key.weight
Making contiguous:bert.encoder.layer.1.attention.self.value.weight
Making contiguous:bert.encoder.layer.1.attention.output.dense.weight
Making contiguous:bert.encoder.layer.1.intermediate.dense.weight
Making contiguous:bert.encoder.layer.1.output.dense.weight
Making contiguous:bert.encoder.layer.2.attention.self.query.weight
Making contiguous:bert.encoder.layer.2.attention.self.key.weight
Making contiguous:bert.encoder.layer.2.attention.self.value.weight
Making contiguous

Attention type 'block_sparse' is not possible if sequence_length: 256 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Step,Training Loss
50,0.6609
100,0.5022


Model saved to ./coherence_model


KeyboardInterrupt: 

In [None]:
# Step 6 (inference only): score a dialogue file with the saved fine-tuned model
pipeline = CoherencePipeline(
    model_save_path="./coherence_model",
    dataset_path="/content/dialogue1.txt",
    train_model=False,  # False -> reuse the saved model; True -> retrain first
)

# Execute the pipeline and keep the per-pair score table
df_result = pipeline.run_pipeline()

Using existing model from ./coherence_model
        Pair Number                                            Context  \
0            Pair 1  ["AI: Hi, my name is Lila. I'm Octivo's AI age...   
1            Pair 2  "Caller: Hey, nice to meet you. My name is Mic...   
2            Pair 3  "AI: Thank you for introducing yourself Michae...   
3            Pair 4  "Caller: Yeah, sure. I'm 27 but I feel like I ...   
4            Pair 5  "AI: I completely understand your hesitation a...   
5            Pair 6  "Caller: Ok, that's fair enough. So I'm earnin...   
6            Pair 7  "AI: Thank you for sharing your income range t...   
7            Pair 8  "Caller: I will retire at around 65 and I woul...   
Overall                                                                  

                                                  Response  Coherence Score  
0        "Caller: Hey, nice to meet you. My name is Mic...         0.590054  
1        "AI: Thank you for introducing yourself Michae... 

In [None]:
# Step 6: Run the pipeline (using bigbird pretrained model)
# The hub id must start with 'google/' (no leading dot): run_pipeline only
# treats ids with that exact prefix as Hugging Face models and otherwise
# raises FileNotFoundError for a missing local path.
# NOTE: this checkpoint has no fine-tuned classification head (its classifier
# weights are newly initialised on load), so these scores are only a baseline.
pipeline = CoherencePipeline(
    dataset_path = '/content/dialogue1.txt',
    model_save_path='google/bigbird-roberta-base',
    train_model=False  # Set to True if you want to train, False to use existing model
)

# Run the pipeline
df_results = pipeline.run_pipeline()

Using pretrained model from Hugging Face:google/bigbird-roberta-base


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


        Pair Number                                            Context  \
0            Pair 1  ["AI: Hi, my name is Lila. I'm Octivo's AI age...   
1            Pair 2  "Caller: Hey, nice to meet you. My name is Mic...   
2            Pair 3  "AI: Thank you for introducing yourself Michae...   
3            Pair 4  "Caller: Yeah, sure. I'm 27 but I feel like I ...   
4            Pair 5  "AI: I completely understand your hesitation a...   
5            Pair 6  "Caller: Ok, that's fair enough. So I'm earnin...   
6            Pair 7  "AI: Thank you for sharing your income range t...   
7            Pair 8  "Caller: I will retire at around 65 and I woul...   
Overall                                                                  

                                                  Response  Coherence Score  
0        "Caller: Hey, nice to meet you. My name is Mic...         0.520310  
1        "AI: Thank you for introducing yourself Michae...         0.543495  
2        "Caller: Yeah, s