In [None]:
# Install the rouge-score package; a later cell imports `rouge_scorer` from `rouge_score`.
!pip install rouge-score

In [None]:
# Install transformers and torch; a later cell imports BartTokenizer/BartModel and torch
# to build the speech embeddings.
!pip install transformers
!pip install torch

In [None]:
# Reference ("actual") speeches, keyed by speaker id given as a string. Each value is a
# list of three speech excerpts.
# NOTE(review): the irregular punctuation and casing (e.g. "REcoRD") look like
# transcription artefacts in the source data — confirm before "fixing" them.
actual_speeches_dict = {'101112641': ["I thank my colleague. the senior Senator from Alaska. and I thank the Chair. As I think my colleagues have observed. the concern expressed by the senior Senator is not a concern he shares alone. I find It utterly amazing that this agricultural bill there should suddenly be subject to a proposal on fisheries.", "Mr. Leader. for those of us who are on the west coast. Washington. Oregon. Alaska. and Hawaii. as he contemplates the schedule for coming back on Tuesday. would he give some consideration to votes on Tuesday. if he so desires. to coincide with our ability to get back. we can all get back prior to 4:15. but unless we leave on Monday. it is impossible to be here early.", "I commend him for his cooperation and his working with the chairman to the point where we are today. Other Members as well. including Senator MErZENBAUM. Thank you."],
                     '101109290': ["Mr. Chairman. I yield 4 minutes to the gentleman from Louisiana .", "Mr. Chairman. I ask unanimous consent that the amendment in the nature of a substitute made in order as an original text by the rule be printed in the REcoRD and open to amendment at any point.", "Mr. Chairman. I offer amendments and I ask unanimous consent that they be considered en bloc."],
                     '104118220': ["We have touched on the fact that these cuts will be devastating for Medicare recipients.", "If the gentleman will yield. when we are talking about education. we are not just talking about student loans.", "Here you have schools all over America and in the State of Vermont working extremely hard to keep kids off of drugs."]}

# Generated speeches under the same speaker-id keys. The scoring code pairs entries by
# position: the i-th generated speech is compared with the i-th actual speech of that
# speaker. Each generated text opens with the same words as its actual counterpart
# (presumably the prompt given to the generator — TODO confirm).
generated_speeches_dict = {'101112641': ["I thank my colleague, the hon. member for Saint-Maurice—Champlain, for the opportunity to speak to Bill C-218, An Act to amend the Criminal Code and the Controlled Drugs and Substances Act (sentencing for offences involving trafficking in persons).", "Mr. Leader. for those of us who have been here a long time, this is not the first time that we have had to deal with this issue. We have dealt with it in the past, and we will deal with it in the future.", "I commend him for his cooperation and his willingness to work with us."],
                      '101109290': ["Mr. Chairman. I yield myself such time as I may consume.", "Mr. Chairman. I ask unanimous consent that all Members may have 5 legislative days in which to revise and extend their remarks and to include extraneous material on H.R. 2499, as amended.", 'Mr. Chairman. I offer an amendment to H.R. 1011.\nThe amendment is as follows:\n Page 2, line 1, strike out the word "the" and insert in lieu thereof the word "that".'],
                      '104118220': ["We have touched on this subject before.", "If the gentleman will permit me to say a few words, I shall be glad to do so.", "Here you have schools, colleges, and universities."]}

In [None]:
# Preprocess the texts (optional but recommended)
def preprocess_text(text):
    """Normalise a speech for scoring.

    Lowercases ``text``, splits it with ``nltk.word_tokenize`` and joins the
    resulting tokens back together with single spaces.

    Args:
        text: The raw speech as a string.

    Returns:
        The lowercased, space-separated token string.
    """
    lowered = text.lower()
    tokens = nltk.word_tokenize(lowered)
    return " ".join(tokens)

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

def calculate_rouge_bleu_score(id):
    """Print sentence-level BLEU and ROUGE F-measures for one speaker.

    The speeches stored under ``id`` in ``actual_speeches_dict`` and
    ``generated_speeches_dict`` are preprocessed with ``preprocess_text`` and
    compared pairwise by position (first actual vs. first generated, ...).

    Args:
        id: Speaker id used as the key into both speech dictionaries.

    Returns:
        None. All scores are printed.

    Raises:
        KeyError: If ``id`` is missing from either dictionary.
    """
    # preprocess_text tokenises with NLTK, which needs the 'punkt' data. This
    # function runs before the embedding step (the only other place that
    # downloads it), so make sure the data is present here as well.
    # NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
    # against the installed version.
    nltk.download('punkt', quiet=True)

    actual_speeches = [preprocess_text(text) for text in actual_speeches_dict[id]]
    generated_speeches = [preprocess_text(text) for text in generated_speeches_dict[id]]

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    bleu_scores = []
    rouge_scores = []
    for actual_speech, generated_speech in zip(actual_speeches, generated_speeches):
        # BLEU: the actual speech is the single reference, the generated one the hypothesis.
        bleu_scores.append(sentence_bleu([actual_speech.split()], generated_speech.split()))
        # RougeScorer.score expects (target, prediction): reference first, so that
        # precision/recall in the stored scores refer to the generated speech.
        rouge_scores.append(scorer.score(actual_speech, generated_speech))

    print("BLEU Scores:")
    for i, bleu_score in enumerate(bleu_scores, 1):
        print(f"Speech {i}: {bleu_score}")

    for i, scores in enumerate(rouge_scores, 1):
        print(f"ROUGE Scores for speech {i}:")
        print(f"ROUGE-1: {scores['rouge1'].fmeasure}")
        print(f"ROUGE-2: {scores['rouge2'].fmeasure}")
        print(f"ROUGE-L: {scores['rougeL'].fmeasure}")

In [None]:
import nltk
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from transformers import BartTokenizer, BartModel
import torch

# Loaded (tokenizer, model) pairs keyed by checkpoint name, so that repeated calls
# within one kernel session load each checkpoint only once.
_BART_CACHE = {}


def _load_bart(model_name='facebook/bart-large'):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it on first use."""
    if model_name not in _BART_CACHE:
        _BART_CACHE[model_name] = (
            BartTokenizer.from_pretrained(model_name),
            BartModel.from_pretrained(model_name),
        )
    return _BART_CACHE[model_name]


def _embed_text(text, tokenizer, model):
    """Return one vector for ``text``: the mean over tokens of the model's last hidden state."""
    # truncation=True caps the input at the tokenizer's maximum length so an
    # over-long speech is shortened rather than passed to the model at full length.
    input_ids = tokenizer.encode(
        text, return_tensors="pt", add_special_tokens=True, truncation=True
    )
    with torch.no_grad():
        output = model(input_ids)
    return output.last_hidden_state.mean(dim=1).squeeze().numpy()


def _pairwise_cosine(left, right):
    """Return the matrix whose entry (i, j) is the cosine similarity of left[i] and right[j]."""
    left = np.asarray(left)
    right = np.asarray(right)
    scale = np.linalg.norm(left, axis=1)[:, None] * np.linalg.norm(right, axis=1)
    return np.dot(left, right.T) / scale


def create_embeddings_and_calculate_cos_sim(id):
    """Embed one speaker's speeches with BART, print cosine similarities and plot a 3-D PCA.

    The speeches stored under ``id`` in ``actual_speeches_dict`` and
    ``generated_speeches_dict`` are preprocessed with ``preprocess_text`` and
    embedded. The cosine similarity of every (actual, generated) pair is printed,
    and all embeddings are projected to three PCA components and shown in a
    scatter plot (actual in blue, generated in red).

    Args:
        id: Speaker id used as the key into both speech dictionaries.

    Returns:
        None. Results are printed and plotted.

    Raises:
        KeyError: If ``id`` is missing from either dictionary.
    """
    # preprocess_text tokenises with NLTK, which needs the 'punkt' data.
    nltk.download('punkt', quiet=True)

    actual_speeches = [preprocess_text(text) for text in actual_speeches_dict[id]]
    generated_speeches = [preprocess_text(text) for text in generated_speeches_dict[id]]

    tokenizer, model = _load_bart()

    actual_embeddings = np.array(
        [_embed_text(text, tokenizer, model) for text in actual_speeches]
    )
    generated_embeddings = np.array(
        [_embed_text(text, tokenizer, model) for text in generated_speeches]
    )

    cosine_similarity = _pairwise_cosine(actual_embeddings, generated_embeddings)

    # Every actual speech is compared with every generated speech, not only its
    # positional counterpart.
    print("Cosine Similarity:")
    for i in range(len(actual_speeches)):
        for j in range(len(generated_speeches)):
            print(f"Speech Pair ({i+1}, {j+1}): {cosine_similarity[i, j]}")

    # Fit one PCA on actual and generated embeddings together so both groups
    # share the same three axes, then split the projection back apart.
    pca = PCA(n_components=3)
    pca_embeddings = pca.fit_transform(
        np.concatenate((actual_embeddings, generated_embeddings), axis=0)
    )
    actual_pca_embeddings = pca_embeddings[: len(actual_embeddings)]
    generated_pca_embeddings = pca_embeddings[len(actual_embeddings) :]

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(actual_pca_embeddings[:, 0], actual_pca_embeddings[:, 1], actual_pca_embeddings[:, 2], c='blue', label='Actual Speeches')
    ax.scatter(generated_pca_embeddings[:, 0], generated_pca_embeddings[:, 1], generated_pca_embeddings[:, 2], c='red', label='Generated Speeches')
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')
    ax.set_zlabel('PCA Component 3')
    ax.legend()
    plt.show()


In [None]:
def print_results(id):
    """Print the full comparison report for one speaker.

    Prints a header with the speaker id, then runs the ROUGE/BLEU scoring and
    the embedding/cosine-similarity step, each under its own banner line.

    Args:
        id: Speaker id; it is converted with ``str`` before being used as the
            lookup key by the two scoring functions.
    """
    speaker_key = str(id)
    print(f'Speaker id:{speaker_key}')

    print('========== Rouge and Bleu Score ==========')
    calculate_rouge_bleu_score(speaker_key)

    print('========== Cosine Similarities and Embeddings ==========')
    create_embeddings_and_calculate_cos_sim(speaker_key)


In [None]:
# Report on each speaker in turn, in the same order as before.
for speaker_id in (101112641, 101109290, 104118220):
    print_results(speaker_id)