# RoBERTa model tests

In [None]:
# !pip install transformers
# !pip install nltk
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from nltk.tokenize import sent_tokenize
from sklearn.metrics import accuracy_score, f1_score
import nltk

# Fetch NLTK's 'punkt' resource. NOTE(review): presumably intended for
# sent_tokenize, which is imported above but never called in the visible cells.
nltk.download('punkt')

class ParaphraseModel(nn.Module):
    """Two-label RoBERTa sequence classifier whose logits pass through dropout."""

    def __init__(self):
        super().__init__()
        # Keep these attribute names: a module's state_dict keys are derived
        # from them, so a saved checkpoint only matches if they are unchanged.
        self.bert = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the classifier and return ``(loss, logits_after_dropout)``.

        ``loss`` is whatever the wrapped model reports for ``labels``; the
        logits are passed through ``self.dropout`` before being returned.
        """
        result = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return result.loss, self.dropout(result.logits)

# Load the tokenizer and the fine-tuned model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_model = ParaphraseModel().to(device)
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint that was saved on a GPU still loads on a CPU-only machine.
best_model.load_state_dict(torch.load('best_model_fold.bin', map_location=device))
# Inference only: eval() switches the dropout layers off.
best_model.eval()


In [None]:
def compare_texts(text1, text2, model, tokenizer, device, threshold=0.8, max_length=128):
    """Score a pair of texts with the paraphrase classifier and label the pair.

    Args:
        text1: First text of the pair.
        text2: Second text of the pair.
        model: Callable taking ``(input_ids, attention_mask)`` and returning
            ``(loss, logits)``; it should already be in eval mode.
        tokenizer: Tokenizer exposing ``encode_plus``.
        device: Device the encoded tensors are moved to before the forward pass.
        threshold: The pair is labelled "Paraphrase" only when the score is
            strictly greater than this value.
        max_length: Token budget for the encoded pair; longer input is
            truncated and shorter input is padded up to this length.

    Returns:
        ``(score, label)`` where ``score`` is the softmax probability of class
        index 1 and ``label`` is "Paraphrase" or "Not Paraphrase".
    """
    inputs = tokenizer.encode_plus(
        text1,
        text2,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        # No labels are passed, so the loss slot carries nothing useful here.
        _, logits = model(input_ids, attention_mask)
        probs = torch.nn.functional.softmax(logits, dim=1)

    # Column 1 is treated as the "paraphrase" class.
    score = probs.cpu().numpy()[0][1]
    label = "Paraphrase" if score > threshold else "Not Paraphrase"
    return score, label

def display_results(text_pairs, model, tokenizer, device):
    """Print both texts, the score and the verdict for every pair in *text_pairs*."""
    for first, second in text_pairs:
        similarity, verdict = compare_texts(first, second, model, tokenizer, device)
        report = (
            f"Text 1: {first}\n"
            f"Text 2: {second}\n"
            f"Score: {similarity:.2f}\n"
            f"Result: {verdict}\n"
        )
        print(report)

## Tests

In [None]:
# Sample passages for a quick manual check of the model. Only the
# "Cellular Respiration" group is active; the remaining groups are commented out.
texts = [
    # Cellular Respiration
    "Cellular respiration is the process by which cells break down glucose and other molecules from food in the presence of oxygen to produce ATP, which is the main energy currency of the cell. This process involves glycolysis, the citric acid cycle, and the electron transport chain.",
    "In cellular respiration, cells convert glucose and other nutrients into ATP, the primary energy source, using oxygen. This process includes three main stages: glycolysis, the citric acid cycle, and the electron transport chain.",
    "Cellular respiration is a crucial metabolic pathway that allows cells to extract energy from nutrients. This process begins with glycolysis in the cytoplasm, where glucose is broken down into pyruvate. The pyruvate then enters the mitochondria, where it undergoes the citric acid cycle and the electron transport chain, resulting in the production of ATP.",
    "The process of cellular respiration involves several stages, starting with glycolysis in the cell's cytoplasm, where glucose is split into two molecules of pyruvate. These molecules are then transported into the mitochondria, where the citric acid cycle and the electron transport chain take place, producing ATP and releasing carbon dioxide and water as byproducts.",
    # # Photosynthesis
    # "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll. It involves the intake of carbon dioxide and water, which are converted into glucose and oxygen.",
    # "In photosynthesis, green plants and other organisms utilize sunlight to create food using chlorophyll. They take in carbon dioxide and water and convert them into glucose and oxygen.",
    # "Photosynthesis is a complex process that takes place in the chloroplasts of plant cells. It involves several steps, starting with the absorption of light by chlorophyll, followed by the conversion of carbon dioxide and water into glucose and oxygen through the Calvin cycle.",
    # "The process of photosynthesis in plants involves capturing light energy and converting it into chemical energy. This process starts with the absorption of light by chlorophyll and results in the production of glucose and oxygen from carbon dioxide and water.",
    # # Regular Exercise
    # "Regular exercise has numerous benefits, including improving cardiovascular health, enhancing muscle strength, and boosting mental well-being. Exercise helps in weight management, reduces the risk of chronic diseases, and improves sleep quality.",
    # "Engaging in regular physical activity offers several advantages such as better heart health, stronger muscles, and improved mental health. It aids in maintaining a healthy weight, lowers the risk of chronic illnesses, and enhances sleep quality.",
    # "Exercise provides many benefits like improving heart health, increasing muscle strength, and enhancing mental well-being. It can help with weight management, reduce the risk of diseases, and promote better sleep.",
    # "Physical activity is beneficial for cardiovascular health, muscle strength, and mental health. It supports weight control, decreases the chances of chronic diseases, and helps with better sleep.",
    # # Water Cycle
    # "The water cycle is the continuous process by which water moves from the Earth's surface to the atmosphere and back. It includes processes such as evaporation, condensation, precipitation, and runoff.",
    # "The hydrological cycle involves the movement of water between the Earth's surface and the atmosphere. Key stages include evaporation, where water turns into vapor; condensation, where vapor forms clouds; precipitation, where water falls as rain or snow; and runoff, where water flows back to bodies of water.",
    # "The water cycle describes how water evaporates from the surface of the Earth, rises into the atmosphere, cools and condenses into rain or snow in clouds, and falls again to the surface as precipitation.",
    # "The cycle of water involves evaporation, condensation, precipitation, and collection. Water from oceans, rivers, and lakes turns into vapor and rises into the atmosphere. This vapor forms clouds, which then precipitate as rain or snow, eventually returning the water to Earth’s surface."
]
from itertools import combinations
# Every unordered pair of the active texts (4 texts -> 6 pairs).
text_pairs = list(combinations(texts, 2))

# Run the model and display results
display_results(text_pairs, best_model, tokenizer, device)


# Test texts

Text 1 (Paraphrased):
"The process of cellular respiration involves breaking down glucose and other molecules from food to produce ATP, the main energy currency of cells. This process requires oxygen and includes stages such as glycolysis, the citric acid cycle, and the electron transport chain."

Text 2 (Paraphrased):
"Cellular respiration is a method by which cells break down glucose and other food molecules in the presence of oxygen to generate ATP, the primary energy source for the cell. This involves glycolysis, the citric acid cycle, and the electron transport chain."

Text 3 (Original):
"Photosynthesis in green plants involves capturing light energy and converting it into chemical energy in the form of glucose. This process takes place in the chloroplasts and requires carbon dioxide and water, releasing oxygen as a byproduct."

Text 4 (Original):
"The water cycle describes the continuous movement of water on, above, and below the surface of the Earth. It includes processes such as evaporation, condensation, precipitation, and runoff, which help distribute water around the planet."

Question: "What are the benefits of a healthy diet?"

Paraphrased Answers:

Text 1:
A healthy diet provides numerous benefits, such as improving overall health, boosting energy levels, and maintaining a healthy weight. It reduces the risk of chronic diseases, enhances mental well-being, and supports a strong immune system.

Text 2:
Eating a healthy diet offers many advantages, including better overall health, increased energy, and weight management. It lowers the risk of chronic illnesses, improves mental health, and strengthens the immune system.

Unique Answers:

Text 3:
A balanced diet rich in nutrients helps in maintaining optimal body functions, enhances mood, and promotes longevity. It is essential for the prevention of various health conditions and contributes to better physical and mental performance.

Text 4:
Consuming a diet that includes a variety of fruits, vegetables, whole grains, and lean proteins is crucial for sustaining good health. This kind of diet aids in the prevention of obesity, cardiovascular diseases, and diabetes, and it also supports cognitive function and overall vitality.

# Interface (RoBERTa and SBERT)

In [1]:
# !pip install sentence-transformers gradio plotly kaleido

import gradio as gr
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import MDS
from scipy.spatial.distance import cosine
import plotly.express as px

### RoBERTa Model Setup ###
class ParaphraseModel(nn.Module):
    """RoBERTa sequence classifier (two labels) with dropout applied to its logits."""

    def __init__(self):
        super().__init__()
        # These attribute names feed into the module's state_dict keys, so they
        # have to stay as they are for a saved checkpoint to load.
        self.bert = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, logits)``, with the logits passed through dropout.

        ``labels`` is forwarded to the wrapped classifier unchanged; ``loss`` is
        whatever that classifier reports for it.
        """
        classifier_out = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        dropped_logits = self.dropout(classifier_out.logits)
        return classifier_out.loss, dropped_logits

# RoBERTa tokenizer plus the fine-tuned classifier, placed on GPU when one is available
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
best_model_roberta = ParaphraseModel()
best_model_roberta.to(device)  # nn.Module.to moves the module in place
best_model_roberta.load_state_dict(
    torch.load('best_model_fold.bin', map_location=device)
)
best_model_roberta.eval()  # inference mode: dropout disabled

def compare_texts_roberta(text1, text2, model, tokenizer, device):
    """Return the classifier's class-1 softmax probability for a text pair.

    The pair is encoded together (truncated or padded to 128 tokens), moved to
    *device*, and run through *model* without gradient tracking.
    """
    encoded = tokenizer.encode_plus(
        text1,
        text2,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        _loss, logits = model(ids, mask)
        class_probs = torch.softmax(logits, dim=1)

    # Column 1 is treated as the "paraphrase" class.
    return class_probs.cpu().numpy()[0][1]

### SBERT Model Setup ###
# Start from the pretrained 'all-MiniLM-L6-v2' sentence encoder.
model_sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Overwrite its weights with the saved checkpoint. The load is unconditional,
# so the checkpoint file has to exist at this path.
model_save_path = 'best_model_sbert.bin'
model_sbert.load_state_dict(
    torch.load(model_save_path, map_location='cpu')
)
model_sbert.eval()

def compare_texts_sbert(text1, text2, model):
    """Return the cosine similarity between the embeddings of the two texts."""
    # Encode both texts in one call and bring the result to the CPU for SciPy.
    pair = model.encode([text1, text2], convert_to_tensor=True).cpu()
    first_vec, second_vec = pair[0], pair[1]
    # scipy's `cosine` is a distance, so similarity is its complement.
    distance = cosine(first_vec, second_vec)
    return 1 - distance

# Shared functions
def update_dataframe(df):
    """Renumber the "Number" column as 1..len(df) in place and return the same frame."""
    row_count = len(df)
    df["Number"] = range(1, row_count + 1)
    return df

def _pairwise_similarity(texts, model_choice):
    """Build the symmetric similarity matrix for every unordered pair of texts."""
    n = len(texts)
    similarity_matrix = np.zeros((n, n))
    # A text is identical to itself. Leaving the diagonal at 0 would become a
    # self-distance of 1 once the matrix is converted with 1 - similarity.
    np.fill_diagonal(similarity_matrix, 1.0)

    for i in range(n):
        for j in range(i + 1, n):
            if model_choice == 'RoBERTa':
                score = compare_texts_roberta(texts[i], texts[j], best_model_roberta, tokenizer_roberta, device)
            else:
                score = compare_texts_sbert(texts[i], texts[j], model_sbert)
            # Each pair is scored once, in (i, j) order, and mirrored.
            similarity_matrix[i, j] = score
            similarity_matrix[j, i] = score
    return similarity_matrix


def _summarize_clusters(clusters, similarity_matrix):
    """Tabulate each cluster's members and its mean within-cluster similarity."""
    n = len(clusters)
    cluster_ids = np.unique(clusters)
    cluster_texts = {cluster_id: [] for cluster_id in cluster_ids}
    cluster_scores = {cluster_id: [] for cluster_id in cluster_ids}

    for idx, cluster_id in enumerate(clusters):
        cluster_texts[cluster_id].append(f"Text {idx + 1}")

    for i in range(n):
        for j in range(i + 1, n):
            if clusters[i] == clusters[j]:
                cluster_scores[clusters[i]].append(similarity_matrix[i][j])

    cluster_data = {
        "Cluster": [],
        "Texts": [],
        "Average Similarity": []
    }
    for cluster_id, members in cluster_texts.items():
        scores = cluster_scores[cluster_id]
        cluster_data["Cluster"].append(cluster_id)
        cluster_data["Texts"].append(", ".join(members))
        # A single-member cluster has no pairs, so it is reported as 0.
        cluster_data["Average Similarity"].append(np.mean(scores) if scores else 0)
    return pd.DataFrame(cluster_data)


def _plot_clusters(clusters, distance_matrix):
    """Project the precomputed distances to 2-D with MDS and colour points by cluster."""
    n = len(clusters)
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=42)
    mds_results = mds.fit_transform(distance_matrix)

    fig = px.scatter(
        x=mds_results[:, 0],
        y=mds_results[:, 1],
        color=[f"Cluster {c}" for c in clusters],  # Ensure clusters are treated as categorical
        text=[f"Text {i + 1}" for i in range(n)],
        labels={'color': 'Cluster'},
        title="MDS Visualization of Text Clusters"
    )

    # Update layout to show clusters as discrete categories with distinct colors
    fig.update_traces(marker=dict(size=12, opacity=0.8))
    fig.update_layout(
        legend_title_text="Cluster",
        legend=dict(
            itemsizing='constant',
            title=dict(text="Cluster"),
            font=dict(size=12),
        ),
        coloraxis_showscale=False  # Disable the color scale bar
    )
    return fig


def check_similarity(df, model_choice):
    """Cluster the texts in ``df["Answers"]`` by pairwise similarity.

    Args:
        df: DataFrame with an "Answers" column; every row is scored, including
            rows whose answer is an empty string.
        model_choice: 'RoBERTa' selects the RoBERTa classifier score; any other
            value selects SBERT cosine similarity.

    Returns:
        ``(results, fig)``: a DataFrame with "Cluster", "Texts" and
        "Average Similarity" columns, and a Plotly scatter figure of the 2-D
        MDS projection coloured by cluster.

    Raises:
        ValueError: If fewer than two texts are supplied.
    """
    texts = df["Answers"].tolist()
    if len(texts) < 2:
        # Clustering needs at least one pair; fail with a readable message
        # instead of an opaque error from inside scikit-learn.
        raise ValueError("At least two texts are required to check similarity.")

    similarity_matrix = _pairwise_similarity(texts, model_choice)
    # Convert similarity to distance. Clip at 0 so floating-point overshoot
    # (a similarity marginally above 1) cannot produce a negative distance.
    distance_matrix = np.clip(1 - similarity_matrix, 0.0, None)

    clustering = AgglomerativeClustering(metric='precomputed', linkage='average', n_clusters=None, distance_threshold=0.2)
    clusters = clustering.fit_predict(distance_matrix)

    results = _summarize_clusters(clusters, similarity_matrix)
    fig = _plot_clusters(clusters, distance_matrix)
    return results, fig



# HTML and markdown content for the interface
# intro_html is the page rendered with gr.HTML on the "Introduction" tab.
intro_html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Paraphrase Detection Project</title>
</head>
<body>
<h1>Welcome to the Paraphrase Detection Project</h1>
<p>My name is Sangjun Ko, and this project is part of my work as a member of the ATLAS Machine Learning Team at the University of Illinois Urbana-Champaign (UIUC).</p>
<p>This project focuses on detecting paraphrases using advanced Natural Language Processing (NLP) techniques. I use pre-trained RoBERTa and SBERT models to analyze and compare different text inputs to determine if they are paraphrases of each other.</p>
<h2>About NLP and Paraphrase Detection</h2>
<p>Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate goal of NLP is to enable computers to understand, interpret, and generate human languages in a way that is both valuable and meaningful.</p>
<p>Paraphrase detection is a specific task in NLP where the goal is to determine if two pieces of text have the same meaning but are expressed differently. This can be useful in various applications such as plagiarism detection, information retrieval, and question-answering systems.</p>
<h2>Methods Used</h2>
<ul>
<li><strong>RoBERTa Model:</strong> A robustly optimized BERT approach, which is a state-of-the-art model for NLP tasks.</li>
<li><strong>SBERT Model:</strong> A Sentence-BERT model for NLP tasks.</li>
<li><strong>Hugging Face Transformers:</strong> I leverage the Transformers library by Hugging Face, which provides pre-trained models and tools for various NLP tasks.</li>
<li><strong>Dataset Source:</strong> The dataset used for training the models is sourced from the <a href="https://www.microsoft.com/en-us/download/details.aspx?id=52398" target="_blank">Microsoft Paraphrase Corpus</a>.</li>
</ul>
<h2>Learn More</h2>
<p>If you are interested in learning more about NLP and how to build projects like these, here are some resources:</p>
<ul>
<li><a href="https://www.coursera.org/specializations/natural-language-processing" target="_blank">Coursera: Natural Language Processing</a></li>
<li><a href="https://huggingface.co/transformers/" target="_blank">Hugging Face Transformers Documentation</a></li>
<li><a href="https://arxiv.org/abs/1907.11692" target="_blank">RoBERTa: A Robustly Optimized BERT Pretraining Approach</a></li>
</ul>
</body>
</html>
"""

# Markdown shown above the input table on the "Paraphrase Detection" tab.
instructions = """
## Instructions
1. Enter the texts you want to compare in the "Input DataFrame" below. 
2. Each row represents a different text.
3. Select the model you want to use for comparison (RoBERTa or SBERT).
4. Click the "Check for Similarity" button.
5. The results will show the identified clusters and a visualization of the text similarities.
"""

# Markdown shown between the "Check for Similarity" button and the results table.
description = """
## Interpretation Guide

### Results DataFrame:
The "Paraphrase Check Results" table lists the clusters identified among the input texts. Each row in the table represents a cluster, and the "Texts" column lists the text numbers that belong to that cluster.

### Similarity Scores:
The similarity between texts is calculated pairwise, with scores ranging from 0 to 1. A higher score indicates a higher probability that the texts are paraphrases of each other.

### Cluster Visualization:
The MDS visualization plot provides a graphical representation of the clusters. Each point corresponds to a text, and points that are closer together represent texts that are more similar to each other. The color indicates the cluster to which each text belongs.
"""

# Create the Gradio interface: an "Introduction" tab with static HTML and a
# "Paraphrase Detection" tab with the input table, model selector and outputs.
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.Tab("Introduction"):
            gr.HTML(intro_html)
        with gr.Tab("Paraphrase Detection"):
            gr.Markdown("# Paraphrase Detection Interface")
            gr.Markdown(instructions)
            
            # Starting table: five numbered rows with empty answers.
            data = {
                "Number": [1, 2, 3, 4, 5],
                "Answers": [""] * 5  
            }
            dataframe_input = gr.Dataframe(value=pd.DataFrame(data), label="Input DataFrame")
            model_choice = gr.Dropdown(label="Select Model", choices=["RoBERTa", "SBERT"], value="RoBERTa")

            btn_check_similarity = gr.Button("Check for Similarity")

            gr.Markdown(description)
            
            results_output = gr.Dataframe(label="Paraphrase Check Results")
            plot_output = gr.Plot(label="Cluster Visualization")

            # Whenever the input table changes, update_dataframe rewrites its
            # "Number" column and the result is written back to the same table.
            dataframe_input.change(fn=update_dataframe, inputs=dataframe_input, outputs=dataframe_input)

            # check_similarity returns (DataFrame, figure), matching the two outputs.
            btn_check_similarity.click(
                fn=check_similarity, 
                inputs=[dataframe_input, model_choice], 
                outputs=[results_output, plot_output]
            )

# share=True also publishes a temporary public URL (see the "Running on public
# URL" line in the cell output), so anyone holding that link can reach the app.
demo.launch(share=True)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://6cf0b9456880a70f65.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




**MDS**
- Multidimensional Scaling is a technique used for dimensionality reduction. It seeks to place each object in N-dimensional space such that the between-object distances are preserved as well as possible. In this case, we use it to reduce our similarity matrix to 2 dimensions for visualization.
- n_components=2: This specifies that we want to reduce our data to 2 dimensions, which is useful for visualization in a 2D plot.
- dissimilarity="precomputed": This indicates that we are providing a precomputed dissimilarity matrix rather than raw data. Our similarity_matrix is converted to a dissimilarity matrix by subtracting it from 1.