In [3]:
# !pip install sentence-transformers

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score
from scipy.spatial.distance import cosine

# Load your dataset
def load_dataset(file_path):
    """Read a tab-separated paraphrase file into a DataFrame.

    The first line of the file is treated as a header and discarded. Every
    remaining line is stripped, split on tabs, and kept only if it yields
    exactly five fields; other lines are skipped.

    Args:
        file_path: Path to the tab-separated text file.

    Returns:
        pandas.DataFrame with the columns "Quality", "#1 ID", "#2 ID",
        "#1 String" and "#2 String". "Quality" is cast to int; the other
        columns stay as strings. An empty (or header-only) file yields an
        empty DataFrame with these columns.

    Raises:
        ValueError: if a kept row has a non-integer "Quality" field.
    """
    columns = ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"]
    rows = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(file, None)
        for line in file:
            fields = line.strip().split('\t')
            if len(fields) == len(columns):
                rows.append(fields)
    df = pd.DataFrame(rows, columns=columns)
    df['Quality'] = df['Quality'].astype(int)
    return df

def _make_examples(df):
    """Turn each row of a paraphrase DataFrame into an InputExample.

    Reads the "#1 String", "#2 String" and "Quality" columns; the two strings
    become the example's texts and the quality flag (cast to int) its label.
    Columns are zipped instead of calling `df.iloc[i]` per field, which
    builds a new row Series on every access.
    """
    return [
        InputExample(texts=[first, second], label=int(quality))
        for first, second, quality in zip(df['#1 String'], df['#2 String'], df['Quality'])
    ]


df_train = load_dataset('msr_paraphrase_train.txt')
df_test = load_dataset('msr_paraphrase_test.txt')

# Prepare the dataset for training
train_samples = _make_examples(df_train)
test_samples = _make_examples(df_test)

# Load the SBERT model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Prepare the dataloader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model=model)

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

# Evaluate the model
def evaluate_model(model, test_samples, threshold=0.5):
    """Score a sentence-embedding model on labelled sentence pairs.

    Each sample's two texts are encoded with `model.encode`, their cosine
    similarity is computed, and the pair is predicted as a paraphrase (1)
    when the similarity is strictly greater than `threshold`, else 0.

    Args:
        model: Object exposing `encode(list_of_texts)` that returns one
            embedding per text.
        test_samples: Iterable of samples with a two-element `texts`
            attribute and a 0/1 `label` attribute.
        threshold: Similarity cut-off for predicting a paraphrase.
            Defaults to 0.5, the value previously hard-coded here.

    Returns:
        Tuple `(accuracy, f1)` from `accuracy_score` and `f1_score`.
    """
    predictions = []
    labels = []
    for sample in test_samples:
        embeddings = model.encode(sample.texts)
        # scipy's `cosine` is a distance, so similarity is 1 - distance.
        cosine_sim = 1 - cosine(embeddings[0], embeddings[1])
        predictions.append(1 if cosine_sim > threshold else 0)
        labels.append(sample.label)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    return accuracy, f1

# Score the freshly fine-tuned model on the test pairs
accuracy, f1 = evaluate_model(model=model, test_samples=test_samples)
print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1}')

# Persist the weights (state dict only, not the whole model object)
model_save_path = 'best_model_sbert.bin'
torch.save(obj=model.state_dict(), f=model_save_path)

# Rebuild the same base model and restore the saved weights into it
loaded_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
loaded_model.load_state_dict(state_dict=torch.load(f=model_save_path))

# Round-trip check: the reloaded model is scored the same way as above
accuracy, f1 = evaluate_model(model=loaded_model, test_samples=test_samples)
print(f'Loaded Model Accuracy: {accuracy}')
print(f'Loaded Model F1 Score: {f1}')


100%|██████████| 255/255 [00:24<00:00, 10.26it/s]


{'train_runtime': 24.8374, 'train_samples_per_second': 164.108, 'train_steps_per_second': 10.267, 'train_loss': 0.1707887088551241, 'epoch': 1.0}
Accuracy: 0.7559420289855072
F1 Score: 0.8404698749526336




Loaded Model Accuracy: 0.7559420289855072
Loaded Model F1 Score: 0.8404698749526336


# Gradio

In [5]:
# !pip install sentence-transformers gradio

import gradio as gr
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, losses, InputExample
from torch.utils.data import DataLoader
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, f1_score
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

# Load the SBERT model.
# The checkpoint below is written by the training cell from a model built on
# 'paraphrase-MiniLM-L6-v2', so the same base is used here. Loading the
# weights into a different base ('all-MiniLM-L6-v2') would pair them with
# that model's tokenizer/module configuration rather than the one used
# during fine-tuning.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load the model state dictionary if you have a saved model
# (map_location keeps the load working on machines without a GPU)
model_save_path = 'best_model_sbert.bin'
model.load_state_dict(torch.load(model_save_path, map_location=torch.device('cpu')))

# Set the model to evaluation mode
model.eval()

# Function to compare texts using the SBERT model
def compare_texts(text1, text2, model):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded in a single `model.encode` call (as tensors),
    moved to the CPU, and compared with scipy's cosine distance; the result
    is `1 - distance`.
    """
    first_vec, second_vec = model.encode([text1, text2], convert_to_tensor=True).cpu()
    return 1 - cosine(first_vec, second_vec)

# Function to update the dataframe with text numbers
def update_dataframe(df):
    """Renumber the rows of `df` 1..len(df) in its "Number" column.

    The column is written on the DataFrame that was passed in, and that same
    object is returned.
    """
    row_count = len(df)
    df["Number"] = range(1, row_count + 1)
    return df

# Function to determine perplexity for t-SNE based on number of samples
def determine_perplexity(num_samples):
    """Pick a t-SNE perplexity for a given number of samples.

    The value comes from a fixed step table (2, 5, 10, 30, 50 for sample
    counts below 10, 50, 100, 500, and 500 or more) and is then capped at
    `num_samples - 1`, because scikit-learn's t-SNE rejects a perplexity
    that is not strictly smaller than the number of samples. The cap only
    changes the result for `num_samples <= 2`.

    Args:
        num_samples: Number of points that will be embedded.

    Returns:
        int perplexity, never below 1. For `num_samples <= 1` the returned 1
        is still not a usable perplexity; such inputs cannot be embedded.
    """
    if num_samples < 10:
        base = 2
    elif num_samples < 50:
        base = 5
    elif num_samples < 100:
        base = 10
    elif num_samples < 500:
        base = 30
    else:
        base = 50
    # Keep perplexity strictly below the sample count (floor of 1).
    return min(base, max(num_samples - 1, 1))

# Function to check similarity and generate clusters
def _cosine_similarity_matrix(embeddings):
    """Return the symmetric pairwise cosine-similarity matrix of row vectors."""
    vectors = np.asarray(embeddings, dtype=np.float64)
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid dividing by zero for all-zero embeddings
    unit = vectors / norms
    similarity = unit @ unit.T
    similarity = (similarity + similarity.T) / 2.0  # remove rounding asymmetry
    similarity = np.clip(similarity, -1.0, 1.0)
    np.fill_diagonal(similarity, 1.0)  # every text is identical to itself
    return similarity


def check_similarity(df):
    """Cluster the texts in `df["Answers"]` by similarity and plot them.

    All texts are encoded once with the module-level `model`, a pairwise
    cosine-similarity matrix is built, and `1 - similarity` is used as a
    precomputed distance for average-linkage agglomerative clustering
    (distance threshold 0.5) and for a 2-D t-SNE projection. The scatter
    plot is written to "tsne_clusters.png" in the working directory.

    Args:
        df: DataFrame with an "Answers" column, one text per row. Missing
            cells are treated as empty strings.

    Returns:
        Tuple `(clusters_df, image_path)`. `clusters_df` has the columns
        "Cluster", "Texts" (comma-separated "Text <row number>" labels) and
        "Average Similarity" (mean pairwise similarity within the cluster,
        NaN for a cluster with a single text). `image_path` is
        "tsne_clusters.png".

    Raises:
        ValueError: if fewer than two texts are supplied, since neither
            clustering nor t-SNE can run on fewer than two samples.
    """
    texts = df["Answers"].fillna("").astype(str).tolist()
    n = len(texts)
    if n < 2:
        raise ValueError("At least two texts are required to check similarity.")

    # Encode every text once instead of re-encoding both texts of every pair.
    embeddings = model.encode(texts, convert_to_numpy=True)
    similarity_matrix = _cosine_similarity_matrix(embeddings)
    # Convert similarity to distance; the clip guards against tiny negative
    # values from rounding, and the diagonal is exactly zero.
    distance_matrix = np.clip(1.0 - similarity_matrix, 0.0, None)

    clustering = AgglomerativeClustering(metric='precomputed', linkage='average', n_clusters=None, distance_threshold=0.5)
    clusters = clustering.fit_predict(distance_matrix)

    cluster_data = {
        "Cluster": [],
        "Texts": [],
        "Average Similarity": []
    }
    for cluster_id in np.unique(clusters):
        members = np.flatnonzero(clusters == cluster_id)
        if len(members) > 1:
            within = similarity_matrix[np.ix_(members, members)]
            # Upper triangle without the diagonal = each pair counted once.
            average = within[np.triu_indices(len(members), k=1)].mean()
        else:
            # No pairs in a singleton cluster; report NaN without triggering
            # numpy's "mean of empty slice" warning.
            average = np.nan
        cluster_data["Cluster"].append(cluster_id)
        cluster_data["Texts"].append(", ".join(f"Text {idx + 1}" for idx in members))
        cluster_data["Average Similarity"].append(average)

    # t-SNE needs perplexity strictly below the number of samples.
    perplexity = min(determine_perplexity(n), n - 1)

    # Fixed seed so repeated runs on the same texts give the same picture.
    tsne = TSNE(n_components=2, metric="precomputed", perplexity=perplexity, init='random', random_state=42)
    tsne_results = tsne.fit_transform(distance_matrix)

    fig, ax = plt.subplots(figsize=(10, 7))
    scatter = ax.scatter(tsne_results[:, 0], tsne_results[:, 1], c=clusters, cmap='viridis')
    for idx in range(n):
        ax.annotate(f"Text {idx + 1}", (tsne_results[idx, 0], tsne_results[idx, 1]))
    ax.set_title("t-SNE Visualization of Text Clusters")
    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")
    fig.colorbar(scatter, ax=ax)
    fig.tight_layout()
    fig.savefig("tsne_clusters.png")
    plt.close(fig)  # release the figure; otherwise one leaks per request

    return pd.DataFrame(cluster_data), "tsne_clusters.png"

# HTML and markdown content for the interface
# intro_html is rendered on the "Introduction" tab.
intro_html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Paraphrase Detection Project</title>
</head>
<body>
<h1>Welcome to the Paraphrase Detection Project</h1>
<p>My name is Sangjun Ko, and this project is part of my work as a member of the ATLAS Machine Learning Team at the University of Illinois Urbana-Champaign (UIUC).</p>
<p>This project focuses on detecting paraphrases using advanced Natural Language Processing (NLP) techniques. I use a pre-trained SBERT model to analyze and compare different text inputs to determine if they are paraphrases of each other.</p>
<h2>About NLP and Paraphrase Detection</h2>
<p>Natural Language Processing (NLP) is a field of artificial intelligence that focuses on the interaction between computers and humans through natural language. The ultimate goal of NLP is to enable computers to understand, interpret, and generate human languages in a way that is both valuable and meaningful.</p>
<p>Paraphrase detection is a specific task in NLP where the goal is to determine if two pieces of text have the same meaning but are expressed differently. This can be useful in various applications such as plagiarism detection, information retrieval, and question-answering systems.</p>
<h2>Methods Used</h2>
<ul>
<li><strong>SBERT Model:</strong> A Sentence-BERT model for NLP tasks.</li>
<li><strong>Hugging Face Transformers:</strong> I leverage the Transformers library by Hugging Face, which provides pre-trained models and tools for various NLP tasks.</li>
<li><strong>Dataset Source:</strong> The dataset used for training the model is sourced from the <a href="https://www.microsoft.com/en-us/download/details.aspx?id=52398" target="_blank">Microsoft Paraphrase Corpus</a>.</li>
</ul>
<h2>Learn More</h2>
<p>If you are interested in learning more about NLP and how to build projects like these, here are some resources:</p>
<ul>
<li><a href="https://www.coursera.org/specializations/natural-language-processing" target="_blank">Coursera: Natural Language Processing</a></li>
<li><a href="https://huggingface.co/transformers/" target="_blank">Hugging Face Transformers Documentation</a></li>
<li><a href="https://arxiv.org/abs/1908.10084" target="_blank">SBERT: Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks</a></li>
</ul>
</body>
</html>
"""

# Markdown shown above the input table on the "Paraphrase Detection" tab.
instructions = """
## Instructions
1. Enter the texts you want to compare in the "Input DataFrame" below. 
2. Each row represents a different text.
3. After entering your texts, click the "Check for Similarity" button.
4. The results will show the identified clusters and a visualization of the text similarities.
"""

# Markdown shown below the button, explaining how to read the results table
# and the t-SNE plot.
description = """
## Interpretation Guide

### Results DataFrame:
The "Paraphrase Check Results" table lists the clusters identified among the input texts. Each row in the table represents a cluster, and the "Texts" column lists the text numbers that belong to that cluster.

### Similarity Scores:
The similarity between texts is calculated pairwise, with scores ranging from 0 to 1. A higher score indicates a higher probability that the texts are paraphrases of each other.

### Cluster Visualization:
The t-SNE visualization plot provides a graphical representation of the clusters. Each point corresponds to a text, and points that are closer together represent texts that are more similar to each other. The color indicates the cluster to which each text belongs.

"""

# Create the Gradio interface
# Components appear in the page in the order they are created inside the
# nested context managers, so statement order here is the layout.
with gr.Blocks() as demo:
    with gr.Tabs():
        # Tab 1: static project introduction.
        with gr.Tab("Introduction"):
            gr.HTML(intro_html)
        # Tab 2: the interactive tool.
        with gr.Tab("Paraphrase Detection"):
            gr.Markdown("# Paraphrase Detection Interface")
            gr.Markdown(instructions)
            
            # Starting table: five numbered rows with empty answers.
            data = {
                "Number": [1, 2, 3, 4, 5],
                "Answers": [""] * 5  
            }
            dataframe_input = gr.Dataframe(value=pd.DataFrame(data), label="Input DataFrame")

            btn_check_similarity = gr.Button("Check for Similarity")

            gr.Markdown(description)
            
            results_output = gr.Dataframe(label="Paraphrase Check Results")
            image_output = gr.Image(label="Cluster Visualization")

            # Whenever the table changes, renumber its rows and write the
            # result back into the same component.
            # NOTE(review): the output is also the input of this change
            # event - confirm this cannot re-trigger itself in the Gradio
            # version in use.
            dataframe_input.change(fn=update_dataframe, inputs=dataframe_input, outputs=dataframe_input)

            # The button runs the clustering; its two return values fill the
            # results table and the image, in that order.
            btn_check_similarity.click(
                fn=check_similarity, 
                inputs=dataframe_input, 
                outputs=[results_output, image_output]
            )

# share=True requests a public share link in addition to the local URL.
demo.launch(share=True)




Running on local URL:  http://127.0.0.1:7861


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running on public URL: https://c0d34fda03feca8e91.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
