# BERT

In [3]:
import torch.nn as nn
import gradio as gr
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Assuming the ParaphraseModel is defined as in your provided code
class ParaphraseModel(nn.Module):
    def __init__(self):
        super(ParaphraseModel, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = ParaphraseModel()
model.load_state_dict(torch.load("best_model_BERT.pth"))  # Load your trained model weights
model.eval()

# Text cleaning and preprocessing functions
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text data
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Function to preprocess text data
def preprocess_text(text):
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word.lower() not in stop_words])
    return text

# Define the function for Gradio interface with text cleaning and preprocessing
def classify_paraphrase_BERT(text1, text2):
    # Clean and preprocess the input texts
    text1_cleaned = preprocess_text(clean_text(text1))
    text2_cleaned = preprocess_text(clean_text(text2))

    inputs = tokenizer.encode_plus(
        text1_cleaned, text2_cleaned,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        return_attention_mask=True
    )

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        loss, logits = model(input_ids, attention_mask)
        probs = torch.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred].item()
    
    return ("True Paraphrase" if pred == 0 else "False Paraphrase", confidence)

# Create the Gradio interface
iface = gr.Interface(
    fn=classify_paraphrase_BERT,
    inputs=[
        gr.inputs.Textbox(lines=5, label="Text 1"),
        gr.inputs.Textbox(lines=5, label="Text 2")
    ],
    outputs=[
        gr.outputs.Textbox(label="Result"),
        gr.outputs.Textbox(label="Confidence")
    ],
    title="Paraphrase Detection",
    description="Enter two sentences to check if they are paraphrases."
)

# Launch the interface
iface.launch(share = True)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


IMPORTANT: You are using gradio version 3.14.0, however version 4.29.0 is available, please upgrade.
--------
Running on local URL:  http://127.0.0.1:7860

Setting up a public link... we have recently upgraded the way public links are generated. If you encounter any problems, please report the issue and downgrade to gradio version 3.13.0
.
Running on public URL: https://0accc2b2-11a0-4a78.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




# ROBERTA

In [2]:
import gradio as gr
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Define the ParaphraseModel class using RoBERTa
class ParaphraseModel(nn.Module):
    def __init__(self):
        super(ParaphraseModel, self).__init__()
        self.bert = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits

# Load the tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = ParaphraseModel()
model.load_state_dict(torch.load("best_model_ROBERTA.pth"))  # Load your trained model weights
model.eval()

# Text cleaning and preprocessing functions
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text data
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Function to preprocess text data
def preprocess_text(text):
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word.lower() not in stop_words and len(word) > 1])
    return text

# Define the function for Gradio interface with text cleaning and preprocessing
def classify_paraphrase_ROBERTA(text1, text2):
    # Clean and preprocess the input texts
    text1_cleaned = preprocess_text(clean_text(text1))
    text2_cleaned = preprocess_text(clean_text(text2))

    inputs = tokenizer.encode_plus(
        text1_cleaned, text2_cleaned,
        max_length=128,
        add_special_tokens = True,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        return_attention_mask=True
    )

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        loss, logits = model(input_ids, attention_mask)
        probs = torch.softmax(logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred].item()
    
    return ("True Paraphrase" if pred == 0 else "False Paraphrase", confidence)

# Create the Gradio interface
iface = gr.Interface(
    fn=classify_paraphrase_ROBERTA,
    inputs=[
        gr.inputs.Textbox(lines=5, label="Text 1"),
        gr.inputs.Textbox(lines=5, label="Text 2")
    ],
    outputs=[
        gr.outputs.Textbox(label="Result"),
        gr.outputs.Textbox(label="Confidence")
    ],
    title="Paraphrase Detection",
    description="Enter two sentences to check if they are paraphrases."
)

# Launch the interface
# iface.launch(share=True)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


IMPORTANT: You are using gradio version 3.14.0, however version 4.29.0 is available, please upgrade.
--------


In [4]:
# Test the model with provided examples
test_data = [
    ("A BMI of 25 or above is considered overweight; 30 or above is considered obese.", "A BMI between 18.5 and 24.9 is considered normal, over 25 is considered overweight and 30 or greater is defined as obese.", 0),
    ("The dollar was at 116.92 yen against the yen , flat on the session, and at 1.2891 against the Swiss franc , also flat.", "The dollar was at 116.78 yen JPY= , virtually flat on the session, and at 1.2871 against the Swiss franc CHF= , down 0.1 percent.", 0),
    ("Six months ago, the IMF and Argentina struck a bare-minimum $6.8-billion debt rollover deal that expires in August.", "But six months ago, the two sides managed to strike a $6.8-billion debt rollover deal, which expires in August.", 1),
    ("Inhibited children tend to be timid with new people, objects, and situations, while uninhibited children spontaneously approach them.", "Simply put, shy individuals tend to be more timid with new people and situations.", 1),
    ("I wanted to bring the most beautiful people into the most beautiful building, he said Sunday inside the Grand Central concourse.", "\"I wanted to bring the most beautiful people into the most beautiful building,\" Tunick said Sunday.", 1),
    ("The broad Standard & Poor's 500 <.SPX> fell 10.75 points, or 1.02 percent, to 1,039.32.", "The S&P 500 index was up 1.26, or 0.1 percent, to 1,039.32 after sinking 10.75 yesterday.", 0),
    ("Duque will return to Earth Oct. 27 with the station's current crew, U.S. astronaut Ed Lu and Russian cosmonaut Yuri Malenchenko.", "Currently living onboard the space station are American astronaut Ed Lu and Russian cosmonaut Yuri Malenchenko.", 0),
    ("Singapore is already the United States' 12th-largest trading partner, with two-way trade totaling more than $34 billion.", "Although a small city-state, Singapore is the 12th-largest trading partner of the United States, with trade volume of $33.4 billion last year.", 1),
    ("The AFL-CIO is waiting until October to decide if it will endorse a candidate.", "The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries.", 1),
    ("No dates have been set for the civil or the criminal trial.", "No dates have been set for the criminal or civil cases, but Shanley has pleaded not guilty.", 0),
    ("The largest gains were seen in prices, new orders, inventories and exports.", "Sub-indexes measuring prices, new orders, inventories and exports increased.", 1),
    ("Trading in Loral was halted yesterday; the shares closed on Monday at $3.01.", "The New York Stock Exchange suspended trading yesterday in Loral, which closed at $3.01 Friday.", 1),
    ("Earnings per share from recurring operations will be 13 cents to 14 cents.", "That beat the company's April earnings forecast of 8 to 9 cents a share.", 0),
    ("He plans to have dinner with troops at Kosovo's U.S. military headquarters, Camp Bondsteel.", "After that, he plans to have dinner at Camp Bondsteel with U.S. troops stationed there.", 1),
    ("Retailers J.C. Penney Co. Inc. (JCP) and Walgreen Co. (WAG) kick things off on Monday.", "Retailers J.C. Penney Co. Inc. JCP.N and Walgreen Co. WAG.N kick things off on Monday.", 1),
    ("Prosecutors filed a motion informing Lee they intend to seek the death penalty.", "He added that prosecutors will seek the death penalty.", 1),
    ("Last year the court upheld Cleveland's school voucher program, ruling 5-4 that vouchers are constitutional if they provide parents a choice of religious and secular schools.", "Last year, the court ruled 5-4 in an Ohio case that government vouchers are constitutional if they provide parents with choices among a range of religious and secular schools.", 0),
    ("He beat testicular cancer that had spread to his lungs and brain.", "Armstrong, 31, battled testicular cancer that spread to his brain.", 0),
    ("Sorkin, who faces charges of conspiracy to obstruct justice and lying to a grand jury, was to have been tried separately.", "Sorkin was to have been tried separately on charges of conspiracy and lying to a grand jury.", 1),
    ("Graves reported from Albuquerque, Villafranca from Austin and Ratcliffe from Laredo.", "Pete Slover reported from Laredo and Gromer Jeffers from Albuquerque.", 0),
    ("The US chip market is expected to decline 2.1 percent this year, then grow 15.7 percent in 2004.", "The Americas market will decline 2.1 percent to $30.6 billion in 2003, and then grow 15.7 percent to $35.4 billion in 2004.", 1),
    ("The group will be headed by State Department official John S. Wolf, who has served in Australia, Vietnam, Greece and Pakistan.", "The group will be headed by John S. Wolf, an assistant secretary of state who has served in Australia, Vietnam, Greece and Pakistan.", 0),
    ("The commission must work out the plan's details, but the average residential customer paying $840 a year would get a savings of about $30 annually.", "An average residential customer paying $840 a year for electricity could see a savings of $30 annually.", 1),
    ("The company has said it plans to restate its earnings for 2000 through 2002.", "The company had announced in January that it would have to restate earnings for 2002, 2001 and perhaps 2000.", 1),
    ("Results from No. 2 U.S. soft drink maker PepsiCo Inc. PEP.N were likely to be in the spotlight.", "Results from No. 2 U.S. soft drink maker PepsiCo Inc. (nyse: PEP - news - people) were likely to be in the spotlight.", 1),
    ("The result is an overall package that will provide significant economic growth for our employees over the next four years.", "\"The result is an overall package that will provide a significant economic growth for our employees over the next few years,\" he said.", 1),
    ("Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed.", "It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status.", 1),
    ("The songs are on offer for 99 cents each, or $9.99 for an album.", "The company will offer songs for 99 cents and albums for $9.95.", 1),
    ("However, the talk was downplayed by PBL which said it would focus only on smaller purchases that were immediately earnings and cash flow accretive.", "The talk, however,has been downplayed by PBL which said it would focus only on smaller purchases that were immediately earnings and cash flow-accretive.", 1),
    ("Comcast Class A shares were up 8 cents at $30.50 in morning trading on the Nasdaq Stock Market.", "The stock rose 48 cents to $30 yesterday in Nasdaq Stock Market trading.", 1)
]

for text1, text2, true_label in test_data:
    prediction, confidence = classify_paraphrase_ROBERTA(text1, text2)
    print(f"Text 1: {text1}\nText 2: {text2}\nTrue Label: {'True Paraphrase' if true_label == 0 else 'False Paraphrase'}")
    print(f"Prediction: {prediction}, Confidence: {confidence:.4f}\n")

Text 1: A BMI of 25 or above is considered overweight; 30 or above is considered obese.
Text 2: A BMI between 18.5 and 24.9 is considered normal, over 25 is considered overweight and 30 or greater is defined as obese.
True Label: True Paraphrase
Prediction: False Paraphrase, Confidence: 0.5239

Text 1: The dollar was at 116.92 yen against the yen , flat on the session, and at 1.2891 against the Swiss franc , also flat.
Text 2: The dollar was at 116.78 yen JPY= , virtually flat on the session, and at 1.2871 against the Swiss franc CHF= , down 0.1 percent.
True Label: True Paraphrase
Prediction: True Paraphrase, Confidence: 0.7190

Text 1: Six months ago, the IMF and Argentina struck a bare-minimum $6.8-billion debt rollover deal that expires in August.
Text 2: But six months ago, the two sides managed to strike a $6.8-billion debt rollover deal, which expires in August.
True Label: False Paraphrase
Prediction: False Paraphrase, Confidence: 0.9101

Text 1: Inhibited children tend to be t