In [2]:
# Download required NLTK resources and import libraries

import pandas as pd
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Lambda
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader, Dataset, TensorDataset
import matplotlib.pyplot as plt

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

### Logistic Regression

In [3]:
# Load the dataset

train_df = pd.read_csv('/content/train.tsv', sep='\t')
val_df = pd.read_csv('/content/validation.tsv', sep='\t')
test_df = pd.read_csv('/content/test.tsv', sep='\t')

# Combine the train and the validation
train_df = pd.concat([train_df, val_df], ignore_index=True)

# Preprocess the text data
def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    preprocessed_tokens = [token.lower() for token in tokens if token.isalpha()]
    return ' '.join(preprocessed_tokens)

train_df['preprocessed_sentence1'] = train_df['sentence1'].apply(preprocess_text)
train_df['preprocessed_sentence2'] = train_df['sentence2'].apply(preprocess_text)
test_df['preprocessed_sentence1'] = test_df['sentence1'].apply(preprocess_text)
test_df['preprocessed_sentence2'] = test_df['sentence2'].apply(preprocess_text)

# Create feature vectors using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['preprocessed_sentence1'] + ' ' + train_df['preprocessed_sentence2'])
X_test = vectorizer.transform(test_df['preprocessed_sentence1'] + ' ' + test_df['preprocessed_sentence2'])

# Scale the dense feature matrices (use with_mean=False to avoid centering sparse matrices)
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Prepare the target variable
y_train = train_df['label']
y_test = test_df['label']

# Train a logistic regression model
lr_model = LogisticRegression(max_iter=5000)  # Increase the number of iterations
lr_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test_scaled)

# Evaluate the model
lr_accuracy = round(accuracy_score(y_test, y_pred),4)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.5119


### Siamese NN

In [4]:
# Configure GPU memory growth
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Load the dataset
train_df = pd.read_csv('/content/train.tsv', sep='\t')
val_df = pd.read_csv('/content/validation.tsv', sep='\t')
test_df = pd.read_csv('/content/test.tsv', sep='\t')

# Combine the train and the validation
train_df = pd.concat([train_df, val_df], ignore_index=True)

# Preprocess the text data
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Apply text preprocessing to the dataset
train_df['preprocessed_sentence1'] = train_df['sentence1'].apply(preprocess_text)
train_df['preprocessed_sentence2'] = train_df['sentence2'].apply(preprocess_text)
test_df['preprocessed_sentence1'] = test_df['sentence1'].apply(preprocess_text)
test_df['preprocessed_sentence2'] = test_df['sentence2'].apply(preprocess_text)

# Create vocabulary and tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['preprocessed_sentence1'] + train_df['preprocessed_sentence2'])

vocab_size = len(tokenizer.word_index) + 1

# Convert sentences to sequences
train_seq1 = tokenizer.texts_to_sequences(train_df['preprocessed_sentence1'])
train_seq2 = tokenizer.texts_to_sequences(train_df['preprocessed_sentence2'])
test_seq1 = tokenizer.texts_to_sequences(test_df['preprocessed_sentence1'])
test_seq2 = tokenizer.texts_to_sequences(test_df['preprocessed_sentence2'])

# Pad sequences
max_seq_length = 50
train_seq1 = pad_sequences(train_seq1, maxlen=max_seq_length, padding='post')
train_seq2 = pad_sequences(train_seq2, maxlen=max_seq_length, padding='post')
test_seq1 = pad_sequences(test_seq1, maxlen=max_seq_length, padding='post')
test_seq2 = pad_sequences(test_seq2, maxlen=max_seq_length, padding='post')

# Prepare the target variable
y_train = train_df['label']
y_test = test_df['label']

# Siamese neural network model
embedding_dim = 100
lstm_units = 64

input1 = Input(shape=(max_seq_length,))
input2 = Input(shape=(max_seq_length,))

# Embedding layer to convert words to dense vectors
embedding_layer = Embedding(vocab_size, embedding_dim)

# LSTM layer to process sequences
lstm_layer = LSTM(lstm_units)

# Process inputs through embedding and LSTM layers
encoded1 = lstm_layer(embedding_layer(input1))
encoded2 = lstm_layer(embedding_layer(input2))

# Calculate absolute difference between encoded vectors
merged = Lambda(lambda x: abs(x[0] - x[1]))([encoded1, encoded2])

# Predict the probability of paraphrase
preds = Dense(1, activation='sigmoid')(merged)

# Create the Siamese model
siamese_model = Model(inputs=[input1, input2], outputs=preds)
siamese_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Training
print("Training started...")
siamese_model.fit([train_seq1, train_seq2], y_train, epochs=50, batch_size=64, verbose=1)

# Testing
print("Testing started...")
y_pred = siamese_model.predict([test_seq1, test_seq2])

# Convert predicted probabilities to binary predictions
y_pred = [1 if pred > 0.5 else 0 for pred in y_pred]

# Evaluate the model
snn_accuracy = round(accuracy_score(y_test, y_pred),4)
print("Siamese NN Accuracy:", snn_accuracy)

Training started...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Testing started...
Siamese NN Accuracy: 0.6105


### DistilBERT

In [None]:
# Load the training and test data
train_df = pd.read_csv('/content/train.tsv', sep='\t')
val_df = pd.read_csv('/content/validation.tsv', sep='\t')
test_df = pd.read_csv('/content/test.tsv', sep='\t')

# Combine the train and the validation
train_df = pd.concat([train_df, val_df], ignore_index=True)

# Preprocess the data
X_train = train_df[['sentence1', 'sentence2']]
y_train = train_df['label']
X_test = test_df[['sentence1', 'sentence2']]
y_test = test_df['label']

# Define a custom dataset for PyTorch
class ParaphraseDataset(Dataset):
    def __init__(self, tokenizer, sentences1, sentences2, labels):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sentence1 = self.sentences1.iloc[idx]
        sentence2 = self.sentences2.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens=True,
            return_tensors="pt",
            padding="max_length",
            max_length=128,
            truncation=True
        )

        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": label
        }

# Initialize the tokenizer and model
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Move the model to the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
distilbert_model.to(device)

# Create DataLoader objects for the training and test datasets with increased batch size
train_dataset = ParaphraseDataset(distilbert_tokenizer, X_train["sentence1"], X_train["sentence2"], y_train)
test_dataset = ParaphraseDataset(distilbert_tokenizer, X_test["sentence1"], X_test["sentence2"], y_test)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Fine-tuning and Training the model
distilbert_model.train()
optimizer = AdamW(distilbert_model.parameters(), lr=2e-5)

num_epochs = 10

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        outputs = distilbert_model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch + 1} Loss: {running_loss / len(train_dataloader)}")


# Save the fine-tuned model
model_save_path = "distilbert_model.pth"
# from google.colab import files
# files.download('distilbert_model.pth')
torch.save(distilbert_model.state_dict(), model_save_path)

# Load the fine-tuned model
distilbert_model.load_state_dict(torch.load(model_save_path))
distilbert_model.eval()

# Testing the model
y_true = []
y_pred = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = distilbert_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(logits.argmax(1).cpu().numpy())

# Convert predictions from numerical to binary labels (0 or 1)
y_pred = [1 if pred == 1 else 0 for pred in y_pred]

# Compute performance metrics
db_accuracy = round(accuracy_score(y_true, y_pred),4)
print("DistilBERT Accuracy:", db_accuracy)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1:   0%|          | 15/3588 [00:02<11:10,  5.33it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1:  29%|██▊       | 1027/3588 [03:28<08:26,  5.06it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1:  50%|████▉     | 1787/3588 [05:55<06:01,  4.98it/s]Be aware, overflo

Epoch 1 Loss: 0.5498794106291057


Epoch 2:  16%|█▋        | 591/3588 [01:53<09:20,  5.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:  20%|█▉        | 701/3588 [02:14<09:01,  5.34it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:  23%|██▎       | 829/3588 [02:39<08:39,  5.31it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2:  70%|██████▉   | 2498/3588 [08:00<03:43,  4.87it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. s