In [2]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments
import random
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import DataLoader
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from datasets import Dataset

In [3]:
# Setting file paths and method to read all of the reviews in each file
base_dir = "aclImdb"
train_pos_dir = os.path.join(base_dir, "train", "pos")
train_neg_dir = os.path.join(base_dir, "train", "neg")

test_pos_dir = os.path.join(base_dir, "test", "pos")
test_neg_dir = os.path.join(base_dir, "test", "neg")

def read_reviews(path, label):
    examples = []
    for file in os.listdir(path):
        if file.endswith(".txt"):
            file = os.path.join(path, file)
            with open(file, "r", encoding="utf-8") as f:
                examples.append((f.read().strip(), label))
    return examples
        

In [4]:
# positive is 1 and negative is 0 in terms of the label
train_pos = read_reviews(train_pos_dir, 1)
train_neg = read_reviews(train_neg_dir, 0)

test_pos = read_reviews(test_pos_dir, 1)
test_neg = read_reviews(test_neg_dir, 0)

# If there are 12500 in each array then everything transferred properly
print("Lengths of the arrays")
print("Train Positive:", len(train_pos), ",", "Train Negative:", len(train_neg))
print("Test Positive:", len(test_pos), ",", "Test Negative:", len(test_neg))

Lengths of the arrays
Train Positive: 12500 , Train Negative: 12500
Test Positive: 12500 , Test Negative: 12500


In [6]:
# Prep data into train_data and shuffle, split then turn each data point into a way the Transformer can read it
train_data = train_pos + train_neg
test_data = test_pos + test_neg

train_texts = [text for text, label in train_data]
train_labels = [label for text, label, in train_data]

test_texts = [text for text, label in test_data]
test_labels = [label for text, label in test_data]

print("Train data Length:", len(train_data))
print("Test Data Length:", len(test_data))

Train data Length: 25000
Test Data Length: 25000


In [7]:
model = SentenceTransformer("all-mpnet-base-v2")

In [8]:
hf_train = Dataset.from_dict({"text1":train_texts,"text2":train_texts, "label":train_labels})
hf_test = Dataset.from_dict({"text1":test_texts,"text2":test_texts, "label":test_labels})
                            
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=2,
    concatenation_sent_rep=True,
    concatenation_sent_difference=False,
    concatenation_sent_multiplication=False
)

args = SentenceTransformerTrainingArguments(
    output_dir="fine_tuned_sbert_imdb",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    learning_rate=2e-5,
    use_cpu=True,
    dataloader_pin_memory=False,
    fp16=False,
    bf16=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [10]:
hf_train_small=hf_train.shuffle(seed=42).select(range(10000))
hf_test_small=hf_test.shuffle(seed=42).select(range(10000))

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=hf_train_small,
    eval_dataset=hf_test_small,
    loss=train_loss
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Epoch,Training Loss,Validation Loss
1,0.3886,0.269062


TrainOutput(global_step=625, training_loss=0.3685121459960938, metrics={'train_runtime': 8823.0272, 'train_samples_per_second': 1.133, 'train_steps_per_second': 0.071, 'total_flos': 0.0, 'train_loss': 0.3685121459960938, 'epoch': 1.0})

In [11]:
model.save_pretrained("fine_tuned_sbert_imbd")

In [12]:
ft_model = SentenceTransformer("fine_tuned_sbert_imbd")

In [13]:
X_train = ft_model.encode(train_texts, batch_size=64, convert_to_numpy=True, show_progress_bar=True)
X_test = ft_model.encode(test_texts, batch_size=64, convert_to_numpy=True, show_progress_bar=True)

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

In [15]:
clf = LogisticRegression()
clf.fit(X_train, train_labels)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [16]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(test_labels, y_pred))
print("\nClassification Report:\n", classification_report(test_labels, y_pred))

Accuracy: 0.93732

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94     12500
           1       0.93      0.94      0.94     12500

    accuracy                           0.94     25000
   macro avg       0.94      0.94      0.94     25000
weighted avg       0.94      0.94      0.94     25000



In [23]:
print(np.asarray(X_test).shape)
print(np.asarray(test_labels).shape)

(25000, 768)
(25000,)
