In [2]:
import numpy as np
import random
import torch
import os

def seed_everything(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(21)

In [3]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64

In [4]:
# preprocessing text

import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopw = stopwords.words('english')
from nltk.stem import PorterStemmer

from tqdm import tqdm
import unicodedata
import contractions
from bs4 import BeautifulSoup

def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    [s.extract() for s in soup(['iframe', 'script'])]
    stripped_text = soup.get_text()
    stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def pre_process_corpus(docs):
    norm_docs = []
    for doc in tqdm(docs):
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))
        doc = doc.lower()
        doc = remove_accented_chars(doc)
        doc = contractions.fix(doc)
        # remove special characters\whitespaces
        doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
        doc = re.sub(' +', ' ', doc)
        # remove stop words
        #doc = ' '.join(word for word in str(doc).split() if word not in stopw)  
        doc = doc.strip()
        #stemmer = PorterStemmer()
        #doc = [stemmer.stem(word) for word in doc.split()]
        # Join the stemmed words back into a single string
        #doc = ' '.join(doc)
        norm_docs.append(doc)

    return norm_docs

train_data['cleaned_text'] = pre_process_corpus(train_data['text'])
print("Completed pre-processing train texts...")

test_data['cleaned_text'] = pre_process_corpus(test_data['text'])
print("Completed pre-processing test texts...")

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


100%|██████████| 6000/6000 [00:01<00:00, 4416.02it/s]


Completed pre-processing train texts...


100%|██████████| 1801/1801 [00:00<00:00, 4243.99it/s]

Completed pre-processing test texts...





In [5]:
# https://huggingface.co/docs/transformers/model_doc/roberta 

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoTokenizer, DistilBertTokenizer
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# define tokenizer

#tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', max_length=512)

#model_name = "nghuyong/ernie-2.0-base-en"
#tokenizer = AutoTokenizer.from_pretrained(model_name)

#tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', max_length = 512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
!pip install datasets



In [7]:
from datasets import Dataset, DatasetDict

train_data['stars'] = train_data['stars'] - 1

raw_datasets = DatasetDict({
    "train": Dataset.from_pandas(train_data),
    "eval": Dataset.from_pandas(test_data)
})

# Check the datasets
print("Dataset Dict:\n", raw_datasets)
print("\n\nTrain's features:\n", raw_datasets["train"].features)
print("\n\nFirst row of Train:\n", raw_datasets["train"]['cleaned_text'][0])

Dataset Dict:
 DatasetDict({
    train: Dataset({
        features: ['text', 'stars', 'cleaned_text'],
        num_rows: 6000
    })
    eval: Dataset({
        features: ['ID', 'text', 'cleaned_text'],
        num_rows: 1801
    })
})


Train's features:
 {'text': Value(dtype='string', id=None), 'stars': Value(dtype='int64', id=None), 'cleaned_text': Value(dtype='string', id=None)}


First row of Train:
 will never come back horrible service nasty boba drink i used to live in california and had always gone to tea station the tea station in vegas is just horrible my boyfriend and i went to tea station last week we had a peppermint milk tea w coffee jelly and a wheat germ milk tea w boba the bill is 1405 seriously 2 drinks for 14and the drink is not even good the waitress has some attitude problems and she messed up our order our waitress is so rude and the drink is so watery she literally just threw our drinks on the table and she was so impatient while taking our orders i mean i have 

In [8]:
# Tokenize the text, and truncate the text if it exceed the tokenizer maximum length. Batched=True to tokenize multiple texts at the same time.
tokenized_datasets = raw_datasets.map(lambda dataset: tokenizer(dataset['cleaned_text'], truncation=True), batched=True)

print(tokenized_datasets)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'stars', 'cleaned_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6000
    })
    eval: Dataset({
        features: ['ID', 'text', 'cleaned_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1801
    })
})


In [9]:
# Check the first row
print(tokenized_datasets["train"]['cleaned_text'][2])

so right away if i go into a buffet setting and there are signs indicating for you to eat everything that you take right away to me that means they are having issues with food costs and their business is not doing too wellnnsome things on a buffet i do not expect much quality sushi on a buffet i expect to be mediocre dumplings i expect to be frozen ones that are steameddeep fried here the sushi was atrocious i do not think i have ever had worse sushi anywhere the dumplings were ok but it is hard to screw up reheating dumplingsnnthe hibachimongolian bbq service was good and would be the only reason for me to come back as that is something i missed from a place i used to go to often in wausaunnhowever if i cannot finish what is on my plate i do not think a restaurant should be upset with me for it cut down how much crap you have out there that no one touches or is just plain bad especially the abysmal sushi and watch your profits pick back up


In [10]:
# Rename "stars" column to "labels" only in the train split
tokenized_datasets["train"] = tokenized_datasets["train"].rename_column("stars", "labels")

print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'cleaned_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6000
    })
    eval: Dataset({
        features: ['ID', 'text', 'cleaned_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1801
    })
})


In [11]:
tokenized_datasets["train"]['labels'][2]

1

In [12]:
!pip -q install evaluate

In [13]:
pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.27.2
    Uninstalling accelerate-0.27.2:
      Successfully uninstalled accelerate-0.27.2
Successfully installed accelerate-0.28.0
Note: you may need to restart the kernel to use updated packages.


In [14]:
# On the Stability of Fine-Tuning BERT: Misconceptions, Explanations, and Strong Baselines https://arxiv.org/pdf/2006.04884.pdf 
# reference paper for hyperparameter fine tuning for pretrained models
# I used RoBERTa with hyper parameters chosen from the research paper 
# https://huggingface.co/docs/transformers/en/training

from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, BertModel
from transformers import get_linear_schedule_with_warmup, EarlyStoppingCallback
import evaluate
from torch.nn import CrossEntropyLoss

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

num_labels=3

#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

#model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Training args
training_args = TrainingArguments(
    "/kaggle/working/test-trainer",
    num_train_epochs=3,
    evaluation_strategy="epoch",
    weight_decay=0.1,
    save_strategy="no",
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    #gradient_accumulation_steps = 16,
    per_device_eval_batch_size= 16,
    disable_tqdm = False,
    warmup_steps=0,
    logging_steps = 8,
    fp16 = True,
    logging_dir='/kaggle/working/logs',
    #dataloader_num_workers = 8,
)

'''training_args = TrainingArguments(
    output_dir="/kaggle/working/test-trainer",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    report_to="none"
)'''

# Get total number of training steps
total_steps = len(tokenized_datasets["train"]) // training_args.per_device_train_batch_size * training_args.num_train_epochs

# Metric for validation error
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc") # F1 and Accuracy
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Customize the training loop to compute the loss
def compute_loss(model, inputs):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    return loss_function(logits, labels)

'''def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }'''

# loss function

loss_function = CrossEntropyLoss()


# Define trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
    )

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay, eps = 1e-6, betas=(0.9, 0.98))

warmup_ratio = 0.1

# Calculate number of warmup steps
warmup_steps = int(total_steps * warmup_ratio)

# Initialize scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Set scheduler to Trainer
trainer.scheduler = scheduler

trainer.compute_loss = compute_loss

2024-03-16 15:38:59.712372: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-16 15:38:59.712494: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-16 15:38:59.855775: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7437,No log
2,0.5882,No log
3,0.5019,No log


TrainOutput(global_step=1125, training_loss=0.6277527058919271, metrics={'train_runtime': 860.2059, 'train_samples_per_second': 20.925, 'train_steps_per_second': 1.308, 'total_flos': 3780569809507680.0, 'train_loss': 0.6277527058919271, 'epoch': 3.0})