In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import random

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split

from transformers import BertForSequenceClassification, AdamW,BertTokenizer,get_linear_schedule_with_warmup

from nltk.stem import PorterStemmer
from imblearn.over_sampling import SMOTE

In [24]:
import nlpaug.augmenter.char as nac




In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:
df = pd.read_csv("preprocessed_data.csv")

def map_popularity(value):
    if value == "super positiv":
        return 0
    elif value == "positiv":
        return 1
    elif value == "negativ":
        return 2
    else:
        return 3

df["popularity"] = df["popularity"].apply(map_popularity)

df

Unnamed: 0,video_id,popularity,sentence
0,--14w5SOEUs,0,Channel with title : MigosVEVO has posted vide...
1,--2O86Z0hsM,3,Channel with title : jf.okay has posted video ...
2,--40TEbZ9Is,1,Channel with title : Television Academy has po...
3,--47FjCWgrU,3,Channel with title : NFL has posted video with...
4,--5-brQiQFg,3,Channel with title : NFL has posted video with...
...,...,...,...
44004,zzd4ydafGR0,0,Channel with title : Lil Tjay has posted video...
44005,zziBybeSAtw,1,Channel with title : NBA has posted video with...
44006,zzk09ESX7e0,0,Channel with title : MAMAMOO has posted video ...
44007,zzsIqPVv2Q4,3,Channel with title : MaxCraft has posted video...


In [14]:
text = df.sentence.values
labels = df.popularity.values

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
max_len = 0

for sent in text:
    input_ids = tokenizer.encode(sent, add_special_tokens=True, truncation=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Max sentence length:  512


In [16]:
max_len = 512
input_ids = []
attention_masks = []
aug = nac.OcrAug()

for sentence in text:
    augmented_texts = aug.augment(text,n=3)
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    stemmer = PorterStemmer()

    stemmed_input_ids = [stemmer.stem(token) for token in tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0].tolist())]

    stemmed_input_ids = tokenizer.convert_tokens_to_ids(stemmed_input_ids)

    encoded_dict['input_ids'] = torch.tensor(stemmed_input_ids).unsqueeze(0)

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])


input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

labels = np.array(labels, dtype=object)
labels = np.array(labels, dtype=np.int64)
labels = torch.tensor(labels)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
dataset = TensorDataset(input_ids, attention_masks, labels)

smote = SMOTE(random_state=42)

train_size = int(0.8 * len(dataset))
val_size = int(len(dataset) - train_size)

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

X_train_resampled, y_train_resampled = smote.fit_resample(train_dataset, val_dataset)


print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))


35,207 training samples
8,802 validation samples


In [18]:
batch_size = 8

train_dataloader = DataLoader(
            X_train_resampled,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            y_train_resampled,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )


In [19]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 4,
    output_attentions = False,
    output_hidden_states = False
)

model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
optimizer = AdamW(model.parameters(),
                  lr = 1e-5,
                  eps = 1e-8
                )

In [7]:
epochs = 4

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

NameError: name 'train_dataloader' is not defined

In [8]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [9]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [21]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = {
    'epoch': [],
    'Training Loss': [],
    'Valid. Loss': [],
    'Valid. Accur.': [],
    'Training Time': [],
    'Validation Time': [],
    'Accuracy' : []
}

total_t0 = time.time()

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        optimizer.zero_grad()
        output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()


    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    print("")
    print("Running Validation...")
    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            output= model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()

        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats['epoch'].append(epoch_i+1)
    training_stats['Training Loss'].append(avg_train_loss)
    training_stats['Valid. Loss'].append(avg_val_loss)
    training_stats['Valid. Accur.'].append(avg_val_accuracy)

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 1.20
  Training epcoh took: 0:56:40

Running Validation...
  Accuracy: 0.51
  Validation Loss: 1.12
  Validation took: 0:04:52

Training complete!
Total training took 1:01:39 (h:mm:ss)


In [22]:
training_stats

{'epoch': [1],
 'Training Loss': [1.200403670205823],
 'Valid. Loss': [1.1219933225175232],
 'Valid. Accur.': [0.5110127157129882],
 'Training Time': [],
 'Validation Time': [],
 'Accuracy': []}

In [21]:
model = torch.load('bert_model (2)', map_location=torch.device('cpu'))


In [None]:
predictions = []
ground_truth = []
for batch in validation_dataloader:
  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_labels = batch[2].to(device)

  with torch.no_grad():
      output= model(b_input_ids,
                              token_type_ids=None,
                              attention_mask=b_input_mask)
      logits = output.logits
      logits = logits.detach().cpu().numpy()
      pred_flat = np.argmax(logits, axis=1).flatten()
      label_ids = b_labels.to('cpu').numpy()

      predictions.extend(list(pred_flat))
      ground_truth.extend(list(label_ids.flatten()))

NameError: name 'validation_dataloader' is not defined

In [76]:
from sklearn.metrics import classification_report
print(classification_report(ground_truth,predictions,labels=[0,1,2,3]))

              precision    recall  f1-score   support

           0       0.51      0.60      0.55       265
           1       0.43      0.24      0.31       226
           2       0.54      0.53      0.53       277
           3       0.61      0.76      0.68       232

    accuracy                           0.54      1000
   macro avg       0.52      0.53      0.52      1000
weighted avg       0.52      0.54      0.52      1000

