In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
import pandas as pd
import torch
import numpy as np
import time
import datetime
import random

In [2]:
"""
if torch.backends.mps.is_available():
    device = torch.device("mps")
    x = torch.ones(1, device=device)
    print (x)
else:
    print ("MPS device not found.")
"""
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU")

GPU is available


In [3]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
df = pd.read_csv('news-clean.csv', index_col=0)

In [6]:
class NewsDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text_clean
        self.targets = dataframe.label

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

In [7]:
dataset = NewsDataset(df,tokenizer)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print(len(train_dataset))
print(len(val_dataset))

40408
4490


In [8]:
batch_size = 16
train_dataloader = DataLoader(
            train_dataset,
            sampler = RandomSampler(train_dataset),
            batch_size = batch_size
        )

validation_dataloader = DataLoader(
            val_dataset,
            sampler = SequentialSampler(val_dataset),
            batch_size = batch_size
        )

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [10]:
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-6
                )

In [15]:
epochs = 2

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [16]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [17]:
training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 50 == 0:
            elapsed = format_time(time.time() - t0)

            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch['ids'].to(device)
        b_input_mask = batch['mask'].to(device)
        # b_type_ids = batch['token_type_ids'].to(device)
        b_labels = batch['targets'].to(device)

        model.zero_grad()

        result = model(b_input_ids,
                       token_type_ids=None,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:

        b_input_ids = batch['ids'].to(device)
        b_input_mask = batch['mask'].to(device)
        # b_type_ids = batch['token_type_ids'].to(device)
        b_labels = batch['targets'].to(device)

        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits

        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch     0  of  2,526.    Elapsed: 0:00:00.
  Batch    50  of  2,526.    Elapsed: 0:01:08.
  Batch   100  of  2,526.    Elapsed: 0:02:21.
  Batch   150  of  2,526.    Elapsed: 0:03:34.
  Batch   200  of  2,526.    Elapsed: 0:04:47.
  Batch   250  of  2,526.    Elapsed: 0:06:00.
  Batch   300  of  2,526.    Elapsed: 0:07:12.
  Batch   350  of  2,526.    Elapsed: 0:08:25.
  Batch   400  of  2,526.    Elapsed: 0:09:38.
  Batch   450  of  2,526.    Elapsed: 0:10:51.
  Batch   500  of  2,526.    Elapsed: 0:12:04.
  Batch   550  of  2,526.    Elapsed: 0:13:16.
  Batch   600  of  2,526.    Elapsed: 0:14:29.
  Batch   650  of  2,526.    Elapsed: 0:15:42.
  Batch   700  of  2,526.    Elapsed: 0:16:55.
  Batch   750  of  2,526.    Elapsed: 0:18:08.
  Batch   800  of  2,526.    Elapsed: 0:19:20.
  Batch   850  of  2,526.    Elapsed: 0:20:33.
  Batch   900  of  2,526.    Elapsed: 0:21:46.
  Batch   950  of  2,526.    Elapsed: 0:22:59.
  Batch 1,000  of  2,526.    Elapsed: 0:24:12.


In [19]:
model.save_pretrained('2-epochs.pt')

In [21]:
test = AutoModelForSequenceClassification.from_pretrained('2-epochs.pt')


In [40]:
test(test_ids, attention_mask=mask)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1145,  0.2283],
        [-0.1066,  0.2197],
        [-0.1019,  0.2126],
        [-0.1036,  0.2160],
        [-0.1015,  0.2059],
        [-0.1058,  0.2053],
        [-0.1082,  0.2129],
        [-0.1129,  0.2224],
        [-0.1092,  0.2333],
        [-0.1042,  0.2134],
        [-0.0958,  0.2206],
        [-0.1029,  0.2184],
        [-0.0972,  0.2135],
        [-0.0958,  0.2020],
        [-0.1069,  0.2231],
        [-0.1029,  0.2254]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [25]:
test_ids = tokenizer.encode_plus(
            "fake news",
            None,
            add_special_tokens=True,
            max_length=512,
            pad_to_max_length=True,
            return_token_type_ids=True
        )['input_ids']



In [38]:
for batch in validation_dataloader:
  test_ids = batch['ids']
  mask = batch['mask']
  break



In [39]:
mask

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])

In [53]:
tokenizer.convert_ids_to_tokens(test_ids[0])

['<s>',
 'Saturday',
 'ĠNight',
 'ĠLive',
 'Ġonce',
 'Ġagain',
 'Ġtook',
 'Ġaim',
 'Ġat',
 'ĠDonald',
 'ĠTrump',
 '.',
 'ĠTrump',
 'Ġonce',
 'Ġagain',
 'Ġthrew',
 'Ġa',
 'Ġhis',
 'sy',
 'Ġfit',
 'Ġabout',
 'Ġit',
 '.',
 'ĠAnd',
 'ĠAlec',
 'ĠBaldwin',
 'Ġonce',
 'Ġagain',
 'Ġresponded',
 'Ġperfectly',
 '.',
 'B',
 'ald',
 'win',
 'Ġs',
 'Ġimperson',
 'ation',
 'Ġof',
 'ĠTrump',
 'Ġhas',
 'Ġbeen',
 'Ġspot',
 '-',
 'on',
 ',',
 'Ġmust',
 'Ġto',
 'Ġthe',
 'Ġdelight',
 'Ġof',
 'ĠSaturday',
 'ĠNight',
 'ĠLive',
 'Ġfans',
 'Ġacross',
 'Ġthe',
 'Ġcountry',
 '.',
 'This',
 'Ġweek',
 ',',
 'ĠBaldwin',
 'Ġreturned',
 'Ġto',
 'Ġlamp',
 'oon',
 'ĠTrump',
 'Ġs',
 'Ġinability',
 'Ġto',
 'Ġfocus',
 'Ġon',
 'Ġthe',
 'Ġmost',
 'Ġimportant',
 'Ġjob',
 'Ġhe',
 'Ġll',
 'Ġever',
 'Ġhave',
 'Ġbecause',
 'Ġhe',
 'Ġrefuses',
 'Ġto',
 'Ġstop',
 'Ġtweeting',
 'Ġinsults',
 'Ġand',
 'Ġpetty',
 'Ġwhining',
 '.',
 'B',
 'ald',
 'win',
 'Ġs',
 'ĠTrump',
 'Ġinterrupted',
 'Ġa',
 'Ġsecurity',
 'Ġbriefing',
 'Ġto',
 'Ġr