In [1]:
!pip install datasets transformers evaluate accelerate umap-learn umap-learn[plot]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m112.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [9

In [2]:
from google.colab import drive
# mount drive to access data
drive.mount('/content/drive')
# load data folder into working directory
!cp -r drive/MyDrive/data .
!cp -r drive/MyDrive/bert_emotion .

Mounted at /content/drive


In [37]:

from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
 AdamW, TrainingArguments, Trainer
import numpy as np
import evaluate
import torch
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import tqdm

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
# Read the preprocessed tweets from the copied data folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/cleaned_tweets.csv')

In [23]:
# Keep the timestamps aside, then narrow the frame to the tweet text only.
# Order matters: `Timestamp` has to be read while `df` still has that column;
# indexing after the narrowing would raise a KeyError.
dates = df['Timestamp']
# Double brackets keep `df` a one-column DataFrame rather than a Series, which
# is what `Dataset.from_pandas` and the later `rename_column("TweetText", ...)`
# in `return_dataset` operate on.
# NOTE: this cell overwrites `df`; re-run the CSV loading cell before
# re-running it.
df = df[['TweetText']]

### Prepare Input for BERT

In [7]:
# Load the tokenizer of the "bert-base-uncased" checkpoint; `tokenize_function`
# uses this module-level `tokenizer`.
# NOTE(review): presumably this matches the vocabulary the `bert_emotion`
# model was trained with - confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of ``examples`` with the notebook tokenizer.

    Inputs are truncated and padded with ``padding="max_length"``, and the
    tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [9]:
def return_dataset(df, select=None):
  """Turn a DataFrame of tweets into a tokenized, torch-formatted ``Dataset``.

  Args:
    df: DataFrame with a ``TweetText`` column.
    select: when not ``None``, only the first ``select`` rows are kept.

  Returns:
    The dataset with ``TweetText`` renamed to ``text``, the outputs of
    ``tokenize_function`` added, and its format set to ``"torch"``.
  """
  tweets = Dataset.from_pandas(df)

  # Optionally restrict to the leading rows before any tokenization work.
  if select is not None:
    tweets = tweets.select(range(select))

  renamed = tweets.rename_column("TweetText", "text")
  tokenized = renamed.map(tokenize_function, batched=True)

  # set_format works in place, so it cannot be chained with the calls above.
  tokenized.set_format("torch")
  return tokenized

In [10]:
# Build the tokenized dataset from the whole frame (`select` is left at its
# default of None, so no rows are dropped).
dataset = return_dataset(df)

Map:   0%|          | 0/561730 [00:00<?, ? examples/s]

Load our pretrained model.

In [54]:
# Load the sequence-classification model from the local `bert_emotion`
# directory (copied from Drive into the working directory earlier).
model = AutoModelForSequenceClassification.from_pretrained("bert_emotion")

In [55]:
# Turn off gradient tracking on every model parameter.
for weight in model.parameters():
  weight.requires_grad_(False)

In [65]:
# Serve the dataset in its original order, 128 rows per batch, using four
# worker processes and without pinned host memory.
loader = DataLoader(
    dataset,
    batch_size=128,
    shuffle=False,
    num_workers=4,
    pin_memory=False,
)

In [66]:
# Move the model to the device selected earlier (GPU when available, else CPU)
# instead of hard-coding 'cuda', so the notebook also runs on a CPU runtime and
# the model ends up where the inference loop sends its inputs.
model = model.to(device)

In [67]:
# Inference only: eval mode disables dropout so predictions are deterministic
# (train mode would leave dropout active and randomly perturb the outputs).
model.eval()
batch_predictions = []

with torch.no_grad():
  for inputs in tqdm.tqdm(loader):
    # Inputs are padded to max_length, so the attention mask is passed along
    # with the ids to keep the model from attending to padding tokens.
    outputs = model(
        input_ids=inputs['input_ids'].to(device),
        attention_mask=inputs['attention_mask'].to(device),
    )
    # Sigmoid turns each logit into a probability; rounding thresholds it at
    # 0.5 into a 0/1 prediction.
    probabilities = torch.sigmoid(outputs.logits)
    batch_predictions.append(torch.round(probabilities))

# Concatenate once at the end instead of re-copying every earlier prediction
# on each batch; an empty loader still yields an empty tensor on `device`.
if batch_predictions:
  predictions = torch.cat(batch_predictions)
else:
  predictions = torch.empty(0, device=device)

  0%|          | 0/4389 [00:00<?, ?it/s]
