In [1]:
# !nvidia-smi

In [2]:
#! pip install transformers

In [3]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import RobertaModel, RobertaTokenizer
import tensorflow as tf
from tensorflow import keras

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tf.test.is_built_with_cuda()
import sys
sys.version

'3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]'

In [5]:
class Settings:
    batch_size=320
    max_len=350
    device = "cuda"
    seed = 318

In [6]:
class TrainValidDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.text = df["tweet"].values
        self.target = df["label"].values
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        texts = self.text[idx]
        tokenized = self.tokenizer.encode_plus(texts, truncation=True, add_special_tokens=True,
                                               max_length=self.max_len, padding="max_length")
        ids = tokenized["input_ids"]
        mask = tokenized["attention_mask"]
        targets = self.target[idx]
        return {
            "ids": torch.LongTensor(ids),
            "mask": torch.LongTensor(mask),
            "targets": torch.tensor(targets, dtype=torch.float32)
        }

In [7]:
class CommonLitRoBERTa(nn.Module):
    def __init__(self, pretrained_path):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_path)
        
    def forward(self, ids, mask):
        output = self.roberta(ids, attention_mask=mask)
        return output

In [8]:
model = CommonLitRoBERTa("roberta-base")
model.to(Settings.device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AssertionError: Torch not compiled with CUDA enabled

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tokenizer

In [None]:
datatweet = pd.read_csv("dataset_all_tweet.csv")

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
datatweet.label = le.fit_transform(datatweet.label.values)

In [None]:
datatweet.shape

In [None]:
datatweet.head()

In [None]:
# prepare dataset
# df_train = pd.read_csv("dataset_merge_final.csv")

train_dataset = TrainValidDataset(df, tokenizer, Settings.max_len)
train_loader = DataLoader(train_dataset, batch_size=Settings.batch_size,
                          shuffle=True, num_workers=5, pin_memory=True)

In [None]:
# make mini batch data
batch = next(iter(train_loader))

In [None]:
ids = batch["ids"].to(Settings.device)
mask = batch["mask"].to(Settings.device)
targets = batch["targets"].to(Settings.device)

print(ids.shape)
print(mask.shape)
print(targets.shape)

In [None]:
output = model(ids, mask)
output

In [None]:
# last_hidden_state
last_hidden_state = output[0]
print("shape:", last_hidden_state.shape)

In [None]:
# pooler output
pooler_output = output[1]
print("shape:", pooler_output.shape)

In [None]:
cls_embeddings = last_hidden_state[:, 0, :].detach()

print("shape:", cls_embeddings.shape)
print("")
print(cls_embeddings)

In [None]:
pd.DataFrame(cls_embeddings.numpy()).head()

In [None]:
pd.DataFrame(cls_embeddings.numpy()).shape

In [None]:
last_hidden_state.shape

In [None]:
# apply avg.pooling to word embeddings
# 単語埋め込みベクトルにaverage pooling を適用する
pooled_embeddings = last_hidden_state.detach().mean(dim=1)

print("shape:", pooled_embeddings.shape)
print("")
print(pooled_embeddings)

In [None]:
pd.DataFrame(pooled_embeddings.numpy()).head()

In [None]:
pd.DataFrame(pooled_embeddings.numpy()).shape

In [None]:
pd.DataFrame(pooled_embeddings.numpy()).to_csv("roberta_embeddings.csv")