In [22]:
import pandas as pd

In [23]:
train_data_save_path = "/Users/sabrina/Computational Social Science/final-project/extracted/SBIC.v2.trn.csv"
dev_data_save_path = "/Users/sabrina/Computational Social Science/final-project/extracted/SBIC.v2.dev.csv"
test_data_save_path = "/Users/sabrina/Computational Social Science/final-project/extracted/SBIC.v2.tst.csv"

In [24]:
from process_data import read_tgz_data

train_data = read_tgz_data(train_data_save_path)
dev_data = read_tgz_data(dev_data_save_path)
test_data = read_tgz_data(test_data_save_path)

In [25]:
import re

def get_context_str(category, target_group, target_stereotype):
    category, target_group, target_stereotype = str(category), str(target_group), str(target_stereotype)

    if target_stereotype.startswith(target_group):
        return " ".join((category, ":", target_stereotype))
    else:
        return " ".join((category, ":", target_group, target_stereotype))

def normalize_spacing(input):
    cleaned_string = input.strip()
    cleaned_string = ' '.join(cleaned_string.split())
    return cleaned_string

def remove_html_entities(text):
    return re.sub(r"&#[0-9]+;", "", text)

def remove_rt_username(text):
    return re.sub(r"RT @\w+\s*:", "", text)

def remove_beginning_and_ending_tags(text):
    text = re.sub(r"^(?:@[A-Za-z0-9_]+ )+", "", text)
    text = re.sub(r"(?: @[A-Za-z0-9_]+)+$", "", text)
    text = re.sub(r"^(?:@[A-Za-z0-9_]+[^\w\s]? )+", "", text)
    text = re.sub(r"^(?:\.\s*)?(?:@[A-Za-z0-9_]+ )+", "", text)
    return text

def remove_start_html(text):
    return re.sub(r"&gt;", "", text)

def remove_emojis(text):
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"
                               "\U0001F300-\U0001F5FF"
                               "\U0001F680-\U0001F6FF"
                               "\U0001F1E0-\U0001F1FF"
                               "]+", flags=re.UNICODE)
    return re.sub(emoji_pattern, "", text)

def clean_post(post):
    post = remove_html_entities(post)
    post = remove_rt_username(post)
    post = remove_beginning_and_ending_tags(post)
    post = remove_emojis(post)
    post = remove_start_html(post)
    post = normalize_spacing(post)
    post = re.sub(r"@", "", post)
    post = re.sub(r"#", "", post)
    return post

def merge_same_posts(posts):
    first_row = posts.iloc[0]
    posts = posts.fillna("nan")
    implications = posts["targetStereotype"].tolist()
    targeted_groups = posts["targetMinority"].tolist()
    targeted_categories = posts["targetCategory"].tolist()
    
    contexts = list(zip(targeted_categories, targeted_groups, implications))
    nan_context = ("nan", "nan", "nan")
    filtered_contexts = [context for context in contexts if context != nan_context]
    contexts = [get_context_str(tcat, tgroup, implication) for (tcat, tgroup, implication) in filtered_contexts]
    # print(f"Contexts: {contexts}")
    first_row["context"] = contexts

    raw_post = first_row["post"]
    first_row["post"] = clean_post(raw_post)
    return first_row



In [26]:
merged_train = train_data.groupby("post").apply(merge_same_posts).reset_index(drop = True)
merged_dev = dev_data.groupby("post").apply(merge_same_posts).reset_index(drop = True)
merged_test = test_data.groupby("post").apply(merge_same_posts).reset_index(drop = True)

  merged_train = train_data.groupby("post").apply(merge_same_posts).reset_index(drop = True)
  merged_dev = dev_data.groupby("post").apply(merge_same_posts).reset_index(drop = True)
  merged_test = test_data.groupby("post").apply(merge_same_posts).reset_index(drop = True)


In [27]:
# Save merged
merged_train_savepath = "/Users/sabrina/Computational Social Science/final-project/data/merged/merged_train.csv"
merged_dev_savepath = "/Users/sabrina/Computational Social Science/final-project/data/merged/merged_dev.csv"
merged_test_savepath = "/Users/sabrina/Computational Social Science/final-project/data/merged/merged_test.csv"

merged_train.to_csv(merged_train_savepath, index=False)
merged_dev.to_csv(merged_dev_savepath, index=False)
merged_test.to_csv(merged_test_savepath, index=False)

In [28]:
# Save with impl
merged_train_impl_savepath = "/Users/sabrina/Computational Social Science/final-project/data/with_impl/merged_train.csv"
merged_dev_impl_savepath = "/Users/sabrina/Computational Social Science/final-project/data/with_impl/merged_dev.csv"
merged_test_impl_savepath = "/Users/sabrina/Computational Social Science/final-project/data/with_impl/merged_test.csv"

merged_train_impl = merged_train[merged_train["context"].map(len) > 0]
merged_dev_impl = merged_dev[merged_dev["context"].map(len) > 0]
merged_test_impl = merged_test[merged_test["context"].map(len) > 0]

merged_train_impl.to_csv(merged_train_impl_savepath, index=False)
merged_dev_impl.to_csv(merged_dev_impl_savepath, index=False)
merged_test_impl.to_csv(merged_test_impl_savepath, index=False)

In [134]:
# Testing to see tokenizer outputs
import torch
from transformers import FunnelTokenizer

tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")

# Tests
texts = ["Hello, world!", "Here is a test string!"]

tokenized_output = tokenizer.batch_encode_plus(texts, padding=True, truncation=True, return_tensors="pt")
print(tokenized_output)



{'input_ids': tensor([[ 101, 7592, 1010, 2088,  999,  102,    0,    0],
        [ 101, 2182, 2003, 1037, 3231, 5164,  999,  102]]), 'token_type_ids': tensor([[2, 0, 0, 0, 0, 0, 0, 0],
        [2, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])}


In [94]:
import torch
from transformers import FunnelTokenizer, FunnelModel

tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")
model = FunnelModel.from_pretrained("funnel-transformer/small")

In [71]:
import numpy as np
from transformers import FunnelTokenizer, FunnelModel

tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")

# Tokenize post helper function
def tokenize_post(row):
    context_str = " <sep> ".join(row["context"]) + " <sep>"
    post_str = "<cls> " + row["post"]
    input_str = post_str + context_str
    
    post_data = row["post"]
    # print(f"Input str: {input_str}")
    tokenized_output = tokenizer(input_str, padding=True, add_special_tokens=False, truncation=True, return_tensors="pt")
    
    input_ids = tokenized_output["input_ids"].numpy()
    attention_mask = tokenized_output["attention_mask"].numpy()
    token_type_ids = tokenized_output["token_type_ids"].numpy()
    
    tokenized_post = tokenizer(post_data, padding=True, truncation=True, return_tensors="pt")
    target_labels = tokenized_post["input_ids"].numpy()
    # token_type_ids_np = np.zeros_like(tokenized_output["token_type_ids"])
    
    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask,
        "target_labels": target_labels
    }

def t5_tokenize_post_reconstruction_prompt(row):
    summary = row["summary"]
    context = row["context"]
    input_prompt = f"Based on this summary: <{summary}>, and the implications of the post <{context}>, reconstruct the post."
    tokenized_output = tokenizer.encode(input_prompt, return_tensors="pt")
    
    return tokenized_output.numpy()

# Contexts is a list of strings
# def tokenize_context(contexts):
#     input_ids = []
#     attention_masks = []
#     for context in contexts:
#         tokenized_output = tokenizer(context, padding=True, truncation=True, return_tensors="pt")
#         context_tokens.append((tokenized_output["input_ids"], tokenized_output["token_type_ids"], tokenized_output["attention_mask"]))
    



In [103]:
## FOR FUNNEL TRANSFORMER ###

# Tokenize and add to df
train_tokens = merged_train.apply(tokenize_post, axis = 1)
train_tokens_df = pd.DataFrame(train_tokens.tolist(), columns=["input_ids", "token_type_ids", "attention_mask", "target_labels"])
train_tokenized_df = pd.concat([merged_train.reset_index(drop=True), train_tokens_df.reset_index(drop=True)], axis=1)

print(train_tokenized_df.columns.tolist())

# Tokenize and add to df
dev_tokens = merged_dev.apply(tokenize_post, axis = 1)
dev_tokens_df = pd.DataFrame(dev_tokens.tolist(), columns=["input_ids", "token_type_ids", "attention_mask", "target_labels"])
dev_tokenized_df = pd.concat([merged_dev.reset_index(drop=True), dev_tokens_df.reset_index(drop=True)], axis=1)


# Tokenize and add to df
test_tokens = merged_test.apply(tokenize_post, axis = 1)
test_tokens_df = pd.DataFrame(test_tokens.tolist(), columns=["input_ids", "token_type_ids", "attention_mask", "target_labels"])
test_tokenized_df = pd.concat([merged_test.reset_index(drop=True), test_tokens_df.reset_index(drop=True)], axis=1)


train_tokenized_df.to_json("/Users/sabrina/Computational Social Science/final-project/data/tokenized/train_tokenized.json", orient = "records", lines = True)
dev_tokenized_df.to_json("/Users/sabrina/Computational Social Science/final-project/data/tokenized/dev_tokenized.json", orient = "records", lines = True)
test_tokenized_df.to_json("/Users/sabrina/Computational Social Science/final-project/data/tokenized/test_tokenized.json", orient = "records", lines = True)

['whoTarget', 'intentYN', 'sexYN', 'sexReason', 'offensiveYN', 'annotatorGender', 'annotatorMinority', 'sexPhrase', 'speakerMinorityYN', 'WorkerId', 'HITId', 'annotatorPolitics', 'annotatorRace', 'annotatorAge', 'post', 'targetMinority', 'targetCategory', 'targetStereotype', 'dataSource', 'context', 'input_ids', 'token_type_ids', 'attention_mask', 'target_labels']


In [99]:
import ast
def convert_string_to_list(string):
	return ast.literal_eval(string)

In [100]:
# For loading data back in rather than rerunning above
merged_train_savepath = "/Users/sabrina/Computational Social Science/final-project/data/merged/merged_train.csv"
merged_dev_savepath = "/Users/sabrina/Computational Social Science/final-project/data/merged/merged_dev.csv"
merged_test_savepath = "/Users/sabrina/Computational Social Science/final-project/data/merged/merged_test.csv"

merged_train = pd.read_csv(merged_train_savepath)
merged_dev = pd.read_csv(merged_dev_savepath)
merged_test = pd.read_csv(merged_test_savepath)

merged_train_impl_savepath = "/Users/sabrina/Computational Social Science/final-project/data/with_impl/merged_train.csv"
merged_dev_impl_savepath = "/Users/sabrina/Computational Social Science/final-project/data/with_impl/merged_dev.csv"
merged_test_impl_savepath = "/Users/sabrina/Computational Social Science/final-project/data/with_impl/merged_test.csv"

merged_train_impl = pd.read_csv(merged_train_impl_savepath)
merged_dev_impl = pd.read_csv(merged_dev_impl_savepath)
merged_test_impl = pd.read_csv(merged_test_impl_savepath)

train_tokenized_df = pd.read_json("/Users/sabrina/Computational Social Science/final-project/data/tokenized/train_tokenized.csv", orient = "records", lines = True)
dev_tokenized_df = pd.read_json("/Users/sabrina/Computational Social Science/final-project/data/tokenized/dev_tokenized.csv", orient = "records", lines = True)
test_tokenized_df = pd.read_json("/Users/sabrina/Computational Social Science/final-project/data/tokenized/test_tokenized.csv", orient = "records", lines = True)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (<unknown>, line 1)

In [115]:
# Dataloader
from torch.utils.data import Dataset, DataLoader

class BiasDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        print("test")
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        token_type_ids = self.dataset.loc[idx, "token_type_ids"]
        target_labels = self.dataset.loc[idx, "target_labels"]
        
        print(input_ids.shape)
        print(type(attention_mask))
        print(type(token_type_ids))
        print(type(target_labels))
        print("test2")
        # Create input tensors
        inputs = {
            "input_ids": torch.tensor(input_ids, dtype=torch.long).squeeze(dim=0),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long).squeeze(dim=0),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long).squeeze(dim=0),
            "target_labels": torch.tensor(target_labels, dtype=torch.long).squeeze(dim=0)
        }

        print("test3")
        
        return inputs

In [116]:
train_dataloader = DataLoader(
    BiasDataset(train_tokenized_df),
    batch_size=1,
    shuffle=True
)

test_dataloader = DataLoader(
    BiasDataset(test_tokenized_df),
    batch_size=1,
    shuffle=False
)

In [76]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [138]:
import torch.nn as nn

class RegenerativeTransformer(nn.Module):
    def __init__(self, encoder, decoder):
        super(RegenerativeTransformer, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.embedding = nn.Embedding(30522, 768)
        self.output = nn.Linear(768, 30522)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        memory = self.encoder(src, attention_mask=src_mask)[0]  # Ensure output matches expected format
        output = self.decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=src_mask)
        return output



In [174]:
import tqdm
import torch.nn.functional as F

MAX_LENGTH = 50
def train_loop(dataloader, model, loss_fn, encoder_optimizer, decoder_optimizer, start_token_id):
    train_loss = 0
    # set the model to training model
    model.train()
    iter_count = 0
    # for batch in dataloader:
    for batch in tqdm.tqdm(dataloader):
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        # previous tokens
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        target_labels = batch["target_labels"][:, :MAX_LENGTH].to(device)
        
        target_pad = MAX_LENGTH - target_labels.size(1)
        if target_pad > 0:
             target_labels = F.pad(target_labels, (0, target_pad), "constant", 0)
        
        # print(f"Input ids: {input_ids.shape}")
        # print(f"Attention_mask: {attention_mask.shape}")
        encoder_outputs = model.encoder(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        encoder_state = encoder_outputs.last_hidden_state
        # Decoder part initiated
        
        start_token_id = torch.tensor([start_token_id], dtype=torch.long, device=device)
        start_token_embed = model.embedding(start_token_id)
        decoder_input = start_token_embed.repeat(input_ids.size(0), 1, 1)
        
        # decoder_input = torch.tensor([model.embedding(start_token_id)]*input_ids.size(0), device=device)
        outputs = []

        for i in range(MAX_LENGTH):
            print(f"Decoder input: {decoder_input.shape}")
            print(f"Encoder output: {encoder_state.shape}")
            decoder_output = model.decoder(decoder_input, encoder_state)
            
            logits = model.output(decoder_output.squeeze(dim = 1))
            outputs.append(logits)
            
            decoder_input = target_labels[0, i]
            decoder_input = model.embedding(decoder_input).repeat(input_ids.size(0), 1, 1)

            # _, topi = logits.topk(1)
            # decoder_input = topi.squeeze().detach()
            # decoder_input = model.embedding(decoder_input).repeat(input_ids.size(0), 1, 1)

        outputs = torch.stack(outputs, dim=1)  # [batch_size, MAX_LENGTH, vocab_size]
        print(f"OUTPUTS: {outputs.shape}")
        print(f"TARGET LABELS: {target_labels.shape}")
        
        # preds = torch.argmax(outputs, dim = 2)
        mask = target_labels != 0
        loss = loss_fn(outputs.view(-1, outputs.size(-1)), target_labels.view(-1))
        print(f"Loss shape: {loss.shape}")
        loss = loss.view(target_labels.shape)
        masked_loss = loss * mask.float()

        loss_sum = masked_loss.sum()
        num_valid_tokens = mask.sum()
        loss = loss_sum / num_valid_tokens.float()
        print(f"Loss: {loss.item()}")
        train_loss += loss.item()


        if iter_count % 5 == 0:
            print(loss)
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        iter_count += 1
    
    return train_loss / len(dataloader)

def test_loop(dataloader, model, loss_fn, tokenizer, start_token_id):

    all_sentences = []
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm.tqdm(dataloader):
            # previous tokens
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            target_labels = batch["target_labels"][:, :MAX_LENGTH].to(device)
            
            target_pad = MAX_LENGTH - target_labels.size(1)
            if target_pad > 0:
                target_labels = F.pad(target_labels, (0, target_pad), "constant", 0)
            
            # print(f"Input ids: {input_ids.shape}")
            # print(f"Attention_mask: {attention_mask.shape}")
            encoder_outputs = model.encoder(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            
            encoder_state = encoder_outputs.last_hidden_state
            
            start_token_id = torch.tensor([start_token_id], dtype=torch.long, device=device)
            start_token_embed = model.embedding(start_token_id)
            decoder_input = start_token_embed.repeat(input_ids.size(0), 1, 1)
            
            # decoder_input = torch.tensor([model.embedding(start_token_id)]*input_ids.size(0), device=device)  # Start token
            outputs = []

            for i in range(MAX_LENGTH):
                print(f"Decoder input: {decoder_input.shape}")
                print(f"Encoder output: {encoder_state.shape}")
                decoder_output = model.decoder(decoder_input, encoder_state)
                
                logits = model.output(decoder_output.squeeze(dim = 1))
                outputs.append(logits)

                _, topi = logits.topk(1)
                decoder_input = topi.squeeze().detach()
                decoder_input = model.embedding(decoder_input).repeat(input_ids.size(0), 1, 1)

            outputs = torch.stack(outputs, dim=1)  # [batch_size, MAX_LENGTH, vocab_size]
            # print(f"OUTPUTS: {outputs.shape}")
            # print(f"TARGET LABELS: {target_labels.shape}")
            
            preds = torch.argmax(outputs, dim = 2)
            mask = target_labels != 0
            loss = loss_fn(outputs.view(-1, outputs.size(-1)), target_labels.view(-1))
            loss = loss.view(target_labels.shape)
            masked_loss = loss * mask.float()

            loss_sum = masked_loss.sum()
            num_valid_tokens = mask.sum()
            loss = loss_sum / num_valid_tokens.float()
            print(f"Loss: {loss.item()}")
            val_loss += loss.item()
            
            decoded_sentences = [tokenizer.decode(pred, skip_special_tokens=True) for pred in preds]
            all_sentences.extend(decoded_sentences)

    return val_loss / len(dataloader), all_sentences
    
        
    

In [172]:
import torch
from torch.nn import Transformer, TransformerDecoder, TransformerDecoderLayer
from transformers import FunnelTokenizer, FunnelModel
import torch.optim as optim

encoder = FunnelModel.from_pretrained("funnel-transformer/small")
encoder_tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")
print(encoder_tokenizer.vocab_size)
decoder_layer = TransformerDecoderLayer(d_model=768, nhead=8, batch_first = True)
decoder = TransformerDecoder(decoder_layer, num_layers = 6)

regenerative_model = RegenerativeTransformer(encoder, decoder)

encoder_optimizer = optim.Adam(regenerative_model.encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(regenerative_model.decoder.parameters(), lr=0.01)

loss_fn = nn.CrossEntropyLoss(reduction='none')



30522


In [175]:
print(encoder_tokenizer.pad_token)
print(encoder_tokenizer.convert_tokens_to_ids(encoder_tokenizer.pad_token))

<pad>
0


In [176]:
bos_token_id = encoder_tokenizer.convert_tokens_to_ids(encoder_tokenizer.bos_token)
train_loop(train_dataloader, regenerative_model, loss_fn, encoder_optimizer, decoder_optimizer, bos_token_id)

  0%|          | 0/35504 [00:00<?, ?it/s]

test
(1, 220)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
test2
test3
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder input: torch.Size([1, 1, 768])
Encoder output: torch.Size([1, 220, 768])
Decoder inp

  0%|          | 0/35504 [00:15<?, ?it/s]


KeyboardInterrupt: 

In [None]:
EPOCHS = 50

train_losses = []
val_losses = []
for epoch in range(EPOCHS):
	train_loss = train_loop(train_dataloader, regenerative_model, loss_fn, encoder_optimizer, decoder_optimizer, bos_token_id)
	print(f"Epoch: {train_loss}")
	train_losses.append(train_loss)
	if epoch % 5 == 0:
		val_loss, val_sentences = test_loop(test_dataloader, regenerative_model, loss_fn, encoder_tokenizer, bos_token_id)
		val_losses.append(val_loss)
		checkpoint = {
			"model": regenerative_model.state_dict(),
			"encoder_optimizer": encoder_optimizer.state_dict(),
			"decoder_optimizer": decoder_optimizer.state_dict(),
			"train_losses": train_losses,
			"val_losses": val_losses,
			"val_sentences": val_sentences,
			"epoch": epoch
		}
		torch.save(checkpoint, f"./checkpoints/checkpoint_{epoch}.pt")


print(f"Train Losses: {train_losses}")
print(f"Val Losses: {val_losses}")

for sent in val_sentences:
	print(sent)
	

In [97]:
from transformers import FunnelTokenizer, FunnelModel
tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")
model = FunnelModel.from_pretrained("funnel-transformer/small")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
print(output)
print(output.last_hidden_state.shape)



BaseModelOutput(last_hidden_state=tensor([[[ 0.1376, -0.3090,  0.6842,  ..., -0.4041,  0.8737, -0.7017],
         [-0.1957, -0.2551,  0.0395,  ...,  0.2188,  0.1387,  0.1632],
         [-0.2267,  0.3701,  0.0961,  ...,  0.0500, -0.0244, -0.1002],
         ...,
         [-0.5314,  0.7220,  0.2493,  ...,  0.0725,  0.0889, -0.0084],
         [-0.1779,  0.1923,  0.5646,  ...,  0.4919,  0.7747, -1.1734],
         [-0.1268,  0.0407,  0.0438,  ...,  0.6065,  1.0749, -1.3630]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)
torch.Size([1, 12, 768])
