<a href="https://colab.research.google.com/github/suchitbhayani/psychtweets/blob/main/model_training_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertModel, get_scheduler
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import AdamW

In [2]:
# Colab-only step: files.upload() opens the browser file picker so cleaned.csv
# can be sent to the runtime; `uploaded` holds whatever that call returns.
from google.colab import files
uploaded = files.upload()

Saving cleaned.csv to cleaned.csv


# Further Preprocessing

In [3]:
# Read the uploaded CSV, then discard the leftover 'Unnamed: 0' column.
csv_frame = pd.read_csv('/content/cleaned.csv')
data = csv_frame.drop(columns=['Unnamed: 0'])
data

Unnamed: 0,cleaned text,label
0,"the pope is infallible, this is a catholic dog...",intj
1,"being you makes you look cute on, because then...",intj
2,"i'm like entp but idiotichey boy, do you want ...",intj
3,give it to ... he has pica since childhood say...,intj
4,frances farmer will have her revenge on seattl...,intj
...,...,...
7232,"god,,pls take care hiro emergency room???? are...",intp
7233,wow last time i got intp i think u upset the f...,intp
7234,a 100% that someone will get his ass kicked so...,entp
7235,if you’re #intj this one is for you | what is ...,infj


In [4]:
# Swap the string class names in 'label' for integer ids. `le` keeps the fitted
# name <-> id mapping for later use.
le = LabelEncoder()
encoded_labels = le.fit_transform(data['label'])
data['label'] = encoded_labels
data

Unnamed: 0,cleaned text,label
0,"the pope is infallible, this is a catholic dog...",10
1,"being you makes you look cute on, because then...",10
2,"i'm like entp but idiotichey boy, do you want ...",10
3,give it to ... he has pica since childhood say...,10
4,frances farmer will have her revenge on seattl...,10
...,...,...
7232,"god,,pls take care hiro emergency room???? are...",11
7233,wow last time i got intp i think u upset the f...,11
7234,a 100% that someone will get his ass kicked so...,3
7235,if you’re #intj this one is for you | what is ...,8


# Dataset Class

Reference: PyTorch "Datasets & DataLoaders" tutorial — https://pytorch.org/tutorials/beginner/basics/data_tutorial.html

In [5]:
class MBTIDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per sample (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
all_texts = data['cleaned text'].tolist()
all_labels = data['label'].tolist()
train_text, test_text, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=5,
)

In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Both splits are tokenised with the same truncation/padding options.
tokenize_options = {'truncation': True, 'padding': True}
train_encodings = tokenizer(text=train_text, **tokenize_options)
test_encodings = tokenizer(text=test_text, **tokenize_options)

# Wrap each split so a DataLoader can index it sample by sample.
train_data = MBTIDataset(encodings=train_encodings, labels=train_labels)
test_data = MBTIDataset(encodings=test_encodings, labels=test_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
# Display the first training sample as a sanity check on MBTIDataset.__getitem__.
train_data[0]

{'input_ids': tensor([  101,  2191,  1996, 18414, 19291,  1998, 11947,  5332,  2243, 21495,
          3398,  2009,  1005,  1055,  2066,  2007,  1996,  7793,  1010,  2057,
          2074,  2655,  2068,  2035,  1005,  3902,  1005,  1012,  1996,  2069,
         20582,  2003,  1996,  2112,  1997,  1996,  2866,  1529,  1045,  1005,
          2310,  2196,  2657,  1037,  3345,  2022,  2170,  1037,  2422,  4334,
          3345,  2058,  2182,  1010,  2057,  2031,  2028,  2427,  1997,  3345,
          1998,  2009,  1005,  1055,  2170,  1005,  3345,  1005,  1996,  5848,
          2017, 10295, 18168,  2290,  9103, 19291,  2296,  2051,  2002,  3092,
          2039,  4911,  2046, 11947,  5332,  2243,  1005,  1055,  8102,  1024,
          2009,  1005,  1055,  2471, 20720,  2213,  1045,  1005,  1049,  2589,
         12403, 25057,  2023,  4485, 29300,  1010,  1045,  2074,  2228, 16914,
          3775,  2003, 17704,  4658,  1010,  1010,  9467,  2026,  6180,  2828,
          4487,  1529,  2057, 20014, 27

# Building the Neural Network

Reference: PyTorch "Build the Neural Network" tutorial — https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

In [9]:
# Choose the compute backend: CUDA first, then MPS, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [10]:
# Width of the hidden layer in the classification head that MBTIPredictor builds.
hidden_layer_size = 64

In [11]:
class MBTIPredictor(nn.Module):
    """Pretrained BERT encoder followed by a small feed-forward classification head.

    Args:
        hidden_size: Width of the head's hidden layer. ``None`` (the default)
            uses the notebook-level ``hidden_layer_size``, as the original did.
        num_classes: Number of output logits. Defaults to 16.
        pretrained_name: Checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, hidden_size=None, num_classes=16, pretrained_name='bert-base-uncased'):
        super().__init__()
        if hidden_size is None:
            hidden_size = hidden_layer_size
        self.bert = BertModel.from_pretrained(pretrained_name)
        # Read the encoder width from the checkpoint config rather than
        # hard-coding 768, so other BERT sizes also work.
        encoder_width = self.bert.config.hidden_size
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(encoder_width, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return raw (unnormalised) class logits for a batch of token ids."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is the field the original read positionally as outputs[1].
        return self.linear_relu_stack(outputs.pooler_output)

In [12]:
# Build the classifier, move its weights onto the selected device, and show the layer layout.
model = MBTIPredictor()
model = model.to(device)
print(model)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

MBTIPredictor(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

# Model Training

Reference: PyTorch "Optimizing Model Parameters" tutorial — https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html

In [13]:
# --- Hyperparameters ---
epochs = 3
batch_size = 16
learning_rate = 5e-5

# --- Loss and optimizer (the optimizer is tied to the current `model`'s parameters) ---
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)

# --- Batch iterators: only the training split is shuffled ---
train_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False)

# --- Cosine LR schedule without warmup, spanning every batch of every epoch ---
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [14]:
def train_loop(dataloader, model, loss_fn, optimizer, device, scheduler=None):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: Yields dict batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        model: Called as ``model(input_ids, attention_mask)``; its output is
            passed to ``loss_fn``.
        loss_fn: Callable ``loss_fn(pred, labels)`` returning the loss tensor.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.
        scheduler: Learning-rate scheduler stepped once per batch. When
            omitted, the notebook-level ``lr_scheduler`` is used, which is
            what the original implementation always did.
    """
    if scheduler is None:
        scheduler = lr_scheduler

    size = len(dataloader.dataset)
    seen = 0  # samples processed before the current batch
    model.train()
    for batch, batch_data in enumerate(dataloader):

        # Compute prediction and loss
        input_ids = batch_data['input_ids'].to(device)
        attention_mask = batch_data['attention_mask'].to(device)
        labels = batch_data['labels'].to(device)

        pred = model(input_ids, attention_mask)
        loss = loss_fn(pred, labels)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        scheduler.step()

        if batch % 50 == 0:
            loss_value = loss.item()
            # A running count stays correct even when the logged batch is a
            # short final batch (batch * len(input_ids) would under-report).
            current = seen
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

        seen += len(input_ids)

In [None]:
# Start from freshly initialised weights and rebuild the optimizer and LR
# schedule around them. An optimizer only updates the parameters it was
# constructed with, so reusing the `optimizer` created for the earlier `model`
# would leave this new model untrained (flat loss).
model = MBTIPredictor().to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
# `epochs`, `learning_rate`, `loss_fn` and `train_loader` come from the
# hyperparameter cell; the schedule must span every batch of every epoch.
num_training_steps = epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="cosine", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer, device)

print("Training complete!")

Epoch 1
-------------------------------
loss: 2.683860  [    0/ 5789]
loss: 2.699994  [  800/ 5789]


In [None]:
def find_accuracy(model, dataloader, device=None):
    """Return the percentage (0-100) of samples that ``model`` classifies correctly.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; must return one
            row of class scores per sample.
        dataloader: Yields dict batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.
        device: Device the batch tensors are moved to. When omitted, the device
            of the model's first parameter is used (CPU if the model has no
            parameters), so inputs always land where the model lives.

    Returns:
        Accuracy as a float percentage, or NaN when ``dataloader`` yields no
        samples (instead of raising ZeroDivisionError).

    Note:
        The model is left in eval mode.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_data in dataloader:
            input_ids = batch_data['input_ids'].to(device)
            attention_mask = batch_data['attention_mask'].to(device)
            labels = batch_data['labels'].to(device)

            model_output = model(input_ids, attention_mask)

            # Predicted class = index of the highest score in each row.
            predicted = model_output.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        return float("nan")
    accuracy = 100 * correct / total
    return accuracy

In [None]:
# Accuracy (in percent) of the current `model` on the held-out test split.
find_accuracy(model, test_loader)