In [1]:
import pandas as pd
import string
import re
# import traintestsplit
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, RobertaModel, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
df = pd.read_csv('/kaggle/input/nlp-proj/mbti_1.csv')

In [3]:
df.head()
print(len(df))

8675


In [4]:
personality_types = df['type'].unique() 
print(personality_types)

['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']


In [5]:
def clean_text(text):
    regex = re.compile('[%s]' % re.escape('|'))
    text = regex.sub(" ", text)
    words = str(text).split()
    words = [i.lower() + " " for i in words]
    words = [i for i in words if not "http" in i]
    words = " ".join(words)
    words = words.translate(words.maketrans('', '', string.punctuation))
    return words

In [6]:
df['cleaned_text'] = df['posts'].apply(clean_text)

In [7]:
df.head()

Unnamed: 0,type,posts,cleaned_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,enfp and intj moments sportscenter not t...
1,ENTP,'I'm finding the lack of me in these posts ver...,im finding the lack of me in these pos...
2,INTP,'Good one _____ https://www.youtube.com/wat...,good one of course to which i say i ...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",dear intp i enjoyed our conversation the...
4,ENTJ,'You're fired.|||That's another silly misconce...,youre fired thats another silly misconcep...


In [8]:
df.drop(columns=['posts'], inplace=True)

In [9]:
df.rename(columns={'cleaned_text': 'posts'}, inplace=True)
df.head()

Unnamed: 0,type,posts
0,INFJ,enfp and intj moments sportscenter not t...
1,ENTP,im finding the lack of me in these pos...
2,INTP,good one of course to which i say i ...
3,INTJ,dear intp i enjoyed our conversation the...
4,ENTJ,youre fired thats another silly misconcep...


In [10]:
#'INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP' 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ'
id2label = {0: 'INFJ', 1: 'ENTP', 2: 'INTP', 3: 'INTJ', 4: 'ENTJ', 5: 'ENFJ', 6: 'INFP', 7: 'ENFP', 8: 'ISFP', 9: 'ISTP', 10: 'ISFJ', 11: 'ISTJ', 12: 'ESTP', 13: 'ESFP', 14: 'ESTJ', 15: 'ESFJ'} 
label2id = {'INFJ': 0, 'ENTP': 1, 'INTP': 2, 'INTJ': 3, 'ENTJ': 4, 'ENFJ': 5, 'INFP': 6, 'ENFP': 7, 'ISFP': 8, 'ISTP': 9, 'ISFJ': 10, 'ISTJ': 11, 'ESTP': 12, 'ESFP': 13, 'ESTJ': 14, 'ESFJ': 15} # label rep

In [11]:
df['label'] = df['type'].map(label2id)
df.head()

Unnamed: 0,type,posts,label
0,INFJ,enfp and intj moments sportscenter not t...,0
1,ENTP,im finding the lack of me in these pos...,1
2,INTP,good one of course to which i say i ...,2
3,INTJ,dear intp i enjoyed our conversation the...,3
4,ENTJ,youre fired thats another silly misconcep...,4


In [12]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [13]:
# # only for local testing
# train_df = train_df[:100]
# test_df = test_df[:100]

In [14]:
tokenizer = AutoTokenizer.from_pretrained('Bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [15]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        #bertforsequenceclassification
        self.bert = AutoModel.from_pretrained('Bert-base-uncased')
        self.fc = nn.Linear(768, 16)

    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids, attention_mask)
        out = x[0]
        out = out[:, 0, :]
        out = self.fc(out)
        return out

        
    # def forward(self, input_ids, attention_mask):
    #     x = self.roberta(input_ids, attention_mask)
    #     out = x[0]
    #     out = out[:, 0, :]
    #     out = self.fc(out)
    #     return out

In [16]:
class MBTIDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['posts']
        label = self.df.iloc[idx]['label']

        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

In [18]:
train_dataset = MBTIDataset(train_df, tokenizer, 256)
test_dataset = MBTIDataset(test_df, tokenizer, 256)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [20]:
model = Model().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [21]:
num_classes = 16

In [22]:
def train(model, train_loader, test_loader, criterion, optimizer, num_classes, num_epochs):
    train_losses = []
    val_losses = []
    f1_micros = []
    f1_macros = []
    f1_weighteds = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        # avg_f1_micro = 0
        # avg_f1_macro = 0
        # avg_f1_weighted = 0
        #tqdm
        for i, data in enumerate(tqdm(train_loader)): 
            input_ids = data['input_ids'].to(device)
            attention_masks = data['attention_mask'].to(device)
            labels = data['label'].to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_masks)
            loss = criterion(outputs.view(-1, num_classes), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        #     predictions = torch.argmax(outputs, dim=1)
        #     labels = torch.argmax(labels, dim=1)
        #     f1_micro = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='micro')
        #     f1_macro = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='macro')
        #     f1_weighted = f1_score(labels.cpu().numpy(), predictions.cpu().numpy(), average='weighted')
        #     print(f1_micro, f1_macro, f1_weighted)
        #     avg_f1_micro += f1_micro
        #     avg_f1_macro += f1_macro
        #     avg_f1_weighted += f1_weighted
        # avg_f1_micro /= len(train_loader)
        # avg_f1_macro /= len(train_loader)
        # avg_f1_weighted /= len(train_loader)
        train_loss /= len(train_loader)
        train_losses.append(train_loss)
        # f1_micros.append(avg_f1_micro)
        # f1_macros.append(avg_f1_macro)
        # f1_weighteds.append(avg_f1_weighted)
        print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss}')
        model.eval()
        val_loss = 0
        with torch.no_grad():
            predictions = []
            true_labels = []
            for i, data in enumerate(test_loader):
                # print(data)
                input_ids = data['input_ids'].to(device)
                attention_masks = data['attention_mask'].to(device)
                labels = data['label'].to(device) #labels are one-hot encoded
                outputs = model(input_ids, attention_masks)
                loss = criterion(outputs.view(-1, num_classes), labels)
                val_loss += loss.item()
                # print('Outputs: ', outputs.shape)
                predictions.append(torch.argmax(outputs, dim=1))
                # print('Labels: ', labels.shape)
                true_labels.append(labels)
            predictions = torch.cat(predictions, dim=0)
            true_labels = torch.cat(true_labels, dim=0)
            print(predictions)
            print(true_labels)
            f1_micro = f1_score(true_labels.cpu().numpy(), predictions.cpu().numpy(), average='micro')
            f1_macro = f1_score(true_labels.cpu().numpy(), predictions.cpu().numpy(), average='macro')
            f1_weighted = f1_score(true_labels.cpu().numpy(), predictions.cpu().numpy(), average='weighted')
            val_loss /= len(test_loader)
            val_losses.append(val_loss)
            print(f'Epoch {epoch + 1}/{num_epochs}, Val Loss: {val_loss}, F1 Micro: {f1_micro}, F1 Macro: {f1_macro}, F1 Weighted: {f1_weighted}')
    return train_losses, val_losses

In [23]:
train_losses, val_losses = train(model, train_loader, test_loader, criterion, optimizer, num_classes, 10)

100%|██████████| 217/217 [03:26<00:00,  1.05it/s]


Epoch 1/10, Train Loss: 2.172328717148249
tensor([2, 6, 2,  ..., 6, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 1/10, Val Loss: 1.875280551476912, F1 Micro: 0.41844380403458215, F1 Macro: 0.16429050259085462, F1 Weighted: 0.36528228051881045


100%|██████████| 217/217 [03:27<00:00,  1.05it/s]


Epoch 2/10, Train Loss: 1.7129788569041662
tensor([2, 2, 2,  ..., 6, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 2/10, Val Loss: 1.6939166892658581, F1 Micro: 0.47665706051873197, F1 Macro: 0.31125302327146087, F1 Weighted: 0.4594278514950719


100%|██████████| 217/217 [03:26<00:00,  1.05it/s]


Epoch 3/10, Train Loss: 1.5175851193441223
tensor([2, 2, 2,  ..., 6, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 3/10, Val Loss: 1.6024520505558362, F1 Micro: 0.5296829971181556, F1 Macro: 0.39237288432097966, F1 Weighted: 0.5170913393738721


100%|██████████| 217/217 [03:24<00:00,  1.06it/s]


Epoch 4/10, Train Loss: 1.3440433498901156
tensor([2, 2, 2,  ..., 6, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 4/10, Val Loss: 1.5789603883569892, F1 Micro: 0.5302593659942363, F1 Macro: 0.4112599662505335, F1 Weighted: 0.5211749293330971


100%|██████████| 217/217 [03:24<00:00,  1.06it/s]


Epoch 5/10, Train Loss: 1.1859527227515998
tensor([2, 1, 2,  ..., 6, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 5/10, Val Loss: 1.6008684158325195, F1 Micro: 0.5268011527377522, F1 Macro: 0.4241688388720961, F1 Weighted: 0.5214550180051774


100%|██████████| 217/217 [03:24<00:00,  1.06it/s]


Epoch 6/10, Train Loss: 1.0233258767062068
tensor([2, 1, 2,  ..., 7, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 6/10, Val Loss: 1.7425647063688798, F1 Micro: 0.5112391930835735, F1 Macro: 0.4473449188916827, F1 Weighted: 0.5120127437357783


100%|██████████| 217/217 [03:24<00:00,  1.06it/s]


Epoch 7/10, Train Loss: 0.86180817957298
tensor([2, 2, 2,  ..., 7, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 7/10, Val Loss: 1.7595122424038974, F1 Micro: 0.5244956772334294, F1 Macro: 0.4484494748000673, F1 Weighted: 0.5226338022303021


100%|██████████| 217/217 [03:24<00:00,  1.06it/s]


Epoch 8/10, Train Loss: 0.7062745261851544
tensor([2, 2, 2,  ..., 6, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 8/10, Val Loss: 1.9399656144055453, F1 Micro: 0.515850144092219, F1 Macro: 0.43047975117512727, F1 Weighted: 0.5091888070461967


100%|██████████| 217/217 [03:25<00:00,  1.06it/s]


Epoch 9/10, Train Loss: 0.5555194661639253
tensor([2, 1, 2,  ..., 7, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 9/10, Val Loss: 1.9900632944974033, F1 Micro: 0.523342939481268, F1 Macro: 0.45373505327282815, F1 Weighted: 0.521115404273347


100%|██████████| 217/217 [03:25<00:00,  1.06it/s]


Epoch 10/10, Train Loss: 0.434142341296519
tensor([2, 2, 2,  ..., 6, 2, 2], device='cuda:0')
tensor([ 2,  3,  2,  ...,  0, 11,  2], device='cuda:0')
Epoch 10/10, Val Loss: 2.1517654050480237, F1 Micro: 0.5152737752161384, F1 Macro: 0.43203021110094963, F1 Weighted: 0.5107344777747636


In [24]:
torch.save(model.state_dict(), "bert_MBTI.pth")