In [1]:
import numpy as np
import pandas as pd
import string
import re
# import traintestsplit
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, RobertaModel, AutoModel, RobertaTokenizer
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df = pd.read_csv('/kaggle/input/nlp-proj/mbti_1.csv')

In [6]:
df.head()
print(len(df))

8675


In [7]:
personality_types = df['type'].unique() 
print(personality_types)

['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']


In [8]:
def clean_text(text):
    text = text.replace('|||', ' [SEP] ')
    words = str(text).split(' ')
    words = [i.lower() + " " for i in words]
    words = [i for i in words if not "http" in i]
    words = " ".join(words)
    # words = words.translate(words.maketrans('', '', string.punctuation))
    return words

In [9]:
df['cleaned_text'] = df['posts'].apply(clean_text)

In [10]:
df.head()

Unnamed: 0,type,posts,cleaned_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,[sep] [sep] enfp and intj moments sp...
1,ENTP,'I'm finding the lack of me in these posts ver...,'i'm finding the lack of me in these p...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'good one _____ [sep] of course, t..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'dear intp, i enjoyed our conversatio..."
4,ENTJ,'You're fired.|||That's another silly misconce...,'you're fired. [sep] that's another silly...


In [11]:
df.drop(columns=['posts'], inplace=True)

In [12]:
df.rename(columns={'cleaned_text': 'posts'}, inplace=True)
df.head()

Unnamed: 0,type,posts
0,INFJ,[sep] [sep] enfp and intj moments sp...
1,ENTP,'i'm finding the lack of me in these p...
2,INTP,"'good one _____ [sep] of course, t..."
3,INTJ,"'dear intp, i enjoyed our conversatio..."
4,ENTJ,'you're fired. [sep] that's another silly...


In [13]:
#I is 0, E is 1
#N is 0, S is 1
#F is 0, T is 1
#J is 0, P is 1
def convert_personality_to_binary(personality):
    I = 1 if personality[0] == 'I' else 0
    N = 1 if personality[1] == 'N' else 0
    F = 1 if personality[2] == 'F' else 0
    J = 1 if personality[3] == 'J' else 0
    return [I, N, F, J]

df['label'] = df['type'].apply(convert_personality_to_binary)

In [14]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [15]:
df.head()

Unnamed: 0,type,posts,label
0,INFJ,[sep] [sep] enfp and intj moments sp...,"[1, 1, 1, 1]"
1,ENTP,'i'm finding the lack of me in these p...,"[0, 1, 0, 0]"
2,INTP,"'good one _____ [sep] of course, t...","[1, 1, 0, 0]"
3,INTJ,"'dear intp, i enjoyed our conversatio...","[1, 1, 0, 1]"
4,ENTJ,'you're fired. [sep] that's another silly...,"[0, 1, 0, 1]"


In [19]:
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [20]:
class Roberta_Model(nn.Module):
    def __init__(self):
        super(Roberta_Model, self).__init__()
        self.bert = RobertaModel.from_pretrained("FacebookAI/roberta-base")
        self.fc = nn.Linear(768, 4)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids, attention_mask)
        x = x['pooler_output']
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

In [21]:
class MBTIDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['posts']
        label = self.df.iloc[idx]['label']

        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

In [22]:
train_dataset = MBTIDataset(train_df, tokenizer, 256)
test_dataset = MBTIDataset(test_df, tokenizer, 256)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [23]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [24]:
model = Roberta_Model().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
num_epochs = 5

In [26]:
def train(model, train_loader, test_loader, num_epochs, criterion, optimizer):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            label = data['label'].to(device)   
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, label.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {running_loss/len(train_loader)}')
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            y_true = []
            y_pred = []
            for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                label = data['label'].to(device)
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, label.float())
                running_loss += loss.item()
                y_true.extend(label.cpu().numpy())
                y_pred.extend(outputs.cpu().numpy())
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            y_pred = np.round(y_pred)
            #compute f1 scores for each all 4 columns
            f1_0 = f1_score(y_true[:, 0], y_pred[:, 0], average='macro')
            f1_1 = f1_score(y_true[:, 1], y_pred[:, 1], average='macro')
            f1_2 = f1_score(y_true[:, 2], y_pred[:, 2], average='macro')
            f1_3 = f1_score(y_true[:, 3], y_pred[:, 3], average='macro')
            print(f'F1 Score for I/E: {f1_0}')
            print(f'F1 Score for N/S: {f1_1}')
            print(f'F1 Score for F/T: {f1_2}')
            print(f'F1 Score for J/P: {f1_3}')
            print(f'Loss: {running_loss/len(test_loader)}')

In [27]:
train(model, train_loader, test_loader, num_epochs, criterion, optimizer)

100%|██████████| 217/217 [06:00<00:00,  1.66s/it]


Epoch 1, Loss: 0.5805019158097456


100%|██████████| 55/55 [01:02<00:00,  1.14s/it]


F1 Score for I/E: 0.4381476683937824
F1 Score for N/S: 0.461848635235732
F1 Score for F/T: 0.594009594009594
F1 Score for J/P: 0.3805783648696894
Loss: 0.5681264801458878


100%|██████████| 217/217 [05:49<00:00,  1.61s/it]


Epoch 2, Loss: 0.5579177061533598


100%|██████████| 55/55 [01:01<00:00,  1.12s/it]


F1 Score for I/E: 0.48742707517010614
F1 Score for N/S: 0.461848635235732
F1 Score for F/T: 0.6944646937889571
F1 Score for J/P: 0.4329903395933362
Loss: 0.5263318880037828


100%|██████████| 217/217 [05:49<00:00,  1.61s/it]


Epoch 3, Loss: 0.501756442582003


100%|██████████| 55/55 [01:01<00:00,  1.12s/it]


F1 Score for I/E: 0.6625881536411995
F1 Score for N/S: 0.4784223602484472
F1 Score for F/T: 0.7194663802809029
F1 Score for J/P: 0.6221089214059052
Loss: 0.5024706016887318


100%|██████████| 217/217 [05:49<00:00,  1.61s/it]


Epoch 4, Loss: 0.4262402453180832


100%|██████████| 55/55 [01:01<00:00,  1.12s/it]


F1 Score for I/E: 0.6632360213140589
F1 Score for N/S: 0.6407910901657503
F1 Score for F/T: 0.7211064884288172
F1 Score for J/P: 0.6280017152658662
Loss: 0.5043151259422303


100%|██████████| 217/217 [05:50<00:00,  1.61s/it]


Epoch 5, Loss: 0.30604932433174503


100%|██████████| 55/55 [01:01<00:00,  1.12s/it]

F1 Score for I/E: 0.6644451995366868
F1 Score for N/S: 0.6420663191626896
F1 Score for F/T: 0.7041954130522854
F1 Score for J/P: 0.6317264058473655
Loss: 0.5857533693313599





In [28]:
torch.save(model.state_dict(), 'roberta_model.pth')