In [83]:
import numpy as np
import pandas as pd
import string
import re
# import traintestsplit
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, RobertaModel, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification

In [84]:
df = pd.read_csv('mbti_1.csv')

In [85]:
df.head()
print(len(df))

8675


In [86]:
personality_types = df['type'].unique() 
print(personality_types)

['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']


In [87]:
def clean_text(text):
    text = text.replace('|||', ' [SEP] ')
    words = str(text).split(' ')
    words = [i.lower() + " " for i in words]
    words = [i for i in words if not "http" in i]
    words = " ".join(words)
    # words = words.translate(words.maketrans('', '', string.punctuation))
    return words

In [88]:
df['cleaned_text'] = df['posts'].apply(clean_text)

In [89]:
df.head()

Unnamed: 0,type,posts,cleaned_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,[sep] [sep] enfp and intj moments sp...
1,ENTP,'I'm finding the lack of me in these posts ver...,'i'm finding the lack of me in these p...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'good one _____ [sep] of course, t..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'dear intp, i enjoyed our conversatio..."
4,ENTJ,'You're fired.|||That's another silly misconce...,'you're fired. [sep] that's another silly...


In [53]:
df.drop(columns=['posts'], inplace=True)

In [54]:
df.rename(columns={'cleaned_text': 'posts'}, inplace=True)
df.head()

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top te...
1,ENTP,im finding the lack of me in these pos...
2,INTP,good one course to which i say i kno...
3,INTJ,dear intp i enjoyed our conversation the...
4,ENTJ,youre firedsepthats another silly misconce...


In [55]:
#I is 0, E is 1
#N is 0, S is 1
#F is 0, T is 1
#J is 0, P is 1
def convert_personality_to_binary(personality):
    I = 1 if personality[0] == 'I' else 0
    N = 1 if personality[1] == 'N' else 0
    F = 1 if personality[2] == 'F' else 0
    J = 1 if personality[3] == 'J' else 0
    return [I, N, F, J]

df['label'] = df['type'].apply(convert_personality_to_binary)

In [11]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [12]:
df.head()

Unnamed: 0,type,posts,label
0,INFJ,enfp and intj moments sportscenter not t...,"[1, 1, 1, 1]"
1,ENTP,im finding the lack of me in these pos...,"[0, 1, 0, 0]"
2,INTP,good one of course to which i say i ...,"[1, 1, 0, 0]"
3,INTJ,dear intp i enjoyed our conversation the...,"[1, 1, 0, 1]"
4,ENTJ,youre fired thats another silly misconcep...,"[0, 1, 0, 1]"


In [13]:
tokenizer = AutoTokenizer.from_pretrained('Bert-base-uncased')

In [14]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        #bertforsequenceclassification
        self.bert = AutoModel.from_pretrained('Bert-base-uncased')
        self.fc = nn.Linear(768, 4)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids, attention_mask)
        x = x['pooler_output']
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

In [15]:
class MBTIDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['posts']
        label = self.df.iloc[idx]['label']

        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

In [16]:
train_dataset = MBTIDataset(train_df, tokenizer, 256)
test_dataset = MBTIDataset(test_df, tokenizer, 256)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [18]:
model = Model().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [19]:
num_epochs = 5

In [20]:
def train(model, train_loader, test_loader, num_epochs, criterion, optimizer):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            label = data['label'].to(device)   
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, label.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {running_loss/len(train_loader)}')
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            y_true = []
            y_pred = []
            for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                label = data['label'].to(device)
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, label.float())
                running_loss += loss.item()
                y_true.extend(label.cpu().numpy())
                y_pred.extend(outputs.cpu().numpy())
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            y_pred = np.round(y_pred)
            #compute f1 scores for each all 4 columns
            f1_0 = f1_score(y_true[:, 0], y_pred[:, 0], average='macro')
            f1_1 = f1_score(y_true[:, 1], y_pred[:, 1], average='macro')
            f1_2 = f1_score(y_true[:, 2], y_pred[:, 2], average='macro')
            f1_3 = f1_score(y_true[:, 3], y_pred[:, 3], average='macro')
            print(f'F1 Score for I/E: {f1_0}')
            print(f'F1 Score for N/S: {f1_1}')
            print(f'F1 Score for F/T: {f1_2}')
            print(f'F1 Score for J/P: {f1_3}')
            print(f'Loss: {running_loss/len(test_loader)}')

In [21]:
train(model, train_loader, test_loader, num_epochs, criterion, optimizer)

  0%|          | 0/217 [00:22<?, ?it/s]


KeyboardInterrupt: 