In [81]:
import numpy as np
import pandas as pd
import string
import re
# import traintestsplit
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, RobertaModel, AutoModel
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
from tqdm import tqdm
from sklearn.metrics import f1_score
from transformers import BertTokenizer, BertForSequenceClassification

In [82]:
df = pd.read_csv('/kaggle/input/nlpproj/mbti_1.csv')

In [83]:
df.head()
print(len(df))

8675


In [84]:
personality_types = df['type'].unique() 
print(personality_types)

['INFJ' 'ENTP' 'INTP' 'INTJ' 'ENTJ' 'ENFJ' 'INFP' 'ENFP' 'ISFP' 'ISTP'
 'ISFJ' 'ISTJ' 'ESTP' 'ESFP' 'ESTJ' 'ESFJ']


In [85]:
def clean_text(text):
    text = text.replace('|||', ' [SEP] ')
    words = str(text).split(' ')
    words = [i.lower() + " " for i in words]
    words = [i for i in words if not "http" in i]
    words = " ".join(words)
    # words = words.translate(words.maketrans('', '', string.punctuation))
    return words

In [86]:
df['cleaned_text'] = df['posts'].apply(clean_text)

In [87]:
df.head()

Unnamed: 0,type,posts,cleaned_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,[sep] [sep] enfp and intj moments sp...
1,ENTP,'I'm finding the lack of me in these posts ver...,'i'm finding the lack of me in these p...
2,INTP,'Good one _____ https://www.youtube.com/wat...,"'good one _____ [sep] of course, t..."
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","'dear intp, i enjoyed our conversatio..."
4,ENTJ,'You're fired.|||That's another silly misconce...,'you're fired. [sep] that's another silly...


In [88]:
df.drop(columns=['posts'], inplace=True)

In [89]:
df.rename(columns={'cleaned_text': 'posts'}, inplace=True)
df.head()

Unnamed: 0,type,posts
0,INFJ,[sep] [sep] enfp and intj moments sp...
1,ENTP,'i'm finding the lack of me in these p...
2,INTP,"'good one _____ [sep] of course, t..."
3,INTJ,"'dear intp, i enjoyed our conversatio..."
4,ENTJ,'you're fired. [sep] that's another silly...


In [90]:
#I is 0, E is 1
#N is 0, S is 1
#F is 0, T is 1
#J is 0, P is 1
def convert_personality_to_binary(personality):
    I = 1 if personality[0] == 'I' else 0
    N = 1 if personality[1] == 'N' else 0
    F = 1 if personality[2] == 'F' else 0
    J = 1 if personality[3] == 'J' else 0
    return [I, N, F, J]

def convert_binary_to_personality(binary):
    I = 'I' if (binary[0] == (1)) else 'E'
    N = 'N' if (binary[1] == (1)) else 'S'
    F = 'F' if (binary[2] == (1)) else 'T'
    J = 'J' if (binary[3] == (1)) else 'P'
    return I + N + F + J

df['label'] = df['type'].apply(convert_personality_to_binary)

In [91]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [92]:
df.head()

Unnamed: 0,type,posts,label
0,INFJ,[sep] [sep] enfp and intj moments sp...,"[1, 1, 1, 1]"
1,ENTP,'i'm finding the lack of me in these p...,"[0, 1, 0, 0]"
2,INTP,"'good one _____ [sep] of course, t...","[1, 1, 0, 0]"
3,INTJ,"'dear intp, i enjoyed our conversatio...","[1, 1, 0, 1]"
4,ENTJ,'you're fired. [sep] that's another silly...,"[0, 1, 0, 1]"


In [93]:
tokenizer = AutoTokenizer.from_pretrained('Bert-base-uncased')

In [94]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        #bertforsequenceclassification
        self.bert = AutoModel.from_pretrained('Bert-base-uncased')
        self.fc = nn.Linear(768, 4)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        x = self.bert(input_ids, attention_mask)
        x = x['pooler_output']
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

In [95]:
class MBTIDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['posts']
        label = self.df.iloc[idx]['label']

        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }

In [96]:
train_dataset = MBTIDataset(train_df, tokenizer, 256)
test_dataset = MBTIDataset(test_df, tokenizer, 256)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [97]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [98]:
model = Model().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [99]:
num_epochs = 5

In [100]:
def train(model, train_loader, test_loader, num_epochs, criterion, optimizer):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            label = data['label'].to(device)   
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, label.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {running_loss/len(train_loader)}')
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            y_true = []
            y_pred = []
            for i, data in tqdm(enumerate(test_loader), total=len(test_loader)):
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                label = data['label'].to(device)
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, label.float())
                running_loss += loss.item()
                y_true.extend(label.cpu().numpy())
                y_pred.extend(outputs.cpu().numpy())
            y_true = np.array(y_true)
            y_pred = np.array(y_pred)
            y_pred = np.round(y_pred)
            #compute f1 scores for each all 4 columns
            f1_0 = f1_score(y_true[:, 0], y_pred[:, 0], average='macro')
            f1_1 = f1_score(y_true[:, 1], y_pred[:, 1], average='macro')
            f1_2 = f1_score(y_true[:, 2], y_pred[:, 2], average='macro')
            f1_3 = f1_score(y_true[:, 3], y_pred[:, 3], average='macro')
            print(f'F1 Score for I/E: {f1_0}')
            print(f'F1 Score for N/S: {f1_1}')
            print(f'F1 Score for F/T: {f1_2}')
            print(f'F1 Score for J/P: {f1_3}')
            print(f'Loss: {running_loss/len(test_loader)}')

In [101]:
train(model, train_loader, test_loader, num_epochs, criterion, optimizer)

100%|██████████| 217/217 [03:33<00:00,  1.02it/s]


Epoch 1, Loss: 0.567365798670026


100%|██████████| 55/55 [00:26<00:00,  2.06it/s]


F1 Score for I/E: 0.4381476683937824
F1 Score for N/S: 0.461848635235732
F1 Score for F/T: 0.7227297268113595
F1 Score for J/P: 0.4426105743359917
Loss: 0.5302430445497687


100%|██████████| 217/217 [03:31<00:00,  1.03it/s]


Epoch 2, Loss: 0.4783756415690145


100%|██████████| 55/55 [00:26<00:00,  2.08it/s]


F1 Score for I/E: 0.6845454545454546
F1 Score for N/S: 0.623843069716596
F1 Score for F/T: 0.7780141546262791
F1 Score for J/P: 0.6677238219995558
Loss: 0.4471054033799605


100%|██████████| 217/217 [03:31<00:00,  1.03it/s]


Epoch 3, Loss: 0.3881421515606515


100%|██████████| 55/55 [00:26<00:00,  2.07it/s]


F1 Score for I/E: 0.7046355484965663
F1 Score for N/S: 0.6846381660751022
F1 Score for F/T: 0.7758686007876776
F1 Score for J/P: 0.6792154966435315
Loss: 0.44999871904199773


100%|██████████| 217/217 [03:30<00:00,  1.03it/s]


Epoch 4, Loss: 0.283488155578688


100%|██████████| 55/55 [00:26<00:00,  2.07it/s]


F1 Score for I/E: 0.7394434628975266
F1 Score for N/S: 0.6946978467237787
F1 Score for F/T: 0.7597816681088996
F1 Score for J/P: 0.6889063182361697
Loss: 0.4703889239918102


100%|██████████| 217/217 [03:31<00:00,  1.03it/s]


Epoch 5, Loss: 0.17591109803195373


100%|██████████| 55/55 [00:26<00:00,  2.08it/s]

F1 Score for I/E: 0.7308258904743266
F1 Score for N/S: 0.6976465798045602
F1 Score for F/T: 0.7598660983431194
F1 Score for J/P: 0.6686009162152639
Loss: 0.5309283998879519





In [102]:
torch.save(model.state_dict(), "MBTIsig3.pth")

### Iron man

In [41]:
model = Model()

In [56]:
model.load_state_dict(torch.load('/kaggle/input/modelsig2/MBTIsig2.pth'))
model = model.to(device)

In [109]:
df_ironman3 = pd.read_csv('/kaggle/input/moviecollection/infinity_war.csv')
df_ironman3.head()

Unnamed: 0,character,line,gender
0,ASGARDIAN PA,This is the Asgardian refugee vessel Statesman...,MALE
1,EBONY MAW,"Hear me, and rejoice. You have had the privile...",MALE
2,THANOS,I know what it's like to lose. To feel so desp...,MALE
3,THOR,You talk too much.,MALE
4,THANOS,"The Tesseract, or your brother's head. I assum...",MALE


In [110]:
# concatenate all the lines spoken by a character in a single value, separate the dialogues by '|||'
df_ironman3 = df_ironman3.groupby('character').agg({'line': '|||'.join}).reset_index()
df_ironman3.head()

Unnamed: 0,character,line
0,ASGARDIAN PA,This is the Asgardian refugee vessel Statesman...
1,BRUCE BANNER,"Thanos is coming. He's coming...|||Hey, Tony.|..."
2,BUCKY,"Where's the fight?|||A semi-stable, 100-year-o..."
3,COLLECTOR,I don't have it.|||I told you. I sold it. Why ...
4,CORVUS GLAIVE,"Give up the Stone, and she lives.|||I can't.||..."


In [111]:
# important_chars = ["HAPPY HOGAN", 'TONY STARK', 'PEPPER POTTS', 'JAMES RHODES', 'JARVIS']

In [112]:
# df_ironman3 = df_ironman3[df_ironman3['character'].isin(important_chars)]
df_ironman3.head()

Unnamed: 0,character,line
0,ASGARDIAN PA,This is the Asgardian refugee vessel Statesman...
1,BRUCE BANNER,"Thanos is coming. He's coming...|||Hey, Tony.|..."
2,BUCKY,"Where's the fight?|||A semi-stable, 100-year-o..."
3,COLLECTOR,I don't have it.|||I told you. I sold it. Why ...
4,CORVUS GLAIVE,"Give up the Stone, and she lives.|||I can't.||..."


In [113]:
def predict(model, tokenizer, df):
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in range(len(df)):
            text = df.iloc[i]['line']
            encoding = tokenizer(
                text,
                return_tensors='pt',
                max_length=256,
                padding='max_length',
                truncation=True
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            outputs = torch.round(outputs)
            predictions.append(torch.tensor(outputs.cpu().numpy(), dtype = int))
        return predictions

df_ironman3['label'] = predict(model, tokenizer, df_ironman3)
df_ironman3.head()

def convert_binary_to_personality(binary):
    I = 'I' if (binary[0][0] == (1)) else 'E'
    N = 'N' if (binary[0][1] == (1)) else 'S'
    F = 'F' if (binary[0][2] == (1)) else 'T'
    J = 'J' if (binary[0][3] == (1)) else 'P'
    return I + N + F + J
df_ironman3['personality'] = df_ironman3['label'].apply(convert_binary_to_personality)
df_ironman3.head()

Unnamed: 0,character,line,label,personality
0,ASGARDIAN PA,This is the Asgardian refugee vessel Statesman...,"[[tensor(1), tensor(1), tensor(1), tensor(0)]]",INFP
1,BRUCE BANNER,"Thanos is coming. He's coming...|||Hey, Tony.|...","[[tensor(0), tensor(1), tensor(1), tensor(0)]]",ENFP
2,BUCKY,"Where's the fight?|||A semi-stable, 100-year-o...","[[tensor(1), tensor(1), tensor(0), tensor(0)]]",INTP
3,COLLECTOR,I don't have it.|||I told you. I sold it. Why ...,"[[tensor(0), tensor(1), tensor(1), tensor(0)]]",ENFP
4,CORVUS GLAIVE,"Give up the Stone, and she lives.|||I can't.||...","[[tensor(1), tensor(1), tensor(1), tensor(0)]]",INFP


In [114]:
print(df_ironman3)

           character                                               line  \
0       ASGARDIAN PA  This is the Asgardian refugee vessel Statesman...   
1       BRUCE BANNER  Thanos is coming. He's coming...|||Hey, Tony.|...   
2              BUCKY  Where's the fight?|||A semi-stable, 100-year-o...   
3          COLLECTOR  I don't have it.|||I told you. I sold it. Why ...   
4      CORVUS GLAIVE  Give up the Stone, and she lives.|||I can't.||...   
5      CULL OBSIDIAN  We're going to New York City and We will tear ...   
6       DOME CONTROL  Requesting confirmation, my King. You said ope...   
7               DRAX  We'll take his ship.|||Wow.|||He is not a dude...   
8          EBONY MAW  Hear me, and rejoice. You have had the privile...   
9              EITRI  Thor?|||You were supposed to protect us. Asgar...   
10            FRIDAY  Not sure, I'm working on it.|||Will do.|||Yep....   
11            GAMORA  It's a distress signal, Rocket. Someone could ...   
12   GAMORA'S MOTHER     