In [12]:
#%pip install --use-feature=2020-resolver --user torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
#%pip install --use-feature=2020-resolver --upgrade --user kagglehub transformers datasets evaluate accelerate typing_extensions tqdm
#%pip install --use-feature=2020-resolver --user fastapi uvicorn pydantic

In [4]:
import sys
import os
import kagglehub
import pandas as pd
from datasets import Dataset, DatasetDict

parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(parent_dir)

from src.utils.preprocess import label2id, id2label, num_labels

def get_kaggle_df():
    path = kagglehub.dataset_download("zeyadkhalid/mbti-personality-types-500-dataset")
    return pd.read_csv(os.path.join(path,'MBTI 500.csv'))

def print_id2label():
    df = get_kaggle_df()
    l = list(df["type"].unique())
    print(f"id2label = {{ {','.join([ str(k) + ':' + repr(t) for k,t in enumerate(l)])}}}")
    print(f"label2id = {{ {','.join([ repr(t) + ':' +  str(k) for k,t in enumerate(l)])}}}")

def make_hf_ds():
    df = get_kaggle_df()

    def gen():
        for i,v in df.iterrows():
            yield {'text': v['posts'], 'label': label2id[v['type']]}

    ds = Dataset.from_generator(gen)
    ds_ = ds.train_test_split(test_size=0.1)
    ds_.save_to_disk(os.path.join('..','data','mbti'))

def load_ds():
    return DatasetDict.load_from_disk(os.path.join('..','data','mbti'))

In [5]:
#print_id2label()

In [1]:
id2label = { 0:'INTJ',1:'INTP',2:'ISFJ',3:'ISFP',4:'ISTJ',5:'ISTP',6:'ENFJ',7:'ENFP',8:'ENTJ',9:'ENTP',10:'ESFJ',11:'ESFP',12:'ESTJ',13:'ESTP',14:'INFJ',15:'INFP'}
label2id = { 'INTJ':0,'INTP':1,'ISFJ':2,'ISFP':3,'ISTJ':4,'ISTP':5,'ENFJ':6,'ENFP':7,'ENTJ':8,'ENTP':9,'ESFJ':10,'ESFP':11,'ESTJ':12,'ESTP':13,'INFJ':14,'INFP':15}
num_labels = 16

In [6]:
#make_hf_ds() 

In [7]:
ds = load_ds()

In [8]:
ds["train"][0]

{'text': 'bite like ni type know thing think would accurate say difficuy verbalize reason think process visual spatial holistic rather verbal sequential like answer include low function well high one especially notice one fjs fps way many people near middle scale sometimes score wrong side way achieve balance help ferret type imo see si se ni ne fi fe ti te think problem people introvert social anxiet society make social anxiety hate bother everyone way society whole treat introvert vouch poeple introvert well social anxiety sometimes social situation always easy figure social anxiety make want leave tire introversion sure social anxious introvert way hate people simply direct r socialanxiety simple oh gosh get much live family reason marry job enjoy generally thing great get little sometimes wish understand good maybe stand offish right word think get past prickly outside want overwhelm person since tend thing know month really appreciate insight anything say ohhh goodness must hard f

In [9]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [10]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_ds = ds.map(preprocess_function, batched=True)

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
import evaluate

accuracy = evaluate.load("accuracy")

In [13]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=num_labels, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
import torch
torch.cuda.is_available()

True

In [19]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2060'

In [None]:
training_args = TrainingArguments(
    output_dir="../models/my_mbti",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/5967 [00:00<?, ?it/s]

{'loss': 1.2237, 'grad_norm': 13.743212699890137, 'learning_rate': 1.8324115971174795e-05, 'epoch': 0.08}
{'loss': 0.8859, 'grad_norm': 12.72556209564209, 'learning_rate': 1.664823194234959e-05, 'epoch': 0.17}
{'loss': 0.8207, 'grad_norm': 13.707405090332031, 'learning_rate': 1.4972347913524386e-05, 'epoch': 0.25}
{'loss': 0.7521, 'grad_norm': 11.589092254638672, 'learning_rate': 1.329646388469918e-05, 'epoch': 0.34}
{'loss': 0.7284, 'grad_norm': 8.122697830200195, 'learning_rate': 1.1620579855873975e-05, 'epoch': 0.42}
{'loss': 0.6765, 'grad_norm': 14.169639587402344, 'learning_rate': 9.94469582704877e-06, 'epoch': 0.5}
{'loss': 0.6599, 'grad_norm': 6.587297439575195, 'learning_rate': 8.268811798223564e-06, 'epoch': 0.59}
{'loss': 0.6402, 'grad_norm': 10.743701934814453, 'learning_rate': 6.592927769398358e-06, 'epoch': 0.67}
{'loss': 0.5873, 'grad_norm': 12.94200611114502, 'learning_rate': 4.917043740573153e-06, 'epoch': 0.75}
{'loss': 0.5981, 'grad_norm': 9.286853790283203, 'learning

  0%|          | 0/663 [00:00<?, ?it/s]

{'eval_loss': 0.5601599216461182, 'eval_accuracy': 0.8283209201470727, 'eval_runtime': 203.6617, 'eval_samples_per_second': 52.081, 'eval_steps_per_second': 3.255, 'epoch': 1.0}
{'train_runtime': 6223.2587, 'train_samples_per_second': 15.339, 'train_steps_per_second': 0.959, 'train_loss': 0.7321360418499665, 'epoch': 1.0}


TrainOutput(global_step=5967, training_loss=0.7321360418499665, metrics={'train_runtime': 6223.2587, 'train_samples_per_second': 15.339, 'train_steps_per_second': 0.959, 'total_flos': 1.264849503879168e+16, 'train_loss': 0.7321360418499665, 'epoch': 1.0})

In [13]:
from transformers import pipeline
classifier = pipeline("text-classification", model="../models/my_mbti/checkpoint-5967")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [14]:
classifier("I am a very happy person")

[{'label': 'INFP', 'score': 0.13372699916362762}]