In [2]:
import os
import pandas as pd
import numpy as np

pd.set_option("max_columns", 300)
os.chdir(os.getcwd().replace('notebooks','').replace('MBIT',''))

In [3]:
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
pd.set_option('display.max_colwidth', None)
MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'
BATCH_SIZE = 16
N_EPOCHS = 1
MAX_LEN = 300

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
df = pd.read_csv('data/myer_briggs/raw/mbti_1.csv')

In [5]:
df_unravelled = pd.DataFrame({
    'type': np.repeat(df['type'], df['posts'].str.count('\|\|\|') + 1),
    'text': df['posts'].str.split('\|\|\|').explode()
})
df_unravelled = df_unravelled.reset_index(drop=True)

In [6]:
df_unravelled['I'] = ['I' in c for c in df_unravelled.type]
df_unravelled['N'] = ['N' in c for c in df_unravelled.type]
df_unravelled['T'] = ['T' in c for c in df_unravelled.type]
df_unravelled['J'] = ['J' in c for c in df_unravelled.type]
df_unravelled['E'] = ['E' in c for c in df_unravelled.type]
df_unravelled['S'] = ['S' in c for c in df_unravelled.type]
df_unravelled['F'] = ['F' in c for c in df_unravelled.type]
df_unravelled['P'] = ['P' in c for c in df_unravelled.type]
df_unravelled.head()


Unnamed: 0,type,text,I,N,T,J,E,S,F,P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,True,True,False,True,False,False,True,False
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg,True,True,False,True,False,False,True,False
2,INFJ,enfp and intj moments https://www.youtube.com/watch?v=iz7lE1g4XM4 sportscenter not top ten plays https://www.youtube.com/watch?v=uCdfze1etec pranks,True,True,False,True,False,False,True,False
3,INFJ,What has been the most life-changing experience in your life?,True,True,False,True,False,False,True,False
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 http://www.youtube.com/watch?v=u8ejam5DP3E On repeat for most of today.,True,True,False,True,False,False,True,False


In [7]:
df_sample = df_unravelled.sample(2000)

In [8]:
X = df_sample['text']
y = df_sample['I']

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [10]:
y_test.value_counts()/len(y_test)

True     0.783333
False    0.216667
Name: I, dtype: float64

In [11]:
y_train.value_counts()/len(y_train)

True     0.773571
False    0.226429
Name: I, dtype: float64

In [12]:
#define a tokenizer object
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
#tokenize the text
train_encodings = tokenizer(list(X_train.values),
                            truncation=True, 
                            padding=True)
test_encodings = tokenizer(list(X_test.values),
                           truncation=True, 
                           padding=True)

In [13]:
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings),
                                    list(y_train.values)))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings),
                                    list(y_test.values)))

In [42]:
BATCH_SIZE = 64
N_EPOCHS = 10

In [43]:
from src.metrics.keras import BinaryF1Score

In [44]:
model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
#chose the optimizer
optimizerr = tf.keras.optimizers.Adam(learning_rate=5e-5)
#define the loss function 
losss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

#build the model
model.compile(optimizer=optimizerr,
              loss=losss,
              metrics=[BinaryF1Score()])
# train the model 
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE),
          epochs=N_EPOCHS,
          batch_size=BATCH_SIZE)

All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


Epoch 1/10
Epoch 2/10


  m.reset_state()




In [17]:
import pickle
import pickle

# Assuming you have a trained model object called 'model'
# Save the model as a pickle file
with open('data/myer_briggs/model.pkl', 'wb') as f:
    pickle.dump(model, f)



Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...classifier
......vars
.........0
.........1
...distilbert
......vars
...distilbert\embeddings
......vars
.........0
.........1
...distilbert\embeddings\LayerNorm
......vars
.........0
.........1
...distilbert\embeddings\dropout
......vars
...distilbert\transformer
......vars
...distilbert\transformer\layer\tf_transformer_block
......vars
...distilbert\transformer\layer\tf_transformer_block\attention
......vars
...distilbert\transformer\layer\tf_transformer_block\attention\dropout
......vars
...distilbert\transformer\layer\tf_transformer_block\attention\k_lin
......vars
.........0
.........1
...distilbert\transformer\layer\tf_transformer_block\attention\out_lin
......vars
.........0
.........1
...distilbert\transformer\layer\tf_transformer_block\attention\q_lin
......vars
.........0
.........1
...distilbert\transformer\layer\tf_transformer_block\attention\v_lin
......vars
.........0
.........1
...distilbert\transformer

In [18]:
def predict_proba(text_list, model, tokenizer):  
    #tokenize the text
    encodings = tokenizer(text_list, 
                          max_length=15000, 
                          truncation=True, 
                          padding=True)
    #transform to tf.Dataset
    dataset = tf.data.Dataset.from_tensor_slices((dict(encodings)))
    #predict
    preds = model.predict(dataset.batch(1)).logits  
    
    #transform to array with probabilities
    res = tf.nn.softmax(preds, axis=1).numpy()      
    
    return res


In [29]:
i = 90
print('Introvert',y_test.iloc[i])

predict_proba(X_test.iloc[i], model, tokenizer)[:,1].mean()

Introvert True


0.77337915

In [20]:
y_pred = model.predict(test_dataset).logits  



In [41]:
from sklearn.metrics import classification_report

y_score = np.abs(y_pred[:,1])>0.5
# Generate the classification report
report = classification_report(y_test, y_score)

# Print the classification report
print(report)


              precision    recall  f1-score   support

       False       0.21      0.42      0.28       130
        True       0.78      0.56      0.65       470

    accuracy                           0.53       600
   macro avg       0.49      0.49      0.46       600
weighted avg       0.65      0.53      0.57       600

