In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Paths
data_path = r"C:\Users\sagni\Downloads\New folder\mbti_1.csv"
output_model_path = r"C:\Users\sagni\Downloads\New folder\personality_estimator.h5"

# MBTI → Big Five mapping function
def mbti_to_bigfive(mbti_type):
    return [
        1 if mbti_type[0] == 'E' else 0,  # Extraversion
        1 if mbti_type[1] == 'N' else 0,  # Openness
        1 if mbti_type[2] == 'F' else 0,  # Agreeableness
        1 if mbti_type[3] == 'J' else 0   # Conscientiousness
    ]

# Load and preprocess data
df = pd.read_csv(data_path)
df = df.dropna()
df['big5'] = df['type'].apply(mbti_to_bigfive)

# Reduce data (optional, for speed)
df = df.sample(1000, random_state=42)

# BERT Setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts):
    tokens = tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors='tf', max_length=128)
    outputs = bert_model(tokens)
    return outputs.last_hidden_state[:, 0, :].numpy()  # CLS token

# Embedding extraction
print("Extracting BERT embeddings...")
X = get_bert_embeddings(df['posts'])
y = np.array(df['big5'].tolist())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model
model = Sequential([
    Dense(256, activation='relu', input_shape=(768,)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(4, activation='sigmoid')  # Output: 4 Big Five traits (scaled between 0 and 1)
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['mae'])
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Save the model
model.save(output_model_path)
print(f"✅ Model saved at: {output_model_path}")






TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were 

Extracting BERT embeddings...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - loss: 0.5942 - mae: 0.3877 - val_loss: 0.6091 - val_mae: 0.3939
Epoch 2/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.5622 - mae: 0.3813 - val_loss: 0.6083 - val_mae: 0.3832
Epoch 3/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.5409 - mae: 0.3598 - val_loss: 0.6052 - val_mae: 0.4073
Epoch 4/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.5487 - mae: 0.3728 - val_loss: 0.6061 - val_mae: 0.4049
Epoch 5/5
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.5402 - mae: 0.3697 - val_loss: 0.6061 - val_mae: 0.3960




✅ Model saved at: C:\Users\sagni\Downloads\New folder\personality_estimator.h5
