In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import os

# Set paths
DATA_PATH = r"C:\Users\sagni\Downloads\New folder\mbti_1.csv"
MODEL_PATH = r"C:\Users\sagni\Downloads\New folder\personality_estimator_bi_lstm.h5"
TOKENIZER_PATH = r"C:\Users\sagni\Downloads\New folder\tokenizer_bi_lstm.joblib"

# Load and clean data
df = pd.read_csv(DATA_PATH)
df.dropna(inplace=True)

def clean_text(text):
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"[^A-Za-z ]", "", text)
    return text.lower()

df['posts'] = df['posts'].apply(clean_text)

# Encode labels
le = LabelEncoder()
df['type_encoded'] = le.fit_transform(df['type'])
y = to_categorical(df['type_encoded'])

# Tokenize text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df['posts'])
X = tokenizer.texts_to_sequences(df['posts'])
X = pad_sequences(X, maxlen=300)

# Save tokenizer
joblib.dump(tokenizer, TOKENIZER_PATH)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(df['type_encoded']), y=df['type_encoded'])
class_weights_dict = dict(enumerate(class_weights))

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Build model
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=300),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    class_weight=class_weights_dict,
    callbacks=[early_stop]
)

# Save model
model.save(MODEL_PATH)
print(f"✅ Model saved to: {MODEL_PATH}")
print(f"✅ Tokenizer saved to: {TOKENIZER_PATH}")




Epoch 1/10
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 342ms/step - accuracy: 0.1259 - loss: 2.7304 - val_accuracy: 0.0294 - val_loss: 2.7682
Epoch 2/10
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 330ms/step - accuracy: 0.1041 - loss: 2.7208 - val_accuracy: 0.0317 - val_loss: 2.7504
Epoch 3/10
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 332ms/step - accuracy: 0.0895 - loss: 2.5340 - val_accuracy: 0.0582 - val_loss: 2.7167
Epoch 4/10
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 331ms/step - accuracy: 0.1711 - loss: 1.9953 - val_accuracy: 0.0628 - val_loss: 2.8050
Epoch 5/10
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 326ms/step - accuracy: 0.2726 - loss: 1.3958 - val_accuracy: 0.0893 - val_loss: 2.8528
Epoch 6/10
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 336ms/step - accuracy: 0.4172 - loss: 0.9584 - val_accuracy: 0.0928 - val_loss: 3.1046




✅ Model saved to: C:\Users\sagni\Downloads\New folder\personality_estimator_bi_lstm.h5
✅ Tokenizer saved to: C:\Users\sagni\Downloads\New folder\tokenizer_bi_lstm.joblib


In [2]:
import numpy as np
import re
import joblib
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load paths
MODEL_PATH = r"C:\Users\sagni\Downloads\New folder\personality_estimator_bi_lstm.h5"
TOKENIZER_PATH = r"C:\Users\sagni\Downloads\New folder\tokenizer_bi_lstm.joblib"

# Load model and tokenizer
model = load_model(MODEL_PATH)
tokenizer = joblib.load(TOKENIZER_PATH)
labels = ['INFJ','ENTP','INTP','INTJ','ENTJ','ENFJ','INFP','ENFP',
          'ISTJ','ISFJ','ESTJ','ESFJ','ISTP','ISFP','ESTP','ESFP']

# Text cleaning
def clean_text(text):
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"[^A-Za-z ]", "", text)
    return text.lower()

# Prediction loop
print("📝 Enter text for personality prediction (or type 'exit' to quit):")
while True:
    text = input(">  ")
    if text.lower() == 'exit':
        break
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=300)
    pred = model.predict(padded)[0]
    mbti_type = labels[np.argmax(pred)]
    confidence = np.max(pred)
    print(f"\n🔍 Predicted MBTI Type: {mbti_type} (Confidence: {confidence:.2f})\n")




📝 Enter text for personality prediction (or type 'exit' to quit):


>   I love to spend time thinking about the mysteries of life and enjoy deep conversations.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step

🔍 Predicted MBTI Type: ENTP (Confidence: 0.14)



>   i am very sad for my pet


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step

🔍 Predicted MBTI Type: INTJ (Confidence: 0.11)



>   exit
