In [3]:
import pandas as pd
import numpy as np
import os
import joblib
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# === Paths ===
dataset_path = r"C:\Users\sagni\Downloads\New folder\mbti_1.csv"
model_path = r"C:\Users\sagni\Downloads\New folder\personality_estimator.h5"
tokenizer_path = r"C:\Users\sagni\Downloads\New folder\tokenizer.joblib"

# === Load dataset ===
df = pd.read_csv(dataset_path)

# === Basic preprocessing ===
df = df[['type', 'posts']]
df['posts'] = df['posts'].astype(str)

# === Encode labels ===
le = LabelEncoder()
df['label'] = le.fit_transform(df['type'])
joblib.dump(le, os.path.join(os.path.dirname(model_path), 'label_encoder.joblib'))

# === Tokenize text ===
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['posts'])
joblib.dump(tokenizer, tokenizer_path)  # ✅ Save tokenizer

sequences = tokenizer.texts_to_sequences(df['posts'])
X = pad_sequences(sequences, maxlen=max_len)

# === Prepare labels ===
y = to_categorical(df['label'])

# === Train-test split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# === Build model ===
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(y.shape[1], activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# === Train ===
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=128)

# === Save model ===
model.save(model_path)
print(f"✅ Model saved to: {model_path}")
print(f"✅ Tokenizer saved to: {tokenizer_path}")


Epoch 1/5




[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 164ms/step - accuracy: 0.1475 - loss: 2.6291 - val_accuracy: 0.2133 - val_loss: 2.3134
Epoch 2/5
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 135ms/step - accuracy: 0.1844 - loss: 2.3548 - val_accuracy: 0.2133 - val_loss: 2.2933
Epoch 3/5
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 136ms/step - accuracy: 0.1941 - loss: 2.3078 - val_accuracy: 0.2173 - val_loss: 2.3065
Epoch 4/5
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 138ms/step - accuracy: 0.2136 - loss: 2.1977 - val_accuracy: 0.2225 - val_loss: 2.3222
Epoch 5/5
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 138ms/step - accuracy: 0.2714 - loss: 2.0612 - val_accuracy: 0.2173 - val_loss: 2.4131




✅ Model saved to: C:\Users\sagni\Downloads\New folder\personality_estimator.h5
✅ Tokenizer saved to: C:\Users\sagni\Downloads\New folder\tokenizer.joblib


In [4]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import string
import joblib

# === Load the trained model ===
model_path = r"C:\Users\sagni\Downloads\New folder\personality_estimator.h5"
model = load_model(model_path)
print(f"✅ Model loaded from: {model_path}")

# === Load tokenizer ===
tokenizer_path = r"C:\Users\sagni\Downloads\New folder\tokenizer.joblib"
tokenizer = joblib.load(tokenizer_path)
print(f"✅ Tokenizer loaded from: {tokenizer_path}")

# === Constants ===
max_len = 1000  # this must match training value

# === MBTI label map (adjust based on your training setup) ===
label_map = {
    0: "INTJ", 1: "INTP", 2: "ENTJ", 3: "ENTP",
    4: "INFJ", 5: "INFP", 6: "ENFJ", 7: "ENFP",
    8: "ISTJ", 9: "ISFJ", 10: "ESTJ", 11: "ESFJ",
    12: "ISTP", 13: "ISFP", 14: "ESTP", 15: "ESFP"
}

# === Text cleaning function ===
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\d+", "", text)
    text = text.strip()
    return text

# === Prediction function ===
def predict_mbti(text):
    cleaned = clean_text(text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)[0]
    mbti_idx = np.argmax(pred)
    return label_map[mbti_idx], float(np.max(pred))

# === Prediction loop ===
while True:
    user_input = input("\n📝 Enter text for personality prediction (or type 'exit' to quit):\n> ")
    if user_input.strip().lower() == "exit":
        print("👋 Exiting...")
        break
    try:
        mbti, confidence = predict_mbti(user_input)
        print(f"\n🔍 Predicted MBTI Type: {mbti} (Confidence: {confidence:.2f})")
    except Exception as e:
        print(f"❌ Error: {e}")




✅ Model loaded from: C:\Users\sagni\Downloads\New folder\personality_estimator.h5
✅ Tokenizer loaded from: C:\Users\sagni\Downloads\New folder\tokenizer.joblib



📝 Enter text for personality prediction (or type 'exit' to quit):
>  I love to spend time thinking about the mysteries of life and enjoy deep conversations.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 758ms/step

🔍 Predicted MBTI Type: ISFJ (Confidence: 0.75)



📝 Enter text for personality prediction (or type 'exit' to quit):
>  i am ver scared of the dark.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step

🔍 Predicted MBTI Type: ISFJ (Confidence: 0.81)



📝 Enter text for personality prediction (or type 'exit' to quit):
>  exit


👋 Exiting...
