In [3]:
# Install necessary libraries
!pip install nltk catboost scikit-learn

# Import necessary libraries
import numpy as np
import pandas as pd
import warnings
import joblib
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
warnings.filterwarnings('ignore')

# Ensure NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Text preprocessing functions
def to_lower(text):
    return text.lower()

def remove_urls(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_stopwords(text):
    stop_words_set = set(nltk.corpus.stopwords.words('english'))
    return ' '.join(word for word in text.split() if word not in stop_words_set)

def remove_mbti_labels(text):
    mbti_labels = ['infj', 'infp', 'intj', 'intp', 'isfj', 'isfp', 'istj', 'istp', 'enfj', 'enfp', 'entj', 'entp', 'esfj', 'esfp', 'estj', 'estp']
    return ' '.join(word for word in text.split() if word not in mbti_labels)

def remove_punct(text):
    return ''.join(char for char in text if char not in string.punctuation)

def remove_number(text):
    return ''.join(char for char in text if not char.isdigit())

def to_strip(text):
    return ' '.join(text.split())

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

def prepro(text):
    return lemmatize(to_strip(remove_number(remove_punct(remove_mbti_labels(remove_stopwords(remove_urls(to_lower(str(text)))))))))

# Load the data
data = pd.read_csv('mbti_1.csv')

# Define MBTI dimensions
mbti_dimensions = ['I-E', 'N-S', 'T-F', 'J-P']
for dim in mbti_dimensions:
    data[dim] = data['type'].apply(lambda x: x[mbti_dimensions.index(dim)])

data['clean'] = data['posts'].apply(prepro)

# Define machine learning techniques to use
ml_techniques = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "SVM": SVC(probability=True),
    "CatBoost": CatBoostClassifier(iterations=500, verbose=False)  # Reduced iterations for quicker execution
}

# Train and evaluate models for each MBTI dimension and ML technique
for dim in mbti_dimensions:
    X = data['clean']
    y = data[dim]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    vectorizer = TfidfVectorizer(ngram_range=(1,1))
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Serialize the vectorizer
    joblib.dump(vectorizer, f'vectorizer_{dim}.pkl')

    for technique_name, model in ml_techniques.items():
        model.fit(X_train_vec, y_train)
        # Serialize the model
        joblib.dump(model, f'{dim}_{technique_name}.pkl')
        # Predict and evaluate the model
        predictions = model.predict(X_test_vec)
        print(f"\nClassification report for {dim} dimension using {technique_name}:")
        print(classification_report(y_test, predictions))

# Predicting a new post's MBTI type (example using Logistic Regression for 'I-E' dimension)
new_post_text = input("Enter new post text: ")
post_processed = prepro(new_post_text)

final_mbti_type = ''
for dim in mbti_dimensions:
    vectorizer = joblib.load(f'vectorizer_{dim}.pkl')
    model = joblib.load(f'{dim}_LogisticRegression.pkl')

    post_vectorized = vectorizer.transform([post_processed])
    prediction = model.predict(post_vectorized)[0]
    final_mbti_type += prediction

print(f"\nPredicted MBTI Type (using Logistic Regression): {final_mbti_type}")




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



Classification report for I-E dimension using LogisticRegression:
              precision    recall  f1-score   support

           E       0.83      0.14      0.23       400
           I       0.79      0.99      0.88      1335

    accuracy                           0.79      1735
   macro avg       0.81      0.56      0.56      1735
weighted avg       0.80      0.79      0.73      1735


Classification report for I-E dimension using RandomForest:
              precision    recall  f1-score   support

           E       0.00      0.00      0.00       400
           I       0.77      1.00      0.87      1335

    accuracy                           0.77      1735
   macro avg       0.38      0.50      0.43      1735
weighted avg       0.59      0.77      0.67      1735


Classification report for I-E dimension using SVM:
              precision    recall  f1-score   support

           E       0.85      0.15      0.25       400
           I       0.80      0.99      0.88      1335

  