In [36]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import warnings
import gensim
import numpy as np
warnings.filterwarnings('ignore')

# Training on Lemmatized data

In [37]:
# Read the lemmatized MBTI posts; the frame is shown as the cell's output.
df = pd.read_csv(filepath_or_buffer='../dataset/preprocessed/mbti_lemmatized.csv')
df

Unnamed: 0,type,posts
0,INFJ,intj moment sportscenter play pr...
1,ENTP,m find lack post alarming Sex boring s posit...
2,INTP,good course know s blessing ...
3,INTJ,Dear INTP enjoy conversation day esot...
4,ENTJ,fire s silly misconception approach logica...
...,...,...
8670,ISFP,think cat Fi dom reason website have...
8671,ENFP,thread exist someplace heck delete...
8672,INTP,question thing purple pill pick win lot...
8673,INFP,conflicted right come want child honestly...


In [38]:
# Load a previously saved gensim Word2Vec model from disk. Despite its name,
# `vectorizer` is the embedding model itself; train_test_evaluate_report reads
# it as a notebook-level global to build document vectors.
# NOTE(review): the file name suggests Wikipedia-trained embeddings — confirm.
vectorizer = gensim.models.Word2Vec.load('../models/gensim_wikipedia_word2vec.model')

In [39]:


def train_test_evaluate_report(model, X_train, X_test, y_train, y_test, embedding_model=None):
    """
    Train and evaluate a classifier on averaged Word2Vec document vectors.

    Each post is lower-cased, split on whitespace, and filtered to alphanumeric
    tokens longer than two characters. The vectors of the tokens known to the
    Word2Vec model are averaged into one document vector (all zeros when no
    token is known). Features are MinMax-scaled and labels are label-encoded,
    with both transformers fitted on the training split only.

    Args:
        model: The scikit-learn classifier to fit; it is fitted in place.
        X_train: Training posts (iterable of strings; non-string entries are
            treated as empty posts).
        X_test: Testing posts, same format as X_train.
        y_train: Training labels.
        y_test: Testing labels; every label must also occur in y_train,
            because the label encoder is fitted on y_train only.
        embedding_model: Optional gensim Word2Vec model used to embed tokens.
            Defaults to the notebook-level ``vectorizer`` when omitted, so
            existing five-argument calls behave as before.

    Returns:
        Accuracy on the test split.

    Side effects:
        Prints accuracy, macro-averaged precision and weighted recall.
    """
    # Fall back to the notebook-level Word2Vec model for existing callers.
    if embedding_model is None:
        embedding_model = vectorizer

    def preprocess_posts(posts):
        """Turn each post into a list of lower-cased alphanumeric tokens (>2 chars)."""
        processed_posts = []
        for post in posts:
            if isinstance(post, str):
                words = post.lower().split()
                processed_posts.append(
                    [word for word in words if word.isalnum() and len(word) > 2]
                )
            else:
                # Non-string entries (e.g. NaN from the CSV) become empty documents.
                processed_posts.append([])
        return processed_posts

    def document_vector(words, embeddings):
        """Average the embeddings of in-vocabulary words; zeros if none are known."""
        known_words = [word for word in words if word in embeddings.wv]
        if not known_words:
            return np.zeros(embeddings.vector_size)
        return np.mean(embeddings.wv[known_words], axis=0)

    # Preprocess posts
    X_train_processed = preprocess_posts(X_train)
    X_test_processed = preprocess_posts(X_test)

    # Create document vectors by iterating over the processed data
    X_train_vec = np.vstack([document_vector(post, embedding_model) for post in X_train_processed])
    X_test_vec = np.vstack([document_vector(post, embedding_model) for post in X_test_processed])

    # Apply MinMax scaling (fitted on the training split only to avoid leakage)
    scaler = MinMaxScaler()
    X_train_vec_scaled = scaler.fit_transform(X_train_vec)
    X_test_vec_scaled = scaler.transform(X_test_vec)

    # Encode target labels
    encoder = LabelEncoder()
    y_train_encoded = encoder.fit_transform(y_train)
    y_test_encoded = encoder.transform(y_test)

    # Train the model
    model.fit(X_train_vec_scaled, y_train_encoded)
    y_pred = model.predict(X_test_vec_scaled)

    # Generate classification report. Class ids and names come from the fitted
    # encoder rather than a hard-coded count, and `labels` is passed explicitly
    # so a fold that lacks some class does not make target_names mismatch.
    class_ids = np.arange(len(encoder.classes_))
    report = classification_report(y_test_encoded, y_pred,
                                   labels=class_ids,
                                   target_names=list(encoder.classes_),
                                   output_dict=True)

    # Extract metrics. Accuracy is computed directly: with explicit `labels`
    # the report may expose 'micro avg' instead of an 'accuracy' entry.
    accuracy = accuracy_score(y_test_encoded, y_pred)
    macro_precision = report['macro avg']['precision']
    weighted_recall = report['weighted avg']['recall']

    print(f"Accuracy: {accuracy}")
    print(f"Macro Precision: {macro_precision}")
    print(f"Weighted Recall: {weighted_recall} \n\n")

    return accuracy


## Logistic Regression

In [40]:
# 5-fold stratified cross-validation of LogisticRegression on Word2Vec document vectors.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back, so
# re-running the cell appends another row.
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]
    model = LogisticRegression()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "LogisticRegression",
    "text_representation": "Word2Vec",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.28472622478386167
Macro Precision: 0.1238409506203725
Weighted Recall: 0.28472622478386167 


Accuracy: 0.30662824207492795
Macro Precision: 0.12161534787060867
Weighted Recall: 0.30662824207492795 


Accuracy: 0.30144092219020174
Macro Precision: 0.15579094801866056
Weighted Recall: 0.30144092219020174 


Accuracy: 0.29394812680115273
Macro Precision: 0.17731062961707922
Weighted Recall: 0.29394812680115273 


Accuracy: 0.29913544668587894
Macro Precision: 0.12300055744166069
Weighted Recall: 0.29913544668587894 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5         

## Decision Tree Classifier

In [41]:
# 5-fold stratified cross-validation of DecisionTreeClassifier on Word2Vec document vectors.
folds = StratifiedKFold(n_splits=5)
# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts.iloc[train_index], df.posts.iloc[test_index], df.type.iloc[train_index], df.type.iloc[test_index]
    # Fixed seed: an unseeded tree breaks ties randomly, so the recorded
    # accuracy would change from run to run on identical data.
    model = DecisionTreeClassifier(random_state=42)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))
# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "DecisionTreeClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)


Accuracy: 0.16772334293948127
Macro Precision: 0.07810691783053253
Weighted Recall: 0.16772334293948127 


Accuracy: 0.1585014409221902
Macro Precision: 0.07982906321645182
Weighted Recall: 0.1585014409221902 


Accuracy: 0.15965417867435158
Macro Precision: 0.07912867029202571
Weighted Recall: 0.15965417867435158 


Accuracy: 0.1786743515850144
Macro Precision: 0.08475993594000553
Weighted Recall: 0.1786743515850144 


Accuracy: 0.14755043227665707
Macro Precision: 0.06815048680559123
Weighted Recall: 0.14755043227665707 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           R

## Multinomial Naive Bayes

In [42]:
# Cross-validate MultinomialNB (5 stratified folds) on Word2Vec document
# vectors and append the mean accuracy to the results table.
report = pd.read_csv('./report.csv')
folds = StratifiedKFold(n_splits=5)
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # Positional selection of this fold's posts and labels.
    X_train = df.posts.iloc[train_index]
    X_test = df.posts.iloc[test_index]
    y_train = df.type.iloc[train_index]
    y_test = df.type.iloc[test_index]
    model = MultinomialNB()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "MultinomialNB",
    "text_representation": "Word2Vec",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score),
}

# Append as the next row, persist, and show the updated table.
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.21498559077809798
Macro Precision: 0.03506418884599756
Weighted Recall: 0.21498559077809798 


Accuracy: 0.21268011527377523
Macro Precision: 0.028895796974985455
Weighted Recall: 0.21268011527377523 


Accuracy: 0.21440922190201728
Macro Precision: 0.02727911843568112
Weighted Recall: 0.21440922190201728 


Accuracy: 0.21268011527377523
Macro Precision: 0.02799213744903902
Weighted Recall: 0.21268011527377523 


Accuracy: 0.21210374639769453
Macro Precision: 0.028870512449334106
Weighted Recall: 0.21210374639769453 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5      

## Histogram Based Gradient Boosting

In [43]:
# 5-fold stratified cross-validation of HistGradientBoostingClassifier on Word2Vec document vectors.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    model = HistGradientBoostingClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "HistGradientBoostingClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.2680115273775216
Macro Precision: 0.1349460807424873
Weighted Recall: 0.2680115273775216 


Accuracy: 0.28357348703170027
Macro Precision: 0.1152683119651642
Weighted Recall: 0.28357348703170027 


Accuracy: 0.2760806916426513
Macro Precision: 0.11666198959947047
Weighted Recall: 0.2760806916426513 


Accuracy: 0.2680115273775216
Macro Precision: 0.18315088831982884
Weighted Recall: 0.2680115273775216 


Accuracy: 0.2703170028818444
Macro Precision: 0.24603565036887964
Weighted Recall: 0.2703170028818444 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           RandomF

## Support Vector Classifier

In [44]:
# 5-fold stratified cross-validation of SVC on Word2Vec document vectors.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    model = SVC()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "SVC",
    "text_representation": "Word2Vec",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.2922190201729107
Macro Precision: 0.10981045680618029
Weighted Recall: 0.2922190201729107 


Accuracy: 0.3048991354466859
Macro Precision: 0.1257834307647264
Weighted Recall: 0.3048991354466859 


Accuracy: 0.2985590778097983
Macro Precision: 0.14882991898145748
Weighted Recall: 0.2985590778097983 


Accuracy: 0.30086455331412104
Macro Precision: 0.18384420435964002
Weighted Recall: 0.30086455331412104 


Accuracy: 0.2864553314121037
Macro Precision: 0.12499950980467175
Weighted Recall: 0.2864553314121037 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           Random

## Random Forest Classifier

In [45]:
# 5-fold stratified cross-validation of RandomForestClassifier on Word2Vec document vectors.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    # Fixed seed: the forest's bootstrap and feature sampling are random, so
    # an unseeded run gives a different recorded accuracy each time.
    model = RandomForestClassifier(random_state=42)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "RandomForestClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.26455331412103744
Macro Precision: 0.14124329357082904
Weighted Recall: 0.26455331412103744 


Accuracy: 0.2760806916426513
Macro Precision: 0.10703597696349106
Weighted Recall: 0.2760806916426513 


Accuracy: 0.27204610951008645
Macro Precision: 0.1383653578443373
Weighted Recall: 0.27204610951008645 


Accuracy: 0.2703170028818444
Macro Precision: 0.12271738758764089
Weighted Recall: 0.2703170028818444 


Accuracy: 0.2703170028818444
Macro Precision: 0.12113559912017666
Weighted Recall: 0.2703170028818444 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           Rand

## Gradient Boosting Classifier

In [46]:
# 5-fold stratified cross-validation of GradientBoostingClassifier on Word2Vec document vectors.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    # A fresh estimator is built and fitted once per fold, so warm_start would
    # have no earlier fit to reuse. The seed makes the recorded accuracy repeatable.
    model = GradientBoostingClassifier(n_estimators=9, random_state=42)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "GradientBoostingClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.25936599423631124
Macro Precision: 0.10260590216334965
Weighted Recall: 0.25936599423631124 


Accuracy: 0.269164265129683
Macro Precision: 0.08674481525488803
Weighted Recall: 0.269164265129683 


Accuracy: 0.2564841498559078
Macro Precision: 0.09064013335954825
Weighted Recall: 0.2564841498559078 


Accuracy: 0.25936599423631124
Macro Precision: 0.10709121718375542
Weighted Recall: 0.25936599423631124 


Accuracy: 0.26570605187319885
Macro Precision: 0.1052347387889993
Weighted Recall: 0.26570605187319885 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           Rand

# Training on Unlemmatized data

In [47]:
# FIXME(review): this section is titled "Training on Unlemmatized data" and the
# cells below record "lemmatized": "no", yet this reads the same
# mbti_lemmatized.csv file as the earlier load cell. Point it at the
# unlemmatized export (its file name is not visible here — confirm in
# ../dataset/preprocessed) and re-run the section.
df = pd.read_csv('../dataset/preprocessed/mbti_lemmatized.csv')
df

Unnamed: 0,type,posts
0,INFJ,intj moment sportscenter play pr...
1,ENTP,m find lack post alarming Sex boring s posit...
2,INTP,good course know s blessing ...
3,INTJ,Dear INTP enjoy conversation day esot...
4,ENTJ,fire s silly misconception approach logica...
...,...,...
8670,ISFP,think cat Fi dom reason website have...
8671,ENFP,thread exist someplace heck delete...
8672,INTP,question thing purple pill pick win lot...
8673,INFP,conflicted right come want child honestly...


## Logistic Regression

In [48]:
# 5-fold stratified cross-validation of LogisticRegression on Word2Vec document vectors.
# NOTE(review): the row is recorded with "lemmatized": "no"; verify that `df`
# holds the unlemmatized posts at this point.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]
    model = LogisticRegression()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "LogisticRegression",
    "text_representation": "Word2Vec",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.28472622478386167
Macro Precision: 0.1238409506203725
Weighted Recall: 0.28472622478386167 


Accuracy: 0.30662824207492795
Macro Precision: 0.12161534787060867
Weighted Recall: 0.30662824207492795 


Accuracy: 0.30144092219020174
Macro Precision: 0.15579094801866056
Weighted Recall: 0.30144092219020174 


Accuracy: 0.29394812680115273
Macro Precision: 0.17731062961707922
Weighted Recall: 0.29394812680115273 


Accuracy: 0.29913544668587894
Macro Precision: 0.12300055744166069
Weighted Recall: 0.29913544668587894 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5         

## Decision Tree Classifier

In [49]:
# 5-fold stratified cross-validation of DecisionTreeClassifier on Word2Vec document vectors.
# NOTE(review): the row is recorded with "lemmatized": "no"; verify that `df`
# holds the unlemmatized posts at this point.
folds = StratifiedKFold(n_splits=5)
# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts.iloc[train_index], df.posts.iloc[test_index], df.type.iloc[train_index], df.type.iloc[test_index]
    # Fixed seed: an unseeded tree breaks ties randomly, so the recorded
    # accuracy would change from run to run on identical data.
    model = DecisionTreeClassifier(random_state=42)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))
# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "DecisionTreeClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)


Accuracy: 0.17636887608069166
Macro Precision: 0.08925435261875488
Weighted Recall: 0.17636887608069166 


Accuracy: 0.16368876080691644
Macro Precision: 0.08142790119141838
Weighted Recall: 0.16368876080691644 


Accuracy: 0.1579250720461095
Macro Precision: 0.07579825469214853
Weighted Recall: 0.1579250720461095 


Accuracy: 0.16138328530259366
Macro Precision: 0.07762651269705402
Weighted Recall: 0.16138328530259366 


Accuracy: 0.14927953890489915
Macro Precision: 0.0702268316406635
Weighted Recall: 0.14927953890489915 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           

## Multinomial Naive Bayes

In [50]:
# Cross-validate MultinomialNB (5 stratified folds) on Word2Vec document
# vectors and append the mean accuracy to the results table.
# NOTE(review): the row is recorded with "lemmatized": "no"; verify that `df`
# holds the unlemmatized posts at this point.
report = pd.read_csv('./report.csv')
folds = StratifiedKFold(n_splits=5)
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # Positional selection of this fold's posts and labels.
    X_train = df.posts.iloc[train_index]
    X_test = df.posts.iloc[test_index]
    y_train = df.type.iloc[train_index]
    y_test = df.type.iloc[test_index]
    model = MultinomialNB()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "MultinomialNB",
    "text_representation": "Word2Vec",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score),
}

# Append as the next row, persist, and show the updated table.
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.21498559077809798
Macro Precision: 0.03506418884599756
Weighted Recall: 0.21498559077809798 


Accuracy: 0.21268011527377523
Macro Precision: 0.028895796974985455
Weighted Recall: 0.21268011527377523 


Accuracy: 0.21440922190201728
Macro Precision: 0.02727911843568112
Weighted Recall: 0.21440922190201728 


Accuracy: 0.21268011527377523
Macro Precision: 0.02799213744903902
Weighted Recall: 0.21268011527377523 


Accuracy: 0.21210374639769453
Macro Precision: 0.028870512449334106
Weighted Recall: 0.21210374639769453 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5      

## Histogram Based Gradient Boosting

In [51]:
# 5-fold stratified cross-validation of HistGradientBoostingClassifier on Word2Vec document vectors.
# NOTE(review): the row is recorded with "lemmatized": "no"; verify that `df`
# holds the unlemmatized posts at this point.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    model = HistGradientBoostingClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "HistGradientBoostingClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.2680115273775216
Macro Precision: 0.1349460807424873
Weighted Recall: 0.2680115273775216 


Accuracy: 0.28357348703170027
Macro Precision: 0.1152683119651642
Weighted Recall: 0.28357348703170027 


Accuracy: 0.2760806916426513
Macro Precision: 0.11666198959947047
Weighted Recall: 0.2760806916426513 


Accuracy: 0.2680115273775216
Macro Precision: 0.18315088831982884
Weighted Recall: 0.2680115273775216 


Accuracy: 0.2703170028818444
Macro Precision: 0.24603565036887964
Weighted Recall: 0.2703170028818444 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           RandomF

## Support Vector Classifier

In [52]:
# 5-fold stratified cross-validation of SVC on Word2Vec document vectors.
# NOTE(review): the row is recorded with "lemmatized": "no"; verify that `df`
# holds the unlemmatized posts at this point.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    model = SVC()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "SVC",
    "text_representation": "Word2Vec",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.2922190201729107
Macro Precision: 0.10981045680618029
Weighted Recall: 0.2922190201729107 


Accuracy: 0.3048991354466859
Macro Precision: 0.1257834307647264
Weighted Recall: 0.3048991354466859 


Accuracy: 0.2985590778097983
Macro Precision: 0.14882991898145748
Weighted Recall: 0.2985590778097983 


Accuracy: 0.30086455331412104
Macro Precision: 0.18384420435964002
Weighted Recall: 0.30086455331412104 


Accuracy: 0.2864553314121037
Macro Precision: 0.12499950980467175
Weighted Recall: 0.2864553314121037 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           Random

## Random Forest Classifier

In [53]:
# 5-fold stratified cross-validation of RandomForestClassifier on Word2Vec document vectors.
# NOTE(review): the row is recorded with "lemmatized": "no"; verify that `df`
# holds the unlemmatized posts at this point.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    # Fixed seed: the forest's bootstrap and feature sampling are random, so
    # an unseeded run gives a different recorded accuracy each time.
    model = RandomForestClassifier(random_state=42)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "RandomForestClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.2760806916426513
Macro Precision: 0.1309992570012704
Weighted Recall: 0.2760806916426513 


Accuracy: 0.2824207492795389
Macro Precision: 0.11593110299802799
Weighted Recall: 0.2824207492795389 


Accuracy: 0.2639769452449568
Macro Precision: 0.10045831257253829
Weighted Recall: 0.2639769452449568 


Accuracy: 0.2610951008645533
Macro Precision: 0.1260772658675006
Weighted Recall: 0.2610951008645533 


Accuracy: 0.26512968299711814
Macro Precision: 0.09315346787184467
Weighted Recall: 0.26512968299711814 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           RandomF

## Gradient Boosting Classifier

In [54]:
# 5-fold stratified cross-validation of GradientBoostingClassifier on Word2Vec document vectors.
# NOTE(review): the row is recorded with "lemmatized": "no"; verify that `df`
# holds the unlemmatized posts at this point.
folds = StratifiedKFold(n_splits=5)

# Existing results table; this cell appends one row and writes it back.
report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    # split() yields positional indices, so select rows with .iloc rather than by label.
    X_train, X_test = df.posts.iloc[train_index], df.posts.iloc[test_index]
    y_train, y_test = df.type.iloc[train_index], df.type.iloc[test_index]

    # A fresh estimator is built and fitted once per fold, so warm_start would
    # have no earlier fit to reuse. The seed makes the recorded accuracy repeatable.
    model = GradientBoostingClassifier(n_estimators=9, random_state=42)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

# Mean accuracy over the folds is what gets recorded.
summary = {
    "model": "GradientBoostingClassifier",
    "text_representation": "Word2Vec",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.26051873198847264
Macro Precision: 0.10705207040681744
Weighted Recall: 0.26051873198847264 


Accuracy: 0.2564841498559078
Macro Precision: 0.08693031999713283
Weighted Recall: 0.2564841498559078 


Accuracy: 0.2570605187319885
Macro Precision: 0.10517443152709255
Weighted Recall: 0.2570605187319885 


Accuracy: 0.2610951008645533
Macro Precision: 0.1346911409724959
Weighted Recall: 0.2610951008645533 


Accuracy: 0.26570605187319885
Macro Precision: 0.1161032210155017
Weighted Recall: 0.26570605187319885 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           Rando