In [1]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import warnings
warnings.filterwarnings('ignore')



# Training on Lemmatized data

In [2]:
df = pd.read_csv('../dataset/preprocessed/mbti_lemmatized.csv')
df

Unnamed: 0,type,posts
0,INFJ,intj moment sportscenter play pr...
1,ENTP,m find lack post alarming Sex boring s posit...
2,INTP,good course know s blessing ...
3,INTJ,Dear INTP enjoy conversation day esot...
4,ENTJ,fire s silly misconception approach logica...
...,...,...
8670,ISFP,think cat Fi dom reason website have...
8671,ENFP,thread exist someplace heck delete...
8672,INTP,question thing purple pill pick win lot...
8673,INFP,conflicted right come want child honestly...


In [None]:
def train_test_evaluate_report(model, X_train, X_test, y_train, y_test):
    # vectorize
    vectorizer = TfidfVectorizer(max_features=5000)
    vectorizer.fit(pd.concat([X_train, X_test]))
    X = vectorizer.transform(pd.concat([X_train, X_test])).toarray()
    
    # transform
    X_train = vectorizer.transform(X_train).toarray()
    X_test = vectorizer.transform(X_test).toarray()
    
    # encode target
    encoder = LabelEncoder()
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)
    
    
    # train
    model.fit(X_train, y_train)
    
    report = classification_report(y_test, model.predict(X_test), target_names=encoder.inverse_transform([i for i in range(16)]), output_dict=True)
    
    # Accessing metrics
    accuracy = report['accuracy']
    macro_precision = report['macro avg']['precision']
    weighted_recall = report['weighted avg']['recall']

    print(f"Accuracy: {accuracy}")
    print(f"Macro Precision: {macro_precision}")
    print(f"Weighted Recall: {weighted_recall} \n\n")
    
    return accuracy

## ✅ Main Script with Grid Search over OneVsRest + LogisticRegression


In [4]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index] \
        , df.type[train_index], df.type[test_index]
    model = LogisticRegression()
    score.append(train_test_evaluate_report(model,X_train, X_test, y_train, y_test))
    
summary = {
    "model": "LogisticRegression",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.6547550432276658
Macro Precision: 0.5981197743700568
Weighted Recall: 0.6547550432276658 


Accuracy: 0.6622478386167147
Macro Precision: 0.6009970929257646
Weighted Recall: 0.6622478386167147 


Accuracy: 0.6755043227665706
Macro Precision: 0.6117258857827558
Weighted Recall: 0.6755043227665706 


Accuracy: 0.6455331412103746
Macro Precision: 0.6147156932591632
Weighted Recall: 0.6455331412103746 


Accuracy: 0.6587896253602306
Macro Precision: 0.6284958370267169
Weighted Recall: 0.6587896253602306 


                model text_representation lemmatized  accuracy
0  LogisticRegression              TF-IDF        yes  0.659366


## Decision Tree Classifier

In [5]:
folds = StratifiedKFold(n_splits=5)
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts.iloc[train_index], df.posts.iloc[test_index], df.type.iloc[train_index], df.type.iloc[test_index]
    model = DecisionTreeClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))
summary = {
    "model": "DecisionTreeClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)


Accuracy: 0.4847262247838617
Macro Precision: 0.3235730085071963
Weighted Recall: 0.4847262247838617 


Accuracy: 0.5072046109510087
Macro Precision: 0.33230275833747325
Weighted Recall: 0.5072046109510087 


Accuracy: 0.4910662824207493
Macro Precision: 0.32960753013978106
Weighted Recall: 0.4910662824207493 


Accuracy: 0.4904899135446686
Macro Precision: 0.3374430050969215
Weighted Recall: 0.4904899135446686 


Accuracy: 0.4743515850144092
Macro Precision: 0.30098421606063963
Weighted Recall: 0.4743515850144092 


                    model text_representation lemmatized  accuracy
0      LogisticRegression              TF-IDF        yes  0.659366
1  DecisionTreeClassifier              TF-IDF        yes  0.489568


## Multinomial Naive Bayes

In [6]:
folds = StratifiedKFold(n_splits=5)
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts.iloc[train_index], df.posts.iloc[test_index], df.type.iloc[train_index], df.type.iloc[test_index]
    model = MultinomialNB()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))
summary = {
    "model": "MultinomialNB",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.3573487031700288
Macro Precision: 0.13964264676298493
Weighted Recall: 0.3573487031700288 


Accuracy: 0.3786743515850144
Macro Precision: 0.20897749140899896
Weighted Recall: 0.3786743515850144 


Accuracy: 0.36023054755043227
Macro Precision: 0.195397257270923
Weighted Recall: 0.36023054755043227 


Accuracy: 0.37175792507204614
Macro Precision: 0.1910233764021929
Weighted Recall: 0.37175792507204614 


Accuracy: 0.3642651296829971
Macro Precision: 0.20003786827487752
Weighted Recall: 0.3642651296829971 


                    model text_representation lemmatized  accuracy
0      LogisticRegression              TF-IDF        yes  0.659366
1  DecisionTreeClassifier              TF-IDF        yes  0.489568
2           MultinomialNB              TF-IDF        yes  0.366455


## Histogram Based Gradient Boosting

In [None]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = HistGradientBoostingClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "HistGradientBoostingClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.130835734870317
Macro Precision: 0.07579658687181423
Weighted Recall: 0.130835734870317 


Accuracy: 0.21152737752161382
Macro Precision: 0.07571490467937608
Weighted Recall: 0.21152737752161382 


Accuracy: 0.17406340057636888
Macro Precision: 0.09714383287261802
Weighted Recall: 0.17406340057636888 


Accuracy: 0.13025936599423632
Macro Precision: 0.08607301672038314
Weighted Recall: 0.13025936599423632 


Accuracy: 0.11296829971181556
Macro Precision: 0.07593965859771379
Weighted Recall: 0.11296829971181556 


                            model text_representation lemmatized  accuracy
0              LogisticRegression              TF-IDF        yes  0.659366
1          DecisionTreeClassifier              TF-IDF        yes  0.489568
2                   MultinomialNB              TF-IDF        yes  0.366455
3  HistGradientBoostingClassifier              TF-IDF        yes  0.151931


## Support Vector Classifier

In [8]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = SVC()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "SVC",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.6432276657060518
Macro Precision: 0.5895845160796904
Weighted Recall: 0.6432276657060518 


Accuracy: 0.661671469740634
Macro Precision: 0.5982320322777801
Weighted Recall: 0.661671469740634 


Accuracy: 0.667435158501441
Macro Precision: 0.6074338852894714
Weighted Recall: 0.667435158501441 


Accuracy: 0.6495677233429394
Macro Precision: 0.6058370443145276
Weighted Recall: 0.6495677233429394 


Accuracy: 0.6501440922190201
Macro Precision: 0.5947021567901003
Weighted Recall: 0.6501440922190201 


                            model text_representation lemmatized  accuracy
0              LogisticRegression              TF-IDF        yes  0.659366
1          DecisionTreeClassifier              TF-IDF        yes  0.489568
2                   MultinomialNB              TF-IDF        yes  0.366455
3  HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                             SVC              TF-IDF        yes  0.654409


## Random Forest Classifier

In [9]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = RandomForestClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "RandomForestClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.5244956772334294
Macro Precision: 0.49632579782891717
Weighted Recall: 0.5244956772334294 


Accuracy: 0.5106628242074928
Macro Precision: 0.38347868397700124
Weighted Recall: 0.5106628242074928 


Accuracy: 0.5371757925072046
Macro Precision: 0.37607045912798565
Weighted Recall: 0.5371757925072046 


Accuracy: 0.506628242074928
Macro Precision: 0.49381592942068187
Weighted Recall: 0.506628242074928 


Accuracy: 0.5285302593659942
Macro Precision: 0.5290394464890507
Weighted Recall: 0.5285302593659942 


                            model text_representation lemmatized  accuracy
0              LogisticRegression              TF-IDF        yes  0.659366
1          DecisionTreeClassifier              TF-IDF        yes  0.489568
2                   MultinomialNB              TF-IDF        yes  0.366455
3  HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                             SVC              TF-IDF        yes  0.654409
5          RandomForestClass

## Gradient Boosting Classifier

In [13]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = GradientBoostingClassifier(n_estimators=9, warm_start=True)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "GradientBoostingClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.6219020172910663
Macro Precision: 0.5121086621192832
Weighted Recall: 0.6219020172910663 


Accuracy: 0.6172910662824207
Macro Precision: 0.49508511868022465
Weighted Recall: 0.6172910662824207 


Accuracy: 0.6144092219020173
Macro Precision: 0.4832410393859426
Weighted Recall: 0.6144092219020173 


Accuracy: 0.5936599423631124
Macro Precision: 0.4977107064581531
Weighted Recall: 0.5936599423631124 


Accuracy: 0.6097982708933718
Macro Precision: 0.5018758087683344
Weighted Recall: 0.6097982708933718 


                            model text_representation lemmatized  accuracy
0              LogisticRegression              TF-IDF        yes  0.659366
1          DecisionTreeClassifier              TF-IDF        yes  0.489568
2                   MultinomialNB              TF-IDF        yes  0.366455
3  HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                             SVC              TF-IDF        yes  0.654409
5          RandomForestClassi

# Training on Unlemmatized data

In [14]:
df = pd.read_csv('../dataset/preprocessed/mbti_lemmatized.csv')
df

Unnamed: 0,type,posts
0,INFJ,intj moment sportscenter play pr...
1,ENTP,m find lack post alarming Sex boring s posit...
2,INTP,good course know s blessing ...
3,INTJ,Dear INTP enjoy conversation day esot...
4,ENTJ,fire s silly misconception approach logica...
...,...,...
8670,ISFP,think cat Fi dom reason website have...
8671,ENFP,thread exist someplace heck delete...
8672,INTP,question thing purple pill pick win lot...
8673,INFP,conflicted right come want child honestly...


## Logistic Regression

In [15]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index] \
        , df.type[train_index], df.type[test_index]
    model = LogisticRegression()
    score.append(train_test_evaluate_report(model,X_train, X_test, y_train, y_test))
    
summary = {
    "model": "LogisticRegression",
    "text_representation": "TF-IDF",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.6547550432276658
Macro Precision: 0.5981197743700568
Weighted Recall: 0.6547550432276658 


Accuracy: 0.6622478386167147
Macro Precision: 0.6009970929257646
Weighted Recall: 0.6622478386167147 


Accuracy: 0.6755043227665706
Macro Precision: 0.6117258857827558
Weighted Recall: 0.6755043227665706 


Accuracy: 0.6455331412103746
Macro Precision: 0.6147156932591632
Weighted Recall: 0.6455331412103746 


Accuracy: 0.6587896253602306
Macro Precision: 0.6284958370267169
Weighted Recall: 0.6587896253602306 


                            model text_representation lemmatized  accuracy
0              LogisticRegression              TF-IDF        yes  0.659366
1          DecisionTreeClassifier              TF-IDF        yes  0.489568
2                   MultinomialNB              TF-IDF        yes  0.366455
3  HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                             SVC              TF-IDF        yes  0.654409
5          RandomForestClassif

## Decision Tree Classifier

In [16]:
folds = StratifiedKFold(n_splits=5)
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts.iloc[train_index], df.posts.iloc[test_index], df.type.iloc[train_index], df.type.iloc[test_index]
    model = DecisionTreeClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))
summary = {
    "model": "DecisionTreeClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)


Accuracy: 0.4899135446685879
Macro Precision: 0.33108792896167166
Weighted Recall: 0.4899135446685879 


Accuracy: 0.506628242074928
Macro Precision: 0.3263070034057317
Weighted Recall: 0.506628242074928 


Accuracy: 0.49279538904899134
Macro Precision: 0.3218138087614454
Weighted Recall: 0.49279538904899134 


Accuracy: 0.484149855907781
Macro Precision: 0.32314247620251035
Weighted Recall: 0.484149855907781 


Accuracy: 0.47723342939481267
Macro Precision: 0.30582944935132184
Weighted Recall: 0.47723342939481267 


                            model text_representation lemmatized  accuracy
0              LogisticRegression              TF-IDF        yes  0.659366
1          DecisionTreeClassifier              TF-IDF        yes  0.489568
2                   MultinomialNB              TF-IDF        yes  0.366455
3  HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                             SVC              TF-IDF        yes  0.654409
5          RandomForestClas

## Multinomial Naive Bayes

In [17]:
folds = StratifiedKFold(n_splits=5)
report = pd.read_csv('./report.csv')
score = []
for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts.iloc[train_index], df.posts.iloc[test_index], df.type.iloc[train_index], df.type.iloc[test_index]
    model = MultinomialNB()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))
summary = {
    "model": "MultinomialNB",
    "text_representation": "TF-IDF",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}
report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)
print(report)

Accuracy: 0.3573487031700288
Macro Precision: 0.13964264676298493
Weighted Recall: 0.3573487031700288 


Accuracy: 0.3786743515850144
Macro Precision: 0.20897749140899896
Weighted Recall: 0.3786743515850144 


Accuracy: 0.36023054755043227
Macro Precision: 0.195397257270923
Weighted Recall: 0.36023054755043227 


Accuracy: 0.37175792507204614
Macro Precision: 0.1910233764021929
Weighted Recall: 0.37175792507204614 


Accuracy: 0.3642651296829971
Macro Precision: 0.20003786827487752
Weighted Recall: 0.3642651296829971 


                            model text_representation lemmatized  accuracy
0              LogisticRegression              TF-IDF        yes  0.659366
1          DecisionTreeClassifier              TF-IDF        yes  0.489568
2                   MultinomialNB              TF-IDF        yes  0.366455
3  HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                             SVC              TF-IDF        yes  0.654409
5          RandomForestC

## Histogram Based Gradiant Boost

In [18]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = HistGradientBoostingClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "HistGradientBoostingClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.130835734870317
Macro Precision: 0.07579658687181423
Weighted Recall: 0.130835734870317 


Accuracy: 0.21152737752161382
Macro Precision: 0.07571490467937608
Weighted Recall: 0.21152737752161382 


Accuracy: 0.17406340057636888
Macro Precision: 0.09714383287261802
Weighted Recall: 0.17406340057636888 


Accuracy: 0.13025936599423632
Macro Precision: 0.08607301672038314
Weighted Recall: 0.13025936599423632 


Accuracy: 0.11296829971181556
Macro Precision: 0.07593965859771379
Weighted Recall: 0.11296829971181556 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           R

## Support Vector Classifier

In [19]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = SVC()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "SVC",
    "text_representation": "TF-IDF",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.6432276657060518
Macro Precision: 0.5895845160796904
Weighted Recall: 0.6432276657060518 


Accuracy: 0.661671469740634
Macro Precision: 0.5982320322777801
Weighted Recall: 0.661671469740634 


Accuracy: 0.667435158501441
Macro Precision: 0.6074338852894714
Weighted Recall: 0.667435158501441 


Accuracy: 0.6495677233429394
Macro Precision: 0.6058370443145276
Weighted Recall: 0.6495677233429394 


Accuracy: 0.6501440922190201
Macro Precision: 0.5947021567901003
Weighted Recall: 0.6501440922190201 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           RandomForestClas

## Random Forest Classifier

In [20]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = RandomForestClassifier()
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "RandomForestClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "no",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.5193083573487032
Macro Precision: 0.37895351906865066
Weighted Recall: 0.5193083573487032 


Accuracy: 0.5337175792507205
Macro Precision: 0.43899791739280375
Weighted Recall: 0.5337175792507205 


Accuracy: 0.5429394812680115
Macro Precision: 0.43954081604185
Weighted Recall: 0.5429394812680115 


Accuracy: 0.5181556195965418
Macro Precision: 0.47177086914951827
Weighted Recall: 0.5181556195965418 


Accuracy: 0.49971181556195965
Macro Precision: 0.4470046424920896
Weighted Recall: 0.49971181556195965 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           RandomFor

## Gradient Boosting Classifier

In [21]:
folds = StratifiedKFold(n_splits=5)

report = pd.read_csv('./report.csv')

score = []

for train_index, test_index in folds.split(X=df.posts, y=df.type):
    X_train, X_test, y_train, y_test = df.posts[train_index], df.posts[test_index], df.type[train_index], df.type[test_index]
    
    model = GradientBoostingClassifier(n_estimators=9, warm_start=True)
    score.append(train_test_evaluate_report(model, X_train, X_test, y_train, y_test))

summary = {
    "model": "GradientBoostingClassifier",
    "text_representation": "TF-IDF",
    "lemmatized": "yes",
    "accuracy": sum(score) / len(score)
}

report.loc[len(report)] = summary
report.to_csv('./report.csv', index=False, header=True)

print(report)


Accuracy: 0.6178674351585014
Macro Precision: 0.5064601397085225
Weighted Recall: 0.6178674351585014 


Accuracy: 0.6213256484149856
Macro Precision: 0.4932625966459539
Weighted Recall: 0.6213256484149856 


Accuracy: 0.6184438040345821
Macro Precision: 0.4913710508567132
Weighted Recall: 0.6184438040345821 


Accuracy: 0.6
Macro Precision: 0.523723934787216
Weighted Recall: 0.6 


Accuracy: 0.6121037463976945
Macro Precision: 0.5122625675971274
Weighted Recall: 0.6121037463976945 


                             model text_representation lemmatized  accuracy
0               LogisticRegression              TF-IDF        yes  0.659366
1           DecisionTreeClassifier              TF-IDF        yes  0.489568
2                    MultinomialNB              TF-IDF        yes  0.366455
3   HistGradientBoostingClassifier              TF-IDF        yes  0.151931
4                              SVC              TF-IDF        yes  0.654409
5           RandomForestClassifier              TF-IDF 