In [1]:
import joblib
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

# Fine-Tuning the Model and Exporting It

In [2]:
# Load the preprocessed dataset (the file name suggests lemmatized MBTI posts).
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv('../dataset/preprocessed/mbti_lemmatized.csv')
# Display the frame; later cells read its `posts` and `type` columns.
df

Unnamed: 0,type,posts
0,INFJ,intj moment sportscenter play pr...
1,ENTP,m find lack post alarming Sex boring s posit...
2,INTP,good course know s blessing ...
3,INTJ,Dear INTP enjoy conversation day esot...
4,ENTJ,fire s silly misconception approach logica...
...,...,...
8670,ISFP,think cat Fi dom reason website have...
8671,ENFP,thread exist someplace heck delete...
8672,INTP,question thing purple pill pick win lot...
8673,INFP,conflicted right come want child honestly...


In [15]:
# Module-level encoder shared with later cells, which use it to decode predictions.
encoder = LabelEncoder()

def train_test_evaluate_report(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on label-encoded targets, print a per-class report, return accuracy.

    Side effects:
        * Refits the module-level ``encoder`` on ``y_train``.
        * Refits ``model`` in place on the encoded (integer) targets, so the
          model's later predictions are integer class codes that must be
          decoded with ``encoder.inverse_transform``.
        * Overwrites ``label_encoder.pkl`` in the current working directory.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``score``.
        X_train: Training inputs passed to ``model.fit``.
        X_test: Held-out inputs used for the report and the score.
        y_train: Training labels; used to fit the encoder.
        y_test: Held-out labels; must only contain labels present in ``y_train``.

    Returns:
        The value of ``model.score(X_test, y_test)`` computed on encoded labels.

    Raises:
        ValueError: From ``encoder.transform`` if ``y_test`` contains a label
            that was not seen in ``y_train``.
    """
    # Encode target
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)

    # Train model
    model.fit(X_train, y_train)

    # Persist the encoder that matches the model that was just fitted.
    joblib.dump(encoder, 'label_encoder.pkl')

    # Derive the label set from the fitted encoder instead of assuming 16
    # classes, and pass it explicitly so the report keeps one row per known
    # class even when a class is missing from this particular test split.
    class_ids = list(range(len(encoder.classes_)))
    report = classification_report(
        y_test,
        model.predict(X_test),
        labels=class_ids,
        target_names=encoder.classes_,
        zero_division=0,
    )
    print(report)

    return model.score(X_test, y_test)

# Define the pipeline
def create_pipeline():
    """Return a fresh, unfitted TF-IDF -> logistic-regression pipeline.

    The step names ('vectorizer', 'classifier') are the prefixes used by the
    ``classifier__*`` keys of the grid-search parameter grids.
    """
    steps = [
        ('vectorizer', TfidfVectorizer(max_features=5000)),
        ('classifier', LogisticRegression()),
    ]
    return Pipeline(steps)

## Logistic Regression

In [None]:
# Candidate hyper-parameters for the 'classifier' step of the pipeline:
# 3 values of C x 2 solvers x 4 iteration caps = 24 combinations.
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['lbfgs', 'liblinear'],
    'classifier__max_iter': [1000, 2000, 5000, 10000],
}

# One splitter drives both the outer evaluation loop and the inner grid search.
folds = StratifiedKFold(n_splits=5)
# NOTE(review): `report` is loaded here but not used anywhere in this cell.
report = pd.read_csv('./report.csv')
score = []

for train_index, test_index in folds.split(df['posts'], df['type']):
    # Slice the posts and labels for the current outer fold.
    X_train = df['posts'][train_index]
    X_test = df['posts'][test_index]
    y_train = df['type'][train_index]
    y_test = df['type'][test_index]

    # Fresh, unfitted pipeline for this fold.
    pipeline = create_pipeline()

    # Inner search: cross-validates every combination on the training split only.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_train)

    # Report the combination that won on this fold.
    print(f"Best parameters: {grid_search.best_params_}")

    # `best_model` is the same object as `grid_search.best_estimator_`; the
    # helper below refits it in place on label-encoded targets, so afterwards
    # it predicts integer class codes.
    best_model = grid_search.best_estimator_
    score.append(
        train_test_evaluate_report(best_model, X_train, X_test, y_train, y_test)
    )

# Export the pipeline left over from the final fold (fitted on that fold's
# training split only).
joblib.dump(grid_search.best_estimator_, 'best_model_pipeline.pkl')

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=lbfgs; total time=  12.5s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=lbfgs; total time=  12.4s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=lbfgs; total time=  14.1s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=lbfgs; total time=  14.7s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=liblinear; total time=  11.9s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=lbfgs; total time=  15.0s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=liblinear; total time=  15.0s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=liblinear; total time=  16.1s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=liblinear; total time=  13.2s
[CV] END classifier__C=0.1, classifier__max_iter=1000, classifier__solver=liblinear;

['best_model_pipeline.pkl']

In [8]:
# Exploratory inspection: list every attribute of the tuned pipeline object.
# NOTE(review): debugging leftover with a very long output; consider removing.
dir(grid_search.best_estimator_)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_is_fitted__',
 '__sklearn_tags__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_can_fit_transform',
 '_can_inverse_transform',
 '_can_transform',
 '_check_feature_names',
 '_check_method_params',
 '_check_n_features',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_final_estimator',
 '_fit',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_for_step',
 '_get_metadata_request',
 '_get_param_names',
 '_

In [9]:
# Best cross-validation score of whichever grid search ran last; the loop
# rebinds `grid_search` on every outer fold, so this is not an overall score.
grid_search.best_score_

0.6557636887608069

In [13]:
# Sanity-check prediction on one sentence. The result is an integer class code,
# not an MBTI string, because train_test_evaluate_report refit this estimator
# on label-encoded targets; decode it with `encoder.inverse_transform`.
grid_search.best_estimator_.predict(["I feel so tired today"])

array([9])

In [14]:
# Winning hyper-parameter combination of the grid search that ran last.
grid_search.best_params_

{'classifier__C': 10,
 'classifier__max_iter': 1000,
 'classifier__solver': 'liblinear'}

In [16]:
# Single-candidate grid: the search below only cross-validates this one
# combination (C=10, liblinear, max_iter=1000).
param_grid = {
    'classifier__C': [10],
    'classifier__solver': ['liblinear'],
    'classifier__max_iter': [1000],
}

# One splitter drives both the outer evaluation loop and the inner grid search.
folds = StratifiedKFold(n_splits=5)
# NOTE(review): `report` is loaded here but not used anywhere in this cell.
report = pd.read_csv('./report.csv')
score = []

for train_index, test_index in folds.split(df['posts'], df['type']):
    # Slice the posts and labels for the current outer fold.
    X_train = df['posts'][train_index]
    X_test = df['posts'][test_index]
    y_train = df['type'][train_index]
    y_test = df['type'][test_index]

    # Fresh, unfitted pipeline for this fold.
    pipeline = create_pipeline()

    # Inner search over the one-element grid, fitted on the training split only.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=folds,
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_train)

    # Report the combination that won on this fold.
    print(f"Best parameters: {grid_search.best_params_}")

    # `best_model` is the same object as `grid_search.best_estimator_`; the
    # helper below refits it in place on label-encoded targets, so afterwards
    # it predicts integer class codes.
    best_model = grid_search.best_estimator_
    score.append(
        train_test_evaluate_report(best_model, X_train, X_test, y_train, y_test)
    )

# Export the pipeline left over from the final fold (fitted on that fold's
# training split only).
joblib.dump(grid_search.best_estimator_, 'best_model_pipeline.pkl')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END classifier__C=10, classifier__max_iter=1000, classifier__solver=liblinear; total time=  25.2s
[CV] END classifier__C=10, classifier__max_iter=1000, classifier__solver=liblinear; total time=  25.4s
[CV] END classifier__C=10, classifier__max_iter=1000, classifier__solver=liblinear; total time=  25.7s
[CV] END classifier__C=10, classifier__max_iter=1000, classifier__solver=liblinear; total time=  26.2s
[CV] END classifier__C=10, classifier__max_iter=1000, classifier__solver=liblinear; total time=  10.5s
Best parameters: {'classifier__C': 10, 'classifier__max_iter': 1000, 'classifier__solver': 'liblinear'}
              precision    recall  f1-score   support

        ENFJ       0.79      0.29      0.42        38
        ENFP       0.66      0.60      0.63       135
        ENTJ       0.79      0.40      0.54        47
        ENTP       0.62      0.62      0.62       137
        ESFJ       1.00      0.12      0.22        

['best_model_pipeline.pkl']

In [17]:
# Predict the integer class code for one sentence with the tuned pipeline.
results = grid_search.best_estimator_.predict(["I feel so tired today"])
# Map the code back to its original label with the module-level encoder
# (last fitted inside train_test_evaluate_report).
decoded = encoder.inverse_transform(results)
decoded

array(['INFP'], dtype=object)

In [22]:
# Class-probability row for one sentence, printed next to each decoded label.
results = grid_search.best_estimator_.predict_proba(["I feel so tired today"])

# predict_proba columns follow the fitted estimator's `classes_` (the encoded
# class codes), so iterate over those instead of assuming exactly 16 columns.
for index, probability in zip(grid_search.best_estimator_.classes_, results[0]):
    print(encoder.inverse_transform([index]), probability)

['ENFJ'] 0.006501841384795359
['ENFP'] 0.0033455879785575537
['ENTJ'] 0.018341351479867502
['ENTP'] 0.004449597256039055
['ESFJ'] 0.0030376229343223402
['ESFP'] 0.012279221585312524
['ESTJ'] 0.004692909357345497
['ESTP'] 0.0014148845446219699
['INFJ'] 0.1280410616863811
['INFP'] 0.7576781540703091
['INTJ'] 0.00846308543954056
['INTP'] 0.025556921120510674
['ISFJ'] 0.015766500823655796
['ISFP'] 0.005553521354262769
['ISTJ'] 0.003371218206668025
['ISTP'] 0.001506520777809998
