# Myers-Briggs ML Project: Logistic Regression Model

## Imports & Dependencies

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-cleaned MBTI posts; the first CSV column becomes the row index
cleaned_csv_path = '../../Resources/data/cleaned_mbti.csv'
cleaned_df = pd.read_csv(cleaned_csv_path, index_col=0)
cleaned_df.head()

Unnamed: 0,type,text
0,INFJ,and intj moments sportscenter not top ten...
1,ENTP,i m finding the lack of me in these posts ver...
2,INTP,good one course to which i say i k...
3,INTJ,dear intp i enjoyed our conversation the o...
4,ENTJ,you re fired that s another silly misconcept...


## Split the type column into its four binary dimensions

In [3]:
# Split the four-letter type into one column per dimension (E/I, N/S, T/F, J/P)
split_df = cleaned_df[['type','text']].copy()

# A single pass with one capture group per letter. The previous version ran four
# separate patterns, passed a stray `1` that pandas interprets as the regex `flags`
# argument (not a group index), and used classes such as `[N,S]` that also match a
# literal comma. Rows whose type does not contain the four letters in order get NaN.
type_letters = split_df['type'].str.extract(r'([EI])([NS])([FT])([JP])', expand=True)
for group_index, column_name in enumerate(['E-I', 'N-S', 'T-F', 'J-P']):
    # extract() labels unnamed capture groups 0..3, in pattern order
    split_df[column_name] = type_letters[group_index]
split_df.head()

Unnamed: 0,type,text,E-I,N-S,T-F,J-P
0,INFJ,and intj moments sportscenter not top ten...,I,N,F,J
1,ENTP,i m finding the lack of me in these posts ver...,E,N,T,P
2,INTP,good one course to which i say i k...,I,N,T,P
3,INTJ,dear intp i enjoyed our conversation the o...,I,N,T,J
4,ENTJ,you re fired that s another silly misconcept...,E,N,T,J


In [4]:
# Turn each letter column into a 0/1 target column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

encoded_df = cleaned_df[['type','text']].copy()

# (letter column in split_df, encoded column name); the encoded names spell out
# which letter maps to 0 and which to 1, e.g. 'E0-I1' means E -> 0, I -> 1
letter_to_encoded = [
    ('E-I', 'E0-I1'),
    ('N-S', 'N0-S1'),
    ('T-F', 'F0-T1'),
    ('J-P', 'J0-P1'),
]
for letter_col, encoded_col in letter_to_encoded:
    # the encoder is re-fitted per column, so each column gets its own 0/1 mapping
    encoded_df[encoded_col] = le.fit_transform(split_df[letter_col])

encoded_df.head()

Unnamed: 0,type,text,E0-I1,N0-S1,F0-T1,J0-P1
0,INFJ,and intj moments sportscenter not top ten...,1,0,0,0
1,ENTP,i m finding the lack of me in these posts ver...,0,0,1,1
2,INTP,good one course to which i say i k...,1,0,1,1
3,INTJ,dear intp i enjoyed our conversation the o...,1,0,1,0
4,ENTJ,you re fired that s another silly misconcept...,0,0,1,0


In [5]:
# Sanity check: original type/text columns plus the four encoded target columns
encoded_df.columns

Index(['type', 'text', 'E0-I1', 'N0-S1', 'F0-T1', 'J0-P1'], dtype='object')

## TFIDF Vectorizer

In [5]:
from sklearn.model_selection import train_test_split

# Features are the raw post text; targets are the four encoded dimension columns
X = encoded_df["text"].values
y_all = encoded_df.drop(['type', 'text'], axis=1)

# One shared split (fixed seed, default proportions) so every dimension is
# trained and evaluated on the same rows
split_parts = train_test_split(X, y_all, random_state=42)
X_train, X_test, y_all_train, y_all_test = split_parts

In [6]:
# Define the TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_settings = {
    "max_features": 17000,      # cap on vocabulary size
    "min_df": 7,                # ignore terms seen in fewer than 7 posts
    "max_df": 0.8,              # ignore terms seen in more than 80% of posts
    "stop_words": "english",    # drop common English stop words
    "ngram_range": (1, 3),      # unigrams, bigrams and trigrams
}
vectorizer = TfidfVectorizer(**tfidf_settings)

  from collections import Mapping, defaultdict


In [7]:
# create vectors for X
# The vocabulary and IDF weights are fitted on the training posts only and then
# reused for the test posts, so no test-set statistics enter the features.
# NOTE: this rebinds X_train / X_test from raw text to TF-IDF matrices, so the
# cell is not idempotent - re-run the split cell before re-running this one.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Logistic Regression Model

In [8]:
# Build the logistic regression estimator; this single instance is re-fitted
# for each MBTI dimension in the cells that follow
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=1, solver='lbfgs')
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

## 1. No resampling

### E-I

In [10]:
# Pick the E/I target (0 = E, 1 = I) out of both splits, then train on it
y_EI_train, y_EI_test = (labels['E0-I1'] for labels in (y_all_train, y_all_test))

classifier.fit(X=X_train, y=y_EI_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Score the held-out posts with the model currently fitted on the E-I labels
y_EI_pred = classifier.predict(X_test)
# Side-by-side view of predicted vs. true labels, indexed like the test rows
EI_result = pd.DataFrame(dict(Prediction=y_EI_pred, Actual=y_EI_test))
EI_result.head()

Unnamed: 0,Prediction,Actual
2802,0,1
2166,1,1
1919,1,1
360,0,0
1115,1,0


### N-S

In [12]:
# Pick the N/S target (0 = N, 1 = S) out of both splits, then train on it
y_NS_train, y_NS_test = (labels['N0-S1'] for labels in (y_all_train, y_all_test))

classifier.fit(X=X_train, y=y_NS_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
# Score the held-out posts with the model currently fitted on the N-S labels
y_NS_pred = classifier.predict(X_test)
# Side-by-side view of predicted vs. true labels, indexed like the test rows
NS_result = pd.DataFrame(dict(Prediction=y_NS_pred, Actual=y_NS_test))
NS_result.head()

Unnamed: 0,Prediction,Actual
2802,0,0
2166,0,0
1919,0,0
360,0,0
1115,0,0


### F-T

In [14]:
# Pick the F/T target (0 = F, 1 = T) out of both splits, then train on it
y_FT_train, y_FT_test = (labels['F0-T1'] for labels in (y_all_train, y_all_test))

classifier.fit(X=X_train, y=y_FT_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
# Score the held-out posts with the model currently fitted on the F-T labels
y_FT_pred = classifier.predict(X_test)
# Side-by-side view of predicted vs. true labels, indexed like the test rows
FT_result = pd.DataFrame(dict(Prediction=y_FT_pred, Actual=y_FT_test))
FT_result.head()

Unnamed: 0,Prediction,Actual
2802,1,1
2166,0,1
1919,1,1
360,0,0
1115,1,1


### J-P

In [16]:
# Pick the J/P target (0 = J, 1 = P) out of both splits, then train on it
y_JP_train, y_JP_test = (labels['J0-P1'] for labels in (y_all_train, y_all_test))

classifier.fit(X=X_train, y=y_JP_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
# Score the held-out posts with the model currently fitted on the J-P labels
y_JP_pred = classifier.predict(X_test)
# Side-by-side view of predicted vs. true labels, indexed like the test rows
JP_result = pd.DataFrame(dict(Prediction=y_JP_pred, Actual=y_JP_test))
JP_result.head()

Unnamed: 0,Prediction,Actual
2802,1,1
2166,0,0
1919,1,1
360,0,1
1115,0,0


### Model Validation

In [18]:
# Report test-set accuracy for each dimension (models trained without resampling)
from sklearn.metrics import accuracy_score

for pair_name, actual, predicted in (
    ("E-I", y_EI_test, y_EI_pred),
    ("N-S", y_NS_test, y_NS_pred),
    ("F-T", y_FT_test, y_FT_pred),
    ("J-P", y_JP_test, y_JP_pred),
):
    print(f" Logistic regression model accuracy for {pair_name}: {accuracy_score(actual, predicted):.3f}")

 Logistic regression model accuracy for E-I: 0.846
 Logistic regression model accuracy for N-S: 0.871
 Logistic regression model accuracy for F-T: 0.848
 Logistic regression model accuracy for J-P: 0.799


In [36]:
# Check out the classification report (precision / recall / F1 per class) for each
# dimension of the models trained WITHOUT resampling.
# accuracy_score is imported here as well so the cell does not depend on the
# previous cell having been run.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

report_EI = classification_report(y_EI_test, y_EI_pred)
print(f"Classification report for E0-I1 group:")
print(report_EI)
print(f"Accuracy score: {accuracy_score(y_EI_test, y_EI_pred):.3f}")
print("--------------------------")

report_NS = classification_report(y_NS_test, y_NS_pred)
print(f"Classification report for N0-S1 group:")
print(report_NS)
print(f"Accuracy score: {accuracy_score(y_NS_test, y_NS_pred):.3f}")
print("--------------------------")

report_FT = classification_report(y_FT_test, y_FT_pred)
print(f"Classification report for F0-T1 group:")
print(report_FT)
print(f"Accuracy score: {accuracy_score(y_FT_test, y_FT_pred):.3f}")
print("--------------------------")

report_JP = classification_report(y_JP_test, y_JP_pred)
print(f"Classification report for J0-P1 group:")
print(report_JP)
# Score the un-resampled J-P predictions. This line previously used
# `y_JP_pred_ros`, which belongs to the oversampling section: it reported the
# wrong model's accuracy and is undefined at this point on a fresh top-to-bottom run.
print(f"Accuracy score: {accuracy_score(y_JP_test, y_JP_pred):.3f}")

Classification report for E0-I1 group:
             precision    recall  f1-score   support

          0       0.83      0.37      0.51       473
          1       0.85      0.98      0.91      1696

avg / total       0.84      0.85      0.82      2169

Accuracy score: 0.846
--------------------------
Classification report for N0-S1 group:
             precision    recall  f1-score   support

          0       0.87      1.00      0.93      1862
          1       0.80      0.12      0.21       307

avg / total       0.86      0.87      0.83      2169

Accuracy score: 0.871
--------------------------
Classification report for F0-T1 group:
             precision    recall  f1-score   support

          0       0.86      0.86      0.86      1179
          1       0.83      0.83      0.83       990

avg / total       0.85      0.85      0.85      2169

Accuracy score: 0.848
--------------------------
Classification report for J0-P1 group:
             precision    recall  f1-score   support

## 2. Random Oversampling

### E-I

In [9]:
# Implement random oversampling: duplicate minority-class rows of the training
# set until both classes are equally frequent (the test set is left untouched).
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)

y_EI_train = y_all_train['E0-I1']
y_EI_test = y_all_test['E0-I1']

# `fit_resample` is the supported method name; `fit_sample` was deprecated in
# imbalanced-learn 0.4 and removed in later releases.
X_resampled_ros, y_EI_resampled_ros = ros.fit_resample(X_train, y_EI_train)

# Class counts after resampling - both classes should now be the same size
Counter(y_EI_resampled_ros)

Counter({0: 4980, 1: 4980})

In [10]:
# Train on the class-balanced (oversampled) E-I training set
classifier.fit(X=X_resampled_ros, y=y_EI_resampled_ros)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Score the untouched test posts with the E-I model trained on oversampled data
y_EI_pred_ros = classifier.predict(X_test)
# Predicted vs. true labels, indexed like the test rows (not displayed here)
EI_result = pd.DataFrame(dict(Prediction=y_EI_pred_ros, Actual=y_EI_test))

In [None]:
# import pickle
import pickle

In [12]:
# Save the fitted E-I model.
# The context manager closes the file handle even if pickling fails; the bare
# open(...) used before was never closed.
# NOTE(review): only the classifier is written here - confirm the fitted
# `vectorizer` is persisted elsewhere, since the model needs the same vocabulary.
with open('../../Resources/model_weights/model_EI.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

### N-S

In [23]:
# Resample the N-S training labels so both classes are equally frequent
y_NS_train = y_all_train['N0-S1']
y_NS_test = y_all_test['N0-S1']

# `fit_resample` is the supported method name; `fit_sample` was deprecated in
# imbalanced-learn 0.4 and removed in later releases.
X_resampled_ros, y_NS_resampled_ros = ros.fit_resample(X_train, y_NS_train)

# Class counts after resampling
Counter(y_NS_resampled_ros)

Counter({0: 5616, 1: 5616})

In [24]:
# Train on the class-balanced (oversampled) N-S training set
classifier.fit(X=X_resampled_ros, y=y_NS_resampled_ros)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# Score the untouched test posts with the N-S model trained on oversampled data
y_NS_pred_ros = classifier.predict(X_test)
# Predicted vs. true labels, indexed like the test rows
NS_result = pd.DataFrame(dict(Prediction=y_NS_pred_ros, Actual=y_NS_test))
NS_result.head()

Unnamed: 0,Prediction,Actual
2802,0,0
2166,0,0
1919,0,0
360,0,0
1115,0,0


In [None]:
# Save the fitted N-S model.
# The context manager closes the file handle even if pickling fails; the bare
# open(...) used before was never closed.
with open('../../Resources/model_weights/model_NS.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

### F-T

In [26]:
# Resample the F-T training labels so both classes are equally frequent
y_FT_train = y_all_train['F0-T1']
y_FT_test = y_all_test['F0-T1']

# `fit_resample` is the supported method name; `fit_sample` was deprecated in
# imbalanced-learn 0.4 and removed in later releases.
X_resampled_ros, y_FT_resampled_ros = ros.fit_resample(X_train, y_FT_train)

# Class counts after resampling
Counter(y_FT_resampled_ros)

Counter({0: 3515, 1: 3515})

In [27]:
# Train on the class-balanced (oversampled) F-T training set
classifier.fit(X=X_resampled_ros, y=y_FT_resampled_ros)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Score the untouched test posts with the F-T model trained on oversampled data
y_FT_pred_ros = classifier.predict(X_test)
# Predicted vs. true labels, indexed like the test rows
FT_result = pd.DataFrame(dict(Prediction=y_FT_pred_ros, Actual=y_FT_test))
FT_result.head()

Unnamed: 0,Prediction,Actual
2802,1,1
2166,0,1
1919,1,1
360,0,0
1115,1,1


In [None]:
# Save the fitted F-T model.
# The context manager closes the file handle even if pickling fails; the bare
# open(...) used before was never closed.
with open('../../Resources/model_weights/model_FT.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

### J-P

In [29]:
# Resample the J-P training labels so both classes are equally frequent
y_JP_train = y_all_train['J0-P1']
y_JP_test = y_all_test['J0-P1']

# `fit_resample` is the supported method name; `fit_sample` was deprecated in
# imbalanced-learn 0.4 and removed in later releases.
X_resampled_ros, y_JP_resampled_ros = ros.fit_resample(X_train, y_JP_train)

# Class counts after resampling
Counter(y_JP_resampled_ros)

Counter({1: 3914, 0: 3914})

In [30]:
# Train on the class-balanced (oversampled) J-P training set
classifier.fit(X=X_resampled_ros, y=y_JP_resampled_ros)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
# Score the untouched test posts with the J-P model trained on oversampled data
y_JP_pred_ros = classifier.predict(X_test)
# Predicted vs. true labels, indexed like the test rows
JP_result = pd.DataFrame(dict(Prediction=y_JP_pred_ros, Actual=y_JP_test))
JP_result.head()

Unnamed: 0,Prediction,Actual
2802,1,1
2166,0,0
1919,1,1
360,0,1
1115,0,0


In [None]:
# Save the fitted J-P model.
# The context manager closes the file handle even if pickling fails; the bare
# open(...) used before was never closed.
with open('../../Resources/model_weights/model_JP.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

### Model Validation

In [33]:
# Report test-set accuracy for each dimension (models trained on oversampled data)
from sklearn.metrics import accuracy_score

for pair_name, actual, predicted in (
    ("E-I", y_EI_test, y_EI_pred_ros),
    ("N-S", y_NS_test, y_NS_pred_ros),
    ("F-T", y_FT_test, y_FT_pred_ros),
    ("J-P", y_JP_test, y_JP_pred_ros),
):
    print(f" Logistic regression model accuracy for {pair_name}: {accuracy_score(actual, predicted):.3f}")

 Logistic regression model accuracy for E-I: 0.857
 Logistic regression model accuracy for N-S: 0.903
 Logistic regression model accuracy for F-T: 0.846
 Logistic regression model accuracy for J-P: 0.802


In [35]:
# Per-class precision / recall / F1 for the models trained on oversampled data.
# All four reports are built first, then printed in E-I, N-S, F-T, J-P order.
report_EI = classification_report(y_EI_test, y_EI_pred_ros)
report_NS = classification_report(y_NS_test, y_NS_pred_ros)
report_FT = classification_report(y_FT_test, y_FT_pred_ros)
report_JP = classification_report(y_JP_test, y_JP_pred_ros)

ros_summaries = [
    ("E0-I1", report_EI, y_EI_test, y_EI_pred_ros),
    ("N0-S1", report_NS, y_NS_test, y_NS_pred_ros),
    ("F0-T1", report_FT, y_FT_test, y_FT_pred_ros),
    ("J0-P1", report_JP, y_JP_test, y_JP_pred_ros),
]
for position, (group_name, report_text, actual, predicted) in enumerate(ros_summaries):
    if position > 0:
        # divider between consecutive groups (none after the last one)
        print("--------------------------")
    print(f"Classification report for {group_name} group:")
    print(report_text)
    print(f"Accuracy score: {accuracy_score(actual, predicted):.3f}")

Classification report for E0-I1 group:
             precision    recall  f1-score   support

          0       0.67      0.68      0.68       473
          1       0.91      0.91      0.91      1696

avg / total       0.86      0.86      0.86      2169

Accuracy score: 0.857
--------------------------
Classification report for N0-S1 group:
             precision    recall  f1-score   support

          0       0.95      0.94      0.94      1862
          1       0.65      0.68      0.66       307

avg / total       0.90      0.90      0.90      2169

Accuracy score: 0.903
--------------------------
Classification report for F0-T1 group:
             precision    recall  f1-score   support

          0       0.88      0.84      0.86      1179
          1       0.81      0.86      0.84       990

avg / total       0.85      0.85      0.85      2169

Accuracy score: 0.846
--------------------------
Classification report for J0-P1 group:
             precision    recall  f1-score   support