In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC,LinearSVC
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import plotly.express as px
import warnings

# NOTE(review): this silences every warning category for all code that runs
# after this cell, including deprecation and pandas/scikit-learn warnings that
# may point at real problems. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')



In [3]:
# Load the MBTI dataset from CSV; later cells read its `type` and `posts` columns.
mbti_csv_path = './data/csv/mbti_1.csv'
datas = pd.read_csv(mbti_csv_path)

In [4]:
# remove text noise
def clear_text(data):
    """Normalise the posts of ``data`` before vectorisation.

    Each post is lower-cased, URLs (``http://``, ``https://`` or ``www.``
    prefixed) are replaced by a single space, and every remaining character
    that is not ``0-9`` or ``a-z`` is replaced by a space.

    Parameters
    ----------
    data : object
        Anything exposing a ``posts`` attribute that iterates over ``str``
        (for example a DataFrame with a ``posts`` column).

    Returns
    -------
    tuple[list[str], list[int]]
        The cleaned posts, and the number of whitespace-separated tokens in
        each cleaned post, both in input order.
    """
    # Raw strings: '\s' and '\.' inside a normal string literal are invalid
    # escape sequences, which newer Python versions warn about.
    # Compiled once here instead of being re-parsed for every post.
    url_pattern = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    symbol_pattern = re.compile(r'[^0-9a-z]')

    cleaned_text = []
    data_length = []
    for sentence in tqdm(data.posts):
        sentence = sentence.lower()

        # removing links from text data
        sentence = url_pattern.sub(' ', sentence)

        # removing other symbols (anything that is not a digit or a-z)
        sentence = symbol_pattern.sub(' ', sentence)

        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)

    return cleaned_text, data_length

In [5]:
# Stratified 80/20 split on the MBTI type, with a fixed seed so the split is reproducible.
train_data, test_data = train_test_split(datas, test_size=0.2, random_state=42, stratify=datas.type)

# Clean each split, then attach the cleaned text with .assign(), which returns a
# new frame. Attribute-style assignment (`frame.posts = ...`) on the frames
# returned by the split is the pattern pandas flags with SettingWithCopyWarning.
train_posts_clean, train_length = clear_text(train_data)
test_posts_clean, test_length = clear_text(test_data)
train_data = train_data.assign(posts=train_posts_clean)
test_data = test_data.assign(posts=test_posts_clean)

100%|██████████| 6940/6940 [00:02<00:00, 2515.25it/s]
100%|██████████| 1735/1735 [00:00<00:00, 2298.37it/s]


In [6]:
# TF-IDF features: vocabulary capped at 5000 features, English stop words removed.
# The vocabulary is learned from the training posts only and then applied to both splits.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
vectorizer.fit(train_data.posts)

train_post = vectorizer.transform(train_data.posts).toarray()
test_post = vectorizer.transform(test_data.posts).toarray()

# Fit the label encoder on the training labels only. The test labels must reuse
# that mapping via transform(); refitting on the test split could assign different
# integer codes whenever the two splits do not contain the same set of types.
target_encoder = LabelEncoder()
train_target = target_encoder.fit_transform(train_data.type)
test_target = target_encoder.transform(test_data.type)

In [7]:
# One-vs-rest multiclass wrapper built around an SVC with default parameters.
estimator = SVC()
model_one_vs_rest = OneVsRestClassifier(estimator=estimator)

In [8]:
# Train the one-vs-rest SVC on the TF-IDF training matrix and the encoded training labels.
model_one_vs_rest.fit(X=train_post, y=train_target)

OneVsRestClassifier(estimator=SVC())

In [9]:
# Evaluate on the held-out split. The class names come straight from the fitted
# encoder instead of a hard-coded range(16), so the report stays correct if the
# number of encoded types ever differs.
test_pred = model_one_vs_rest.predict(test_post)
print('test classification report \n ', classification_report(test_target, test_pred, target_names=target_encoder.classes_))

test classification report 
                precision    recall  f1-score   support

        ENFJ       0.69      0.24      0.35        38
        ENFP       0.73      0.59      0.65       135
        ENTJ       0.65      0.37      0.47        46
        ENTP       0.60      0.56      0.58       137
        ESFJ       1.00      0.33      0.50         9
        ESFP       0.00      0.00      0.00        10
        ESTJ       1.00      0.12      0.22         8
        ESTP       0.60      0.33      0.43        18
        INFJ       0.70      0.72      0.71       294
        INFP       0.64      0.86      0.73       366
        INTJ       0.64      0.65      0.64       218
        INTP       0.70      0.82      0.76       261
        ISFJ       0.64      0.42      0.51        33
        ISFP       0.81      0.41      0.54        54
        ISTJ       0.73      0.27      0.39        41
        ISTP       0.69      0.63      0.66        67

    accuracy                           0.67      1

In [10]:
from copy import copy  # no longer used in this cell; kept in case other cells rely on the name
from sklearn.base import clone

# Create a separate estimator for the "tuning" experiment.
# clone() returns a new, unfitted estimator with the same parameters. A shallow
# copy() of the fitted model would share its fitted sub-estimators with the
# original, and that fitted state is replaced anyway when fit() is called on
# the copy in the next cell.
model_one_vs_rest_tuning = clone(model_one_vs_rest)
print(id(model_one_vs_rest), id(model_one_vs_rest_tuning))

2359885482496 2359902745408


In [11]:
# Fine-tuning
# NOTE(review): fit() retrains the estimator from scratch on these five samples
# only; nothing learned from the original training data is kept. If true
# fine-tuning is intended, the tuning samples need to be combined with the
# original training data (or an incrementally trainable model used).
# NOTE(review): unlike the training posts, these texts are not passed through
# clear_text() before vectorisation.
tuning_sources = [
    ("ENFP", "./data/text/lookback/momoka_en.txt"),
    ("ESFJ", "./data/text/lookback/issei_en.txt"),
    ("ESFP", "./data/text/lookback/kai_en.txt"),
    ("ENFP", "./data/text/lookback/keito_en.txt"),
    ("ISFP", "./data/text/lookback/katuki_en.txt"),
]

# Read each reflection sheet inside a `with` block so the file handle is closed.
tuning_rows = []
for mbti_type, path in tuning_sources:
    with open(path, "r", encoding='UTF-8') as f:
        tuning_rows.append([mbti_type, f.read()])

tuning_datas = pd.DataFrame(data=tuning_rows, columns=["type", "posts"])
tuning_posts = vectorizer.transform(tuning_datas.posts).toarray()
tuning_target = target_encoder.transform(tuning_datas.type)
model_one_vs_rest_tuning.fit(tuning_posts, tuning_target)

OneVsRestClassifier(estimator=SVC())

In [12]:
def getMBTI(words):
    """Print the MBTI type predicted for ``words`` by both models.

    ``words`` is vectorised with the module-level ``vectorizer`` and passed to
    ``model_one_vs_rest`` and ``model_one_vs_rest_tuning``. For each model one
    line is printed containing the raw prediction array and the label decoded
    by ``target_encoder``.

    Parameters
    ----------
    words : str
        The text to classify.

    Returns
    -------
    None
    """
    words_vec = vectorizer.transform([words]).toarray()

    # Predict once per model and reuse the result for both the encoded label
    # and its decoded name (the prediction used to be computed twice per model).
    # Printed labels: "チューニングなし" = without tuning, "チューニングあり" = with tuning.
    base_pred = model_one_vs_rest.predict(words_vec)
    print("チューニングなし: ", base_pred, target_encoder.inverse_transform(base_pred)[0])

    tuned_pred = model_one_vs_rest_tuning.predict(words_vec)
    print("チューニングあり: ", tuned_pred, target_encoder.inverse_transform(tuned_pred)[0])

In [13]:
# momoka ENFP
# session10 5min
# Printed headings: "発話データ" = utterance data, "振り返りシート" = reflection sheet.
print("session10 発話データ")
getMBTI("Certainly, with today's technology, I think you don't have to go and see them anymore to see them in a great way, but then I thought, why did you bring them here at the zoo? But then, why did you come to the zoo? Plus, the zoo fees are fairly inexpensive, but it would probably cost a fair amount of money to go and see the animals in person, or to use the equipment. I thought that the zoo was created to make it easy to see the animals. What do you think? I don't know. I don't know. Yes, I think that the loss of the wild function would take away the good parts of the animal. Nowadays, people talk about the importance of individuality, so I wonder if that is part of it.")
print("session1~10 振り返りシート")
# Read the reflection sheet inside a `with` block so the file handle is closed.
with open("./data/text/lookback/momoka_en.txt", "r", encoding='UTF-8') as f:
    getMBTI(f.read())

session10 発話データ
チューニングなし:  [11] INTP
チューニングあり:  [1] ENFP
session1~10 振り返りシート
チューニングなし:  [10] INTJ
チューニングあり:  [1] ENFP


In [14]:
# issei ESFJ session10
# Printed headings: "発話データ" = utterance data, "振り返りシート" = reflection sheet.
print("session10 発話データ")
getMBTI("Yes, for three weeks, but the reason for this is that I think zoos also have the role of protecting animals, and animals that cannot live on their own due to the harsh environment, global warming, deforestation, etc., have a difficult time living in zoos. I wondered which of the two was more necessary. I think it is true that humans are responsible for the destruction of the environment, but I also believe that it is because of the actions of the next generation of humans that we are able to protect the environment.")
print("session1~10 振り返りシート")
# Read the reflection sheet inside a `with` block so the file handle is closed.
with open("./data/text/lookback/issei_en.txt", "r", encoding='UTF-8') as f:
    getMBTI(f.read())

session10 発話データ
チューニングなし:  [11] INTP
チューニングあり:  [1] ENFP
session1~10 振り返りシート
チューニングなし:  [10] INTJ
チューニングあり:  [4] ESFJ


In [15]:
# kai ESFP session10
# Printed headings: "発話データ" = utterance data, "振り返りシート" = reflection sheet.
print("session10 発話データ")
getMBTI("Yes, I am. I would like to hear the opinions of those who are in favor of the zoo first. I see. Thank you very much. Do you have any questions for the main section? I see. Thank you very much. I think that one of the opinions of the proponents is the protection of animals, and the other is that people will be interested in animals when they see them. Then I would like to move on to 3, the opponents' opinions. I see. Thank you very much. I would like to speak next, but I think my opinion would be a bit like a rebuttal to your opinion, but nowadays, various technologies have developed. In the past, there was little such technology, and many people had no choice but to go and see the animals in person, or there was nothing to see even if they took pictures of the animals. I think the value of zoos was high as a place to learn about things, but nowadays there are a lot of resources available, and I think it's fine not to go to the trouble of putting animals in a zoo and observing them, but it's easy to do so. I think it is better to casually take animals that are living in the wild and view them in that way than to put them in a world that is like a prison, and I think it is better for me to study them. So, you know, one thing I disagree with is your reaction. I also responded to Momoka's opinion, saying that there must be some way to learn about animals or to get people interested in them, even if it is not in an individual zoo. I would like to ask you, Mr. Ishii, if you have any objections or even sympathies for Mr. Keita's opinion, that you heard earlier, that would be totally fine. Do you have any?")
print("session1~10 振り返りシート")
# Read the reflection sheet inside a `with` block so the file handle is closed.
with open("./data/text/lookback/kai_en.txt", "r", encoding='UTF-8') as f:
    getMBTI(f.read())

session10 発話データ
チューニングなし:  [9] INFP
チューニングあり:  [1] ENFP
session1~10 振り返りシート
チューニングなし:  [10] INTJ
チューニングあり:  [5] ESFP


In [16]:
# keito ENFP session10
# Printed headings: "発話データ" = utterance data, "振り返りシート" = reflection sheet.
print("session10 発話データ")
getMBTI("The opponents' point of view is protection, but it was humans who created the conditions that made it necessary to reinforce the situation in the first place, and it was not humans who experienced all the environmental destruction and global warming. For example, in the case of global warming, it was exhaust emissions, deforestation, and wages for the land needed to build towns and cities, but the animals could no longer live there, so we said we would build zoos and protect them. But only a small percentage of the animals can be protected, and all the animals living in the logged area cannot be sent to the zoo, so they are driven away. So, I think that what the zoo has created is only English for humans, and that's why I have this opinion. It's a different opposing opinion. Yeah.")
print("session1~10 振り返りシート")
# Read the reflection sheet inside a `with` block so the file handle is closed.
with open("./data/text/lookback/keito_en.txt", "r", encoding='UTF-8') as f:
    getMBTI(f.read())

session10 発話データ
チューニングなし:  [11] INTP
チューニングあり:  [1] ENFP
session1~10 振り返りシート
チューニングなし:  [10] INTJ
チューニングあり:  [1] ENFP


In [17]:
# katuki ISFP
# Printed heading: "振り返りシート" = reflection sheet.
print("session1~10 振り返りシート")
# Read the reflection sheet inside a `with` block so the file handle is closed.
with open("./data/text/lookback/katuki_en.txt", "r", encoding='UTF-8') as f:
    getMBTI(f.read())

session1~10 振り返りシート
チューニングなし:  [10] INTJ
チューニングあり:  [13] ISFP
