In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Set the working directory
os.chdir(r'C:\Users\Ryo\OneDrive\Desktop\Master Thesis\master_thesis\study1')

pd.set_option('display.max_columns', None)

In [3]:
version = 'exactDist' # 'original' or 'exactDist'

In [4]:
if version == 'exactDist':
    gpt35_result = pd.read_excel('raw/result_GPT35_exactDist.xlsx')
    gpt4o_result = pd.read_excel('raw/result_GPT4o_exactDist.xlsx')
    claude_result = pd.read_excel('raw/result_Claude_exactDist.xlsx')
else:
    gpt4o_result = pd.read_excel('raw/result_GPT4o.xlsx')
    claude_result = pd.read_excel('raw/result_Claude.xlsx')
    gpt35_result = pd.read_excel('raw/result_GPT35.xlsx')
    gpt4_result = pd.read_excel('raw/result_GPT4.xlsx')
    gemini_result = pd.read_excel('raw/result_Gemini.xlsx')
    gemini_old_result = pd.read_excel('raw/result_Gemini (proto).xlsx')



In [5]:
# dial the target models to True
run_flag = {'gpt35'       : True,
            'gpt4'        : False,
            'gpt4o'       : True,
            'gemini_old'  : False,
            'gemini'      : False,  #ultra
            'claude'      : True}   #sonnet

In [6]:
gpt4o_result.head()

Unnamed: 0,type,posts_filtered,Row,1st choice,2nd choice,3rd choice,Justification for 1st choice (EI),Justification for 1st choice (NS),Justification for 1st choice (TF),Justification for 1st choice (PJ)
0,INFJ,I find users with Fi and Ti quite attractive. ...,#1,INFP,ISFP,INFJ,Reflects introspection and a preference for so...,Emphasizes emotional depth and abstract expres...,Prioritizes personal values and emotional unde...,Shows a contemplative and open-ended approach ...
1,INFJ,'I'm {MBTI type} and I'd honestly like to know...,#2,INFJ,INFP,ENFJ,Reflects introspection and a preference for so...,Shows a preference for abstract ideas and disc...,Emphasizes understanding and empathy while mak...,Expresses a need for organization and long-ter...
2,INFJ,{html link} back. Good to hear of you. I can't...,#3,INFP,ENFP,INFJ,Reflects introspection and a preference for so...,"Emphasizes emotional depth, abstract expressio...",Prioritizes personal values and emotional unde...,Shows a contemplative and open-ended approach ...
3,INFJ,'oh my god they waste no time hahahahah|||Well...,#1,ENFP,ENTP,INFP,The speaker engages actively with various peop...,"Uses abstract language, engages in discussions...",Often expresses appreciation for people and pe...,"The speaker demonstrates spontaneity, flexibil..."
4,INFJ,'28 :/|||I like a person that makes me a bette...,#2,INTP,INFJ,INTJ,The speaker shows a preference for introspecti...,"The emphasis on abstract thinking, future poss...",The speaker values logical analysis and reason...,The flexibility in thoughts and openness to po...


In [7]:
def split_type (row: str, index: int): #j
    splitted = list(row)
    return splitted[index]

In [8]:
def splitting_types_to_columns (cols_list, df):
    naming = [' (EI)', ' (NS)', ' (FT)', ' (JP)']
    for i, col in enumerate(cols_list):
        col_index = df.columns.tolist().index(col)
        for j, name in enumerate(naming):
            new_name = col + name
            value = df[col].apply(lambda row: split_type(row, j))
            df.insert(loc=col_index+j+1, column= new_name, value=value)
    return df

In [9]:
def add_bool (df):
    namings = ['full', '(EI)', '(NS)', '(FT)', '(JP)']
    choices = ['1st ', '2nd ', '3rd ']
    for i, choice in enumerate(choices):
        for j, col in enumerate(namings):
            start_index = (i+1)*(len(namings))
            actual = df.iloc[:,j]
            pred = df.iloc[:,start_index + j]
            bool_series = actual == pred
            new_col_name = 'is matched ' + choice + col
            df[new_col_name] = bool_series
    return df

In [10]:
def generate_df_for_accuracy (df):
    col_names = df.columns.tolist()

    accuracy_check_cols = ['type', '1st choice', '2nd choice', '3rd choice']
    accuracy_check = df[accuracy_check_cols]
    accuracy_check = accuracy_check[~accuracy_check.apply(lambda row: 'na' in row.values, axis=1)]

    accuracy_check = splitting_types_to_columns(accuracy_check_cols, accuracy_check)
    accuracy_check = add_bool(accuracy_check)
    return accuracy_check

In [11]:
def generate_df_for_justification (df):
    selected_cols = ['type', 'posts_filtered', '1st choice', 'Justification for 1st choice (EI)', 'Justification for 1st choice (NS)',	'Justification for 1st choice (TF)', 'Justification for 1st choice (PJ)']
    split_target_cols = ['type', '1st choice']
    new_df = df[selected_cols]
    new_df = new_df[~new_df.apply(lambda row: 'na' in row.values, axis=1)]
    new_df = splitting_types_to_columns (split_target_cols, new_df)

    #----------purify df by changing columns order------------
    popped_data = new_df.pop('posts_filtered')
    index = new_df.columns.tolist().index('Justification for 1st choice (EI)')
    new_df.insert(loc=index, column='posts_filtered', value=popped_data)

    #---------add matched flag b/w original/predicted---------
    col_0 = new_df.columns.tolist()[:index]
    new_df_0 = new_df[col_0]
    new_cols = ['is_matched(full)', 'is_matched(EI)', 'is_matched(NS)', 'is_matched(FT)', 'is_matched(JP)']
    for i, col in enumerate(new_cols):
        new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new_cols)]
    new_df = pd.concat([new_df_0, new_df[new_df.columns.tolist()[index:]]], axis = 1)
    
    return new_df

In [12]:
def generate_dfs (df):
    return generate_df_for_accuracy (df), generate_df_for_justification(df)

# gpt4o data transformation

In [13]:
if run_flag['gpt4o'] == True:
    gpt4o_result_accuracy, gpt4o_result_justification = generate_dfs(gpt4o_result)
    print(gpt4o_result_accuracy.shape)
    print(gpt4o_result_justification.shape)

(622, 35)
(622, 20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new_cols)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new_cols)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new

In [14]:
if run_flag['gpt4o'] == True:
    print(gpt4o_result_accuracy['type'].value_counts())

type
INFP    130
INFJ    106
INTP     94
INTJ     78
ENTP     49
ENFP     49
ISTP     24
ISFP     19
ENTJ     17
ISTJ     15
ENFJ     14
ISFJ     12
ESTP      6
ESFP      3
ESTJ      3
ESFJ      3
Name: count, dtype: int64


# gpt4 data transformation

In [15]:
if run_flag['gpt4'] == True:
    gpt4_result_accuracy, gpt4_result_justification = generate_dfs(gpt4_result)
    print(gpt4_result_accuracy.shape)
    print(gpt4_result_justification.shape)

In [16]:
if run_flag['gpt4'] == True:
    print(gpt4_result_accuracy['type'].value_counts())

In [17]:
if run_flag['claude'] == True:
    claude_result_accuracy, claude_result_justification = generate_dfs(claude_result)
    print(claude_result_accuracy.shape)
    print(claude_result_justification.shape)

(620, 35)
(620, 20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new_cols)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new_cols)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new

In [18]:
if run_flag['claude'] == True:
    print(claude_result_accuracy['type'].value_counts())

type
INFP    130
INFJ    105
INTP     93
INTJ     78
ENTP     49
ENFP     49
ISTP     24
ISFP     19
ENTJ     17
ISTJ     15
ENFJ     14
ISFJ     12
ESTP      6
ESFP      3
ESTJ      3
ESFJ      3
Name: count, dtype: int64


# gemini ultra data transformation

In [19]:
if run_flag['gemini'] == True:
    gemini_result_accuracy, gemini_result_justification = generate_dfs(gemini_result)

In [20]:
if run_flag['gemini'] == True:
    gemini_result_accuracy['type'].value_counts()

# gpt3.5 data transformation

In [21]:
if run_flag['gpt35'] == True:
    gpt35_result_accuracy, gpt35_result_justification = generate_dfs(gpt35_result)
    print(gpt35_result_accuracy.shape)
    print(gpt35_result_justification.shape)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new_cols)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new_cols)]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df_0[col] = new_df_0.iloc[:,i] == new_df_0.iloc[:,i+len(new

(621, 35)
(621, 20)


# gemini data tranformation

In [22]:
if run_flag['gemini_old'] == True:
    gemini_old_result_accuracy, gemini_old_result_justification = generate_dfs(gemini_old_result)
    print(gemini_old_result_accuracy.shape)
    print(gemini_old_result_justification.shape)

In [23]:
def save_to_excel(variable_name, 
                  directory='transformed',
                  version = version):
    if version == 'exactDist':
        if variable_name in globals():
            globals()[variable_name].to_excel(f'{directory}/{variable_name}_exactDist.xlsx', index=False)
        else:
            print(f"Variable {variable_name} does not exist.")        
    else:
        if variable_name in globals():
            globals()[variable_name].to_excel(f'{directory}/{variable_name}.xlsx', index=False)
        else:
            print(f"Variable {variable_name} does not exist.")

# Loop through the run flags
for model_name, flag in run_flag.items():
    if flag:
        accuracy_table_name = f"{model_name}_result_accuracy"
        justification_table_name = f"{model_name}_result_justification"
        
        save_to_excel(accuracy_table_name)
        save_to_excel(justification_table_name)

        print(f"saved: {accuracy_table_name}")
        print(f"saved: {justification_table_name}")

saved: gpt35_result_accuracy
saved: gpt35_result_justification
saved: gpt4o_result_accuracy
saved: gpt4o_result_justification
saved: claude_result_accuracy
saved: claude_result_justification
