In [1]:
import openai
import anthropic
import yaml
import os
import pandas as pd
import numpy as np
import json

# Set up API credentials (GPT and Claude)

In [2]:
# Path to the config.yaml file that holds the API keys.
# NOTE: this is a relative path, so it is resolved against the kernel's
# current working directory.
config_path = "config.yaml"

In [3]:
# Fail fast, with setup instructions, when the config file is missing.
if not os.path.exists(config_path):
    setup_instructions = (
        "Config file not found. Please follow these steps:",
        "1. Copy 'config.yaml.template' to 'config.yaml'",
        "2. Open 'config.yaml' and replace 'YOUR_GPT_API_KEY_HERE' and 'YOUR_CLAUDE_API_KEY_HERE' with your actual API keys",
    )
    # One instruction per line, exactly as separate print() calls would emit them.
    print(*setup_instructions, sep="\n")
    raise FileNotFoundError("config.yaml not found")


In [4]:
# Parse the YAML config; safe_load only builds plain Python objects.
with open(config_path) as config_file:
    config_yaml = yaml.safe_load(config_file)

# Both tokens are looked up unconditionally, so a missing key raises KeyError here.
gpt_api_key = config_yaml['gpt_token']
claude_api_key = config_yaml['claude_token']

## Setup Configuration
Before running, set the following variables:
1. `llm_type`: Choose the language model type
    - Options: "gpt4o", "gpt35", or "claude" 
2. `sampling_type`: Select the sampling method
    - Options: "normal" or "exact_dist"
3. `mode`: Determine the execution scope
    - 'pilot': Process only the first 10 rows
    - 'full': Process all rows in the dataset

These settings will affect how the script processes the data and interacts with the language model.

In [5]:
# Define variables for LLM and sampling type
# llm_type decides which model name and API client are created in the next cell.
# sampling_type decides which workbook is read and written: "exact_dist" adds the
# "_exactDist" suffix to the file names, any other value adds no suffix.
llm_type = "claude"             # Options: "gpt4o", "gpt35", or "claude"
sampling_type = "exact_dist"    # Options: "normal" or "exact_dist"

# Define the mode: 'pilot' or 'full'
# 'pilot' restricts the run to the first rows of the dataset; 'full' uses every row.
mode = 'pilot'                  # Change this to 'full' for a full run

In [6]:
# Automatically select model and client based on LLM type.
# Both GPT options use the OpenAI client and differ only in the pinned model
# snapshot; "claude" uses the Anthropic client with its own API key.
if llm_type == "gpt4o":
    model = "gpt-4o-2024-05-13"
    client = openai.OpenAI(api_key=gpt_api_key)
elif llm_type == "gpt35":
    model = "gpt-3.5-turbo-0125"
    client = openai.OpenAI(api_key=gpt_api_key)
elif llm_type == "claude":
    model = "claude-3-5-sonnet-20240620"
    client = anthropic.Anthropic(api_key=claude_api_key)
else:
    # The message lists exactly the values accepted above ("gpt4o", not "gpt4")
    # and echoes the rejected value so typos are easy to spot.
    raise ValueError(
        f"Invalid LLM type {llm_type!r}. Choose 'gpt4o', 'gpt35', or 'claude'."
    )

print(f"Selected model: {model}")


Selected model: claude-3-5-sonnet-20240620


# prepare text data

In [7]:
# Base directory for locating the data.
# A notebook has no `__file__`, so the path is resolved against the kernel's
# current working directory (expected to be the folder containing this notebook).
notebook_dir = os.getcwd()

# Construct the path to the data file; the "_exactDist" suffix is added only
# when sampling_type is 'exact_dist'.
file_name = f"sample_forFeeding{'_exactDist' if sampling_type == 'exact_dist' else ''}.xlsx"
file_path = os.path.join(notebook_dir, '..', '..', 'raw', file_name)

# Parse the workbook once: passing a list of sheet names returns a dict of
# DataFrames keyed by sheet name.
workbook_sheets = pd.read_excel(file_path, sheet_name=['text_only', 'sample'])
text_set = workbook_sheets['text_only']   # posts to classify (column 'posts_filtered')
sample_set = workbook_sheets['sample']    # example answers used as the response template


In [8]:
# Preview the first rows of the posts table to confirm the sheet loaded.
text_set.head()

Unnamed: 0,posts_filtered
0,I find users with Fi and Ti quite attractive. ...
1,'I'm {MBTI type} and I'd honestly like to know...
2,{html link} back. Good to hear of you. I can't...
3,'oh my god they waste no time hahahahah|||Well...
4,'28 :/|||I like a person that makes me a bette...


In [9]:
# Show the example-answer table; its printed form is embedded in the prompt as the response template.
sample_set

Unnamed: 0,1st choice,2nd choice,3rd choice,Justification for 1st choice (EI),Justification for 1st choice (NS),Justification for 1st choice (TF),Justification for 1st choice (PJ)
0,ENTP,ENFP,INTP,Displays a range of interests and adaptability...,Shows a preference for exploring abstract idea...,Focuses on logical analysis but also values em...,"Demonstrates flexibility and spontaneity, char..."
1,INFP,ISFP,INFJ,Reflects introspection and a preference for so...,Emphasizes emotional depth and abstract expres...,Prioritizes personal values and emotional unde...,Shows a contemplative and open-ended approach ...
2,ENTP,ENTJ,INTJ,"Engages in diverse discussions, suggesting ext...","Prefers theoretical and abstract discussions, ...","Analytical and objective in decision-making, a...","Adaptable and open to new ideas, traits of per..."


# Feed into the selected LLM

In [10]:
# Keys the prompt asks the model to include in every JSON response:
# three ranked type choices, then one justification per MBTI axis.
keys_list = (
    ['1st choice', '2nd choice', '3rd choice']
    + [f'Justification for 1st choice ({axis})' for axis in ('EI', 'NS', 'TF', 'PJ')]
)

## Predictions

In [11]:
# Number of leading rows processed when running in pilot mode
pilot_samples = 10

# Reject unknown modes up front so the branches below only handle valid values.
if mode not in ('pilot', 'full'):
    raise ValueError("Invalid mode. Choose 'pilot' or 'full'")

if mode == 'pilot':
    # Pilot run: only the first `pilot_samples` rows are sent to the model.
    target_df = text_set.head(pilot_samples)
    print(f"Running in pilot mode with {pilot_samples} samples")
else:
    # Full run: every row of the dataset is processed.
    target_df = text_set
    print(f"Running full commit with {len(text_set)} samples")

Running in pilot mode with 10 samples


In [12]:
def _parse_llm_json(raw_text):
    """Decode the JSON object contained in a model reply.

    Replies are often wrapped in a Markdown code fence or preceded by a short
    sentence, so only the span from the first '{' to the last '}' is parsed
    instead of relying on character stripping.

    Args:
        raw_text: The text returned by the model.

    Returns:
        The decoded JSON object as a dict.

    Raises:
        json.JSONDecodeError: If the text has no brace-delimited span or the
            span is not valid JSON.
    """
    start = raw_text.find('{')
    end = raw_text.rfind('}')
    if start == -1 or end < start:
        raise json.JSONDecodeError("No JSON object found", raw_text, 0)
    return json.loads(raw_text[start:end + 1])


# One record (dict) per processed row, in the same order as target_df.
results = []

for post_text in target_df['posts_filtered']:
    print(post_text)
    # NOTE(review): interpolating `sample_set` embeds the DataFrame's printed
    # representation (long cells are shown truncated) as the response template.
    prompt = f"Analyze the given one group of quotes containing multiple posts from the same speaker (each post is splitted using the symbol '|||') and classify him/her according to the Myers-Briggs Personality Indicator (MBTI). Provide the top three MBTI predictions with justifications only for the first choice focusing on the four core aspects of MBTI: Extraversion vs. Introversion (E vs I), Sensing vs. Intuition (S vs N), Thinking vs. Feeling (T vs F), and Judging vs. Perceiving (J vs P). Make sure your response is in json format ready to be read using json.loads() method properly and follows the template in {sample_set}. Make sure your output contains all keys from {keys_list} and your answers corresponding to each of the keys. Do not add any additional text in your response. Here's set of quotes {post_text}"

    # The accepted GPT values are "gpt4o" and "gpt35"; both use the OpenAI
    # chat-completions API.
    if llm_type in ("gpt4o", "gpt35"):
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": prompt}],
            temperature=1.0,
            max_tokens=500,
            top_p=1.0
        )
        output = response.choices[0].message.content
    elif llm_type == "claude":
        response = client.messages.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=1.0,
            max_tokens=500
        )
        output = response.content[0].text
    else:
        # Without this branch `output` would be undefined (or left over from a
        # previous iteration) for an unsupported llm_type.
        raise ValueError(f"Unsupported llm_type: {llm_type!r}")

    print(output)
    try:
        results.append(_parse_llm_json(output))
    except json.JSONDecodeError as e:
        print(f"Failed to decode JSON: {e}")
        print("Faulty JSON data:", output)  # This will show the problematic part.
        # Keep row alignment with target_df by recording an "na" placeholder
        # for every expected key.
        results.append(dict.fromkeys(keys_list, "na"))

# Convert results to DataFrame
output_df = pd.DataFrame(results)

output_df.tail(10)

I find users with Fi and Ti quite attractive. I don't understand how someone with Fe can actually be attractive? Fe makes people focus on the other person's needs and stuff and that makes them appear...|||Haha. I really don't think I can stand another like this one. But yeah, I'm thankful too that you could give me such an accurate perspective. I wasn't able to let go because I didn't know if I was...|||I did. I stopped completely. It'll take me a while to heal, haha.    I wouldn't have considered this perspective in a million years. It makes so much sense. I know this was stupid, but I had...|||I have been frank with her from the beginning. What you say makes a lot of sense. I just stopped talking or replying to her messages completely now. Thanks. I guess I came to the right temperament to...|||Thanks.|||Hey, guys,  I met this {MBTI type}(not N) who didn't want a relationship because she said that her past ones had hurt her. We ended up staying friends and I didn't mind that. But one

Unnamed: 0,1st choice,2nd choice,3rd choice,Justification for 1st choice (EI),Justification for 1st choice (NS),Justification for 1st choice (TF),Justification for 1st choice (PJ)
0,INFP,INFJ,ENFP,Reflects introspection and a preference for so...,"Shows a strong preference for abstract ideas, ...",Demonstrates a strong focus on personal values...,Exhibits flexibility in thinking and openness ...
1,ENFP,INFP,ENTP,Demonstrates a balance between extroversion an...,"Shows a strong preference for intuition, often...","Exhibits a clear preference for feeling, prior...","Displays characteristics of a perceiver, showi..."
2,INFP,ENFP,INTP,Reflects introspection and a preference for so...,Demonstrates a strong inclination towards abst...,Exhibits strong emotional awareness and empath...,Shows adaptability and openness to new experie...
3,ENFP,ENTP,INFP,Displays extroverted tendencies through engagi...,"Shows a preference for intuition, often explor...",Demonstrates a strong inclination towards feel...,Exhibits perceiving traits through adaptabilit...
4,INFJ,INTJ,INTP,Reflects introversion through preference for d...,Demonstrates strong intuition through abstract...,Exhibits a balance of logical analysis and emo...,Shows a tendency towards judging through organ...
5,INFP,ENFP,INFJ,Reflects introspection and a preference for so...,"Shows a strong preference for abstract ideas, ...","Demonstrates a strong focus on emotions, perso...",Exhibits flexibility in thought processes and ...
6,INFP,ENFP,INTP,Reflects introspection and a preference for so...,Shows a preference for abstract ideas and conc...,Demonstrates a strong emphasis on personal val...,Exhibits an open-ended and flexible approach t...
7,INFP,INFJ,INTP,Reflects introversion through preference for s...,Shows strong intuition through love for fantas...,Demonstrates a strong feeling preference throu...,Exhibits a more perceiving nature through open...
8,INFP,INFJ,ISFP,"Reflects introversion through valuing privacy,...",Shows a preference for intuition through inter...,Demonstrates a feeling preference by emphasizi...,Exhibits a perceiving tendency through opennes...
9,INTP,ENTP,INTJ,Demonstrates a preference for introspection an...,Shows a strong inclination towards abstract co...,"Exhibits a logical, analytical approach to und...",Displays openness to new ideas and flexibility...


In [13]:
# Output file name: "<llm_type>_result_raw[_exactDist].xlsx".
# NOTE(review): the name does not include `mode`, so a pilot run writes to the
# same file as a full run — confirm this overwrite is intended.
exact_dist_suffix = '_exactDist' if sampling_type == 'exact_dist' else ''
output_filename = f"{llm_type}_result_raw{exact_dist_suffix}.xlsx"

# A notebook has no `__file__`, so paths are resolved against the kernel's
# current working directory (expected to be the folder containing this notebook).
notebook_dir = os.getcwd()

# The raw data directory sits two levels above the notebook folder.
raw_dir = os.path.join(notebook_dir, '..', '..', 'raw')
output_path = os.path.join(raw_dir, output_filename)

# Save to Excel without the DataFrame index column.
output_df.to_excel(output_path, index=False)
print(f"Results saved to: {output_path}")

Results saved to: c:\Users\Ryo\OneDrive\Desktop\Master Thesis\master_thesis\study1\notebooks\01_MBTI_predictions\..\..\raw\claude_result_raw_exactDist.xlsx
