# A GPT-4o-assisted, human-in-the-loop solution for intercoder reliability: guide for behavioral scientists

_Performs qualitative deductive coding consistent with the [CHALET](https://arxiv.org/abs/2405.05758) (**C**ollaborative **H**uman-LLM **A**na**L**ysis for **E**mpowering Conceptualization in Quali**T**ative Research) approach. Requires Ollama and/or OpenAI API key._

_Note: Provided to SBM 2025 attendees for illustrative purposes only. Please adapt as needed and pilot on toy data. Do not pass data to the OpenAI API without reviewing OpenAI's Data Usage Policies and obtaining proper IRB approvals._

> human_llm_synergistic.ipynb<br>
> Simone J. Skeen (03-12-2025)

1. [Prepare](#scrollTo=TMzbQWcLnD3k)
2. [Write](#scrollTo=ro3vWHGknw3w)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[`code_texts_deductively_llama`](#scrollTo=0TXsMF50oDSi)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[`code_instance_deductively_gpt`](#scrollTo=LrgYlrmo1OUW)<br>
&nbsp;&nbsp;&nbsp;&nbsp;[`code_texts_deductively_gpt`](#scrollTo=I6V00vzh2Na1)<br>
3. [Code](#scrollTo=zXYJT6i9pSPf)<br>
[Llama 3.2: local](#scrollTo=6hHjuQXrAqLE)<br>
[GPT-4o: OpenAI API](#scrollTo=G1v8sP42Ah-n)<br>
4. [Fidelity](#scrollTo=6upq1MSmxvoW)<br>
[Compute Cohen's $\kappa$](#scrollTo=DuSQ858FR2Ab)<br>
[Flag disagreements](#scrollTo=mC58zS16Zttc)


### Prepare
Installs and imports requisite packages; customizes outputs.
***

**Install**

In [None]:
%%capture

%pip install irrCAC
%pip install lime
%pip install ollama
%pip install openai

**Import**

In [None]:
import json
import numpy as np
import ollama
import openai
import os
import pandas as pd
import re
import requests
import time
import warnings

from google.colab import drive
from irrCAC.raw import CAC
from sklearn.metrics import cohen_kappa_score

from IPython.core.interactiveshell import InteractiveShell
# display the result of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# opt in to pandas copy-on-write behavior
pd.options.mode.copy_on_write = True

# do not truncate wide DataFrames when displayed
pd.set_option(
    'display.max_columns',
    None,
    )

# do not truncate long DataFrames when displayed
pd.set_option(
    'display.max_rows',
    None,
    )

# silence FutureWarning for the session
# NOTE(review): this also hides deprecation notices raised by pandas and other libraries
warnings.simplefilter(
    action = 'ignore',
    category = FutureWarning,
    )

#from langchain_community.llms import Ollama

**Set env variables**

In [None]:
os.environ['OPENAI_API_KEY'] = ' ' ### insert API key between quotation marks

# confirm the key is set without echoing it: displaying `os.environ` would write every
# environment variable, including the API key, into the saved notebook output
print('OPENAI_API_KEY set:', bool(os.environ['OPENAI_API_KEY'].strip()))

**Directory structure**

In [None]:
mhp_subtle_discrimination/
└── CEAI_lunch_and_learn/
    ├── code
    ├── inputs
    ├── outputs
    └── temp

**Ollama**<br>
http://localhost:11434/

#### Google Colab

In [None]:
# mount gdrive at /content/drive (Google Colab only)

drive.mount(
    '/content/drive',
    force_remount = True,
    )

In [None]:
# creates and structures relative paths in a Google Colab/Drive environment

#%mkdir Colab
#%cd Colab

In [None]:
#%mkdir human_llm_synergistic
#%cd human_llm_synergistic

In [None]:
#%mkdir inputs outputs code temp

In [None]:
%cd /content/drive/My Drive/Colab/human_llm_synergistic

#### JupyterLab

In [None]:
# set wd: replace the ' ' placeholder with the path to the project folder, i.e., the
# folder holding code/, inputs/, outputs/, temp/ (later cells %cd relative to it)

wd = ' '
os.chdir(wd)
%pwd

### Write
Defines qualitative.py module.
***

In [None]:
%cd code

#### _code_texts_deductively_llama_

In [None]:
%%writefile qualitative.py

import requests
import json
import pandas as pd

def _fill_prompt(prompt_template, row_text):
    """Inserts row_text at the '{text}' placeholder of prompt_template."""
    try:
        return prompt_template.format(text = row_text)
    except (KeyError, IndexError, ValueError):

        # literal braces elsewhere in the prompt (e.g., a JSON example) break str.format;
        # fall back to substituting the '{text}' placeholder only

        return prompt_template.replace('{text}', str(row_text))


def _parse_llm_reply(raw_response_str, label_column, explanation_column):
    """Returns (label, explanation) read from the JSON object embedded in a model reply."""

    # extract only the JSON portion: first `{` through last `}`

    start_idx = raw_response_str.find('{')
    end_idx = raw_response_str.rfind('}')

    if start_idx == -1 or end_idx < start_idx:
        print("No valid JSON found in response.")
        return None, None

    parsed_output = json.loads(raw_response_str[start_idx:end_idx + 1])

    return parsed_output.get(label_column), parsed_output.get(explanation_column)


def code_texts_deductively_llama(df, alias, text_column, endpoint_url, prompt_template, model_name):
    """
    Classifies each row of 'text' column in provided df in accord with human-specified prompt,
    includes chain-of-thought reasoning, returning explanations for classification decision.

    Rows whose request does not return HTTP 200, or whose reply contains no parseable JSON
    object, are left as None in both output columns. Status code and raw reply are printed
    for every row. The DataFrame is modified in place and also returned.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame containing the text to classify.
    alias : str
        The alias (for brevity) of the qualitative code to be applied.
    text_column : str
        The column name in df containing the text to be analyzed.
    endpoint_url : str
        The URL where locally hosted Llama model runs.
    prompt_template : str
        The prompt text with a placeholder ('{text}') where the row's text will be inserted.
        The prompt must instruct the model to reply in JSON with keys '{alias}_llm' and
        '{alias}_expl', since those are the keys read from the reply.
    model_name : str
        The model tasked with qualitative deductive coding.

    Returns:
    --------
    pandas.DataFrame
        The original DataFrame with two new columns: '{alias}_llm' (the value the model
        returned under that key, e.g., "0" or "1") and '{alias}_expl' (the explanation).
    """

    # dynamically create {alias} column names

    label_column = f'{alias}_llm'
    explanation_column = f'{alias}_expl'

    # create empty tag ['*_llm'] and reasoning ['*_expl'] column

    df[label_column] = None
    df[explanation_column] = None

    for idx, row in df.iterrows():

        # replace '{text}' in prompt_template with df 'text' data

        prompt = _fill_prompt(prompt_template, row[text_column])

        # send request to local Llama endpoint.

        response = requests.post(
            endpoint_url,
            headers = {'Content-Type': 'application/json'},
            json = {
                'model': model_name,
                'prompt': prompt,
                'stream': False
                },
        )

        # print statements for debugging

        print(response.status_code)
        print(response.text)

        # default to "not coded"; overwritten only by a successfully parsed reply

        label, explanation = None, None

        if response.status_code == 200:
            try:

                # top-level JSON; its 'response' field holds the model's reply as a string

                raw_response_str = response.json().get('response', ' ')

                label, explanation = _parse_llm_reply(
                    raw_response_str,
                    label_column,
                    explanation_column,
                    )

            # ValueError covers json.JSONDecodeError; AttributeError covers a null 'response'
            # field or a reply whose JSON is not an object (e.g., a list)

            except (ValueError, KeyError, TypeError, AttributeError) as e:
                print("Parsing error:", e)
                label, explanation = None, None

        # insert classification results into df

        df.at[idx, label_column] = label
        df.at[idx, explanation_column] = explanation

    return df

#### _code_instance_deductively_gpt_

In [None]:
%%writefile -a qualitative.py

import os
import time

import openai

# the key is read from the environment when this module is imported, so OPENAI_API_KEY
# must be set before `qualitative` is first imported
api_key = os.getenv('OPENAI_API_KEY')
#api_key = ' '
# module-level client used by code_instance_deductively_gpt
client = openai.OpenAI(api_key = api_key)

def code_instance_deductively_gpt(text, prompts):
    """
    Annotates a single text with GPT-4o, using the supplied prompts as the system message.

    Parameters:
    - text: The text to annotate; sent as the user message.
    - prompts: A list of prompt strings; joined with single spaces into one system message.

    Returns:
    - result: A single space followed by the concatenated content of every returned choice,
      or the string 'error' if anything in the call raised an exception.
    """
    try:

        # build the two-message conversation: merged prompts, then the text itself

        system_message = {
            'role': 'system',
            'content': ' '.join(prompts)
        }
        user_message = {
            'role': 'user',
            'content': text
        }

        completion = client.chat.completions.create(
            model = 'gpt-4o',
            temperature = 0.2,
            messages = [system_message, user_message]
        )

        # accumulate the content of each choice after the leading space

        result = ' '
        for choice in completion.choices:
            result = result + choice.message.content

        print(f'{text}: {result}')
        return result
    except Exception as e:

        # any failure (API, network, malformed arguments) is reported and mapped to a sentinel

        print(f'Exception: {e}')
        return 'error'

#### _code_texts_deductively_gpt_

In [None]:
%%writefile -a qualitative.py

def code_texts_deductively_gpt(df, prompts_per_code, text_column = 'text', delay = 1):
    """
    Applies code_instance_deductively_gpt for multiple codes to each row in dataframe 'df'.

    For every tag, three columns are written: '{tag}_gpt' (1 if the model reply contains
    '{tag}_1', else 0), '{tag}_rtnl_gpt' (text between '{tag}_rationale:' and
    '{tag}_explanation:'), and '{tag}_expl_gpt' (text after '{tag}_explanation:'). Rationale
    and explanation are only extracted when the tag is coded 1. Rows whose API call returned
    'error' are left unset (NaN / None). The dataframe is modified in place and also returned.

    Parameters:
    - df: The dataframe containing texts to annotate.
    - prompts_per_code: A dictionary with tag names as keys and a list of prompts as values.
    - text_column: The column of df holding the text to annotate (default 'text').
    - delay: Seconds to pause after each successful API call (default 1).

    Returns:
    - df: The updated dataframe with annotation results.
    """

    # create output columns up front with suitable dtypes: creating them lazily via df.at
    # yields float columns, into which rationale/explanation strings are then written

    for tag in prompts_per_code:
        if f'{tag}_gpt' not in df.columns:
            df[f'{tag}_gpt'] = float('nan')

        for text_col in (f'{tag}_rtnl_gpt', f'{tag}_expl_gpt'):
            if text_col not in df.columns:
                df[text_col] = None
            elif df[text_col].dtype != object:
                df[text_col] = df[text_col].astype(object)

    for index, row in df.iterrows():
        for tag, prompts in prompts_per_code.items():
            result = code_instance_deductively_gpt(row[text_column], prompts)
            if result == 'error':
                continue

            # markers the prompt must instruct the model to emit

            rationale_marker = f'{tag}_rationale:'
            explanation_marker = f'{tag}_explanation:'

            # initialize variables for annotation outputs

            rationale, explanation = None, None

            if f'{tag}_1' in result:
                tag_value = 1

                # extract rationale: text between the rationale and explanation markers

                if rationale_marker in result:
                    rationale = result.split(rationale_marker)[1].split(explanation_marker)[0].strip()

                # extract explanation: text after the explanation marker

                if explanation_marker in result:
                    explanation = result.split(explanation_marker)[1].strip()

            else:
                tag_value = 0

            # results to df

            df.at[index, f'{tag}_gpt'] = tag_value
            df.at[index, f'{tag}_rtnl_gpt'] = rationale
            df.at[index, f'{tag}_expl_gpt'] = explanation

            # impose delay between API calls

            if delay:
                time.sleep(delay)

    return df

#### Import

In [None]:
from qualitative import(
    code_texts_deductively_llama,
    code_instance_deductively_gpt,
    code_texts_deductively_gpt,
)

In [None]:
%cd ../inputs

d = pd.read_excel(
    'd_cycle_3_sjs.xlsx', ### d_cycle_3_sjs - IAA comparison w/ GPT-4o
    index_col = [0],
    )

# human-applied codes: replace blank / whitespace-only cells w/ NaN

d[[
    #'<my_var>', ### replace with relevant varlist of codes
    ]] = d[[
        #'<my_var>', ### replace with relevant varlist of codes
        ]].replace(
            r'^\s*$',
            np.nan,
            regex = True,
            )

# human-applied codes: coerce to numeric

d[[
    #'<my_var>',
    ]] = d[[
        #'<my_var>',
        ]].apply(
            pd.to_numeric,
            downcast = 'integer',
            )

# replace NaN w/ 0 (applies to every column of d)

d.fillna(
    0,
    inplace = True,
    )

# texts: delete '<|PII|>' pseudoword

texts = [
    'text',
         ]

pseudoword_tokens = [
    #'<SPL>', ### replace with preprocessing/anonymization artifacts
    #'<|PII|>',
    ]

# tokens are literal strings, so escape them before regex replacement: unescaped, the '|'
# in '<|PII|>' is regex alternation and would strip every '<', '>' and 'PII' instead

pseudoword_patterns = [re.escape(token) for token in pseudoword_tokens]

if pseudoword_patterns:
    for t in texts:
        d[t] = d[t].replace(
            pseudoword_patterns,
            ' ',
            regex = True,
            )

# rationales: replace the 0 placeholder (formerly NaN) w/ '.'

rationales = [
    #'<my_rtnl>', ### replace with relevant varlist of extracted rationales
              ]

# anchor the pattern so only cells that are exactly '0' / '0.0' are replaced; an unanchored
# '0' would also rewrite every zero character inside genuine rationale text

for r in rationales:
    d[r] = d[r].astype(str)
    d[r] = d[r].str.replace(
        r'^0(?:\.0)?$',
        '.',
        regex = True,
        )

# inspect

d.info()
d.head(3)

### Code
Enables human-LLM deductive coding: human-specified per-tag prompts, JSON-.xlsx structured outputs.
***

#### Llama 3.2: local

**_my_code_ (alias: `<my_code>`): prompt formulation**

In [None]:
### replace all instances of <my_*> in angle brackets with the specifics of your analysis
### <code_var> and <code_expl> must be written as '<my_code>_llm' and '<my_code>_expl':
### code_texts_deductively_llama reads exactly the keys '{alias}_llm' and '{alias}_expl' from the reply

# role: task framing, placed first in the concatenated prompt
role = '''
You are tasked with applying pre-defined qualitative codes to <my_data>

You will be provided a definition, instructions, and key exemplars of text to guide your coding decisions.
'''

# definition: codebook definition of the qualitative code
definition = '''
Definition of "<my_code>": <my_definition>
'''

# instruction: output format; keep '{text}' verbatim, it is the placeholder filled with each
# row's text via str.format, so any other literal curly braces must be doubled ('{{', '}}')
instruction = '''
You will be provided with a piece of text. For each piece of text:
- If it meets the definition of "<my_code>," output <code_var> as "1".
- Otherwise, output <code_var> as "0".
- Also provide a short explanation in exactly two sentences, stored in <code_expl>.

Please respond in valid JSON with keys "<code_var>" and "<code_expl>" only.

Text:
{text}
'''

# clarification: additional guidance on applying the code
clarification = '''
- "<my_code>": <my_clarification>
'''

# examples: human-validated exemplars of the code
examples = '''
Below are human-validated examples of "<my_code>"

- "<my_example>."
'''

**Code deductively**

In [None]:
%%capture

# concatenate prompt as f-string

<my_code_prompt> = f'{role}{definition}{instruction}{clarification}{examples}' ### update with <my_code>_prompt
print(
    #<my_code_prompt>
    )

# locally hosted Llama endpoint

llama_endpoint = 'http://localhost:11434/api/generate'

# classify texts and update df

d = code_texts_deductively_llama(
    d,
    alias = #'<my_code>', ### update with <my_code>
    text_column = 'text',
    endpoint_url = llama_endpoint,
    prompt_template = refl_prompt,
    model_name = 'llama3',
    )


#### GPT-4o: OpenAI API

**Role assignment**

In [None]:
# role: task framing, placed first in each concatenated prompt
role = '''
You are tasked with applying pre-defined qualitative codes to <my_data>

You will be provided a definition, instructions, and key exemplars of text to guide your coding decisions.
'''

#### **_my_code_ (alias: `<my_code>`): prompt formulation**

In [None]:
### replace all instances of <my_*> in angle brackets with the specifics of your analysis

role = '''
You are tasked with applying pre-defined qualitative codes to <my_data>

You will be provided a definition, instructions, and key exemplars of text to guide your coding decisions.
'''

definition = '''
Definition of "<my_code>": <my_definition>
'''

instruction = '''
You will be provided with a piece of text. For each piece of text:
- If it meets the definition of "<my_code>," output <code_var> as "1".
- Otherwise, output <code_var> as "0".
- Also provide a short explanation in exactly two sentences, stored in <code_expl>.

Please respond in valid JSON with keys "<code_var>" and "<code_expl>" only.

Text:
{text}
'''

clarification = '''
- "<my_code>": <my_clarification>
'''

examples = '''
Below are human-validated examples of "<my_code>"

- "<my_example>."
'''

# concatenate prompt as f-string

<my_code_prompt> = f'{role}{definition}{instruction}{clarification}{examples}' ### update with <my_code>_prompt
print(
    #<my_code_prompt>
    )

**Code deductively**

In [None]:
#%%capture

# define prompts per code: each key is a tag name and becomes the prefix of the output
# columns '<tag>_gpt', '<tag>_rtnl_gpt', '<tag>_expl_gpt'; each value is a list of prompts

prompts_per_code = {
  #'<my_code>': [<my_code_prompt>],
  }

# annotate df (one API call per row per tag)

d = code_texts_deductively_gpt(
  d,
  prompts_per_code,
  )

In [None]:
# inspect

#print(d)
d.head(10)

In [None]:
# export

%cd ../outputs

d.to_excel('d_coded.xlsx')

### Fidelity
Calculates inter-coder reliability scores over independent coding applications; dummy-codes disagreements for deliberation.
***

#### Compute Cohen's $\kappa$

In [None]:
%cd ../outputs

# reload the coded dataset exported above; first column is used as the index

d = pd.read_excel(
    'd_coded.xlsx',
    index_col = [0],
    )

#print(d.columns)

# drop NaN: rows missing a value in any of the listed columns are removed before
# agreement statistics are computed

d = d.dropna(subset = [
    #'<my_code_gpt>', ### <my_code_gpt> = GPT-4o-output coding applications (dummy-coded)
    ]
             )

# inspect

d.info()
d.head(3)

In [None]:
# define kappa fx

def calculate_kappa(d, col1, col2):
    """Returns Cohen's kappa between columns col1 and col2 of dataframe d."""
    first_coder = d[col1]
    second_coder = d[col2]
    return cohen_kappa_score(first_coder, second_coder)

# pairs of columns to compare: (first coder's column, second coder's column)
col_pairs = [
    #('<my_code>', '<my_code_gpt>'),
    ]

# initialize dict: maps '<col1> and <col2>' to the kappa for that pair

kappa_results = {}

# % agreement loop

def calculate_percent_agreement(df, col_pairs):
    """
    Returns raw percent agreement for each pair of columns.

    Parameters:
    - df: The dataframe holding the coding columns.
    - col_pairs: An iterable of (col1, col2) column-name pairs.

    Returns:
    - A dict mapping '<col1> & <col2>' to the percentage of rows (0-100) where both
      columns hold equal values.
    """
    n_rows = len(df)

    return {
        f"{left} & {right}": ((df[left] == df[right]).sum() / n_rows) * 100
        for left, right in col_pairs
    }

# raw % agreement per pair

percent_agreement_results = calculate_percent_agreement(d, col_pairs)

for pair_label in percent_agreement_results:
    share = percent_agreement_results[pair_label]
    print(f"Percent agreement for {pair_label}: {share:.2f}%")

# Cohen's kappa per pair, stored under '<col1> and <col2>'

for coder_a, coder_b in col_pairs:
    kappa_results[f'{coder_a} and {coder_b}'] = calculate_kappa(d, coder_a, coder_b)

for pair_label, kappa_value in kappa_results.items():
    print(f"Cohen's Kappa for {pair_label}: {kappa_value:.2f}")


#### Flag disagreements

In [None]:
# flag disagreements fx

def encode_disagreements(row):
    """
    Dummy-codes whether the first two values of 'row' disagree.

    Parameters:
    - row: A pandas Series (e.g., one row passed by DataFrame.apply with axis = 1) or any
      other sequence with at least two values.

    Returns:
    - 1 if the first and second values are not equal, else 0. Two NaN values compare as
      not equal and are therefore flagged as a disagreement.
    """

    # use positional access: integer keys on a label-indexed Series (row[0]) are deprecated
    # in pandas 2.x and would look up a label named 0 rather than the first value

    values = row.iloc if hasattr(row, 'iloc') else row

    return 1 if values[0] != values[1] else 0

# (first coder's column, second coder's column, output disagreement column) triples

col_dis = [
    #('<my_code>', '<my_code_gpt>', '<my_code_dis>'), ### <my_code_dis> = dummy-coded disagreements between first two cols passed to encode_disagreements fx
  ]

# row-wise comparison of each pair, written to the named disagreement column

for coder_a, coder_b, flag_col in col_dis:
    paired_codes = d[[coder_a, coder_b]]
    d[flag_col] = paired_codes.apply(
        encode_disagreements,
        axis = 1,
        )

# export

d.to_excel('d_coded_iaa.xlsx')

> End of human_llm_synergistic.ipynb