# llm-expert human co-annotation / integration TODO

Performs qualitative deductive coding consistent with the [CHALET](https://arxiv.org/abs/2405.05758) (**C**ollaborative **H**uman-LLM **A**na**L**ysis for **E**mpowering Conceptualization in Quali**T**ative Research) approach. Requires Ollama and/or OpenAI API key.

> llm_hum_annotate.ipynb<br>
> Simone J. Skeen x Claude Code (02-05-2026)

### Prepare
Installs and imports requisite packages; customizes outputs.
***

**Install**

In [None]:
%%capture

%pip install -r ../requirements.txt
%pip install irrCAC
%pip install ollama
%pip install openai

**Import**

In [None]:
import json
import numpy as np
import ollama
import openai
import os
import pandas as pd
import re
import requests
import sys
import time
import warnings

from dotenv import load_dotenv
from irrCAC.raw import CAC
from sklearn.metrics import cohen_kappa_score

from IPython.core.interactiveshell import InteractiveShell
# IPython: set node interactivity to 'all' (display every expression result in a cell)

InteractiveShell.ast_node_interactivity = 'all'

# pandas: enable copy-on-write; no cap on displayed columns / rows

pd.options.mode.copy_on_write = True

for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# ignore FutureWarning for the rest of the session

warnings.simplefilter('ignore', category = FutureWarning)

**Set env variables**

In [None]:
# load variables from a .env file (python-dotenv), then read the OpenAI key from the environment
# NOTE(review): OPENAI_API_KEY is None when the variable is unset; no check is made here - confirm downstream handling
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
# launched from src/: step up one level to the project root; then put src/ first on the import path

launched_from_src = os.path.basename(os.getcwd()) == 'src'

if launched_from_src:
    os.chdir(os.pardir)

sys.path.insert(0, 'src')

In [None]:
from qualitative import (
    load_annotation_config,
    build_prompt_llama,
    build_prompts_per_code,
    code_texts_deductively_llama,
    code_texts_deductively_gpt,
)

In [None]:
d = pd.read_excel(
    'data/inputs/d_cycle_3_sjs.xlsx', ### d_cycle_3_sjs - IAA comparison w/ GPT-4o
    index_col = [0],
    )

# human-applied code columns

code_cols = [
    'agnt', 'afrm',
    'brdn', 'fitt',
    'just', 'prbl',
    'rbnd', 'refl',
    ]

# codes: replace blank / whitespace-only cells w/ NaN, then cast to numeric

d[code_cols] = d[code_cols].replace(
    r'^\s*$',
    np.nan,
    regex = True,
    )

d[code_cols] = d[code_cols].apply(
    pd.to_numeric,
    downcast = 'integer',
    )

# rationales: record which cells are empty _before_ the global fill below

rationales = [
    'rtnl',
    #'afrm_llm_rtnl',
    #'agnt_llm_rtnl',
    #'fitt_llm_rtnl',
    #'just_llm_rtnl',
    #'refl_llm_rtnl',
    'note',
    ]

rationale_is_empty = d[rationales].isna()

# replace NaN w/ 0 (all columns)

d.fillna(
    0,
    inplace = True,
    )

# texts: delete '<|PII|>' pseudoword
# tokens are escaped before use as a pattern: unescaped, '<|PII|>' is a regex
# alternation matching '<', 'PII', or '>' separately, which leaves the pipes behind

texts = ['text']
pseudoword_tokens = [
    #'<SPL>',
    '<|PII|>', ### 1/10: remove from 'rtnl' prior to training
    ]

pseudoword_pattern = '|'.join(re.escape(token) for token in pseudoword_tokens)

for t in texts:
    d[t] = d[t].replace(
        pseudoword_pattern,
        ' ',
        regex = True,
        )

# rationales: replace NaN w/ '.'
# only originally-empty cells get the placeholder; substituting every '0' character
# would also rewrite digits inside real rationales (e.g., '10' -> '1.')

for r in rationales:
    d[r] = d[r].astype(str).mask(
        rationale_is_empty[r],
        '.',
        )

# inspect

d.info()
d.head(3)

### Code
Enables human-LLM deductive coding: human-specified per-tag prompts, JSON-.xlsx structured outputs.
***

#### Llama 3.2: local

In [None]:
%%capture

config = load_annotation_config()

# build Llama prompt for 'refl'

refl_prompt = build_prompt_llama(config, 'refl')

# locally hosted Llama endpoint

llama_endpoint = 'http://localhost:11434/api/generate'

# classify texts and update df

d = code_texts_deductively_llama(
    d,
    alias = 'refl',
    text_column = 'text',
    endpoint_url = llama_endpoint,
    prompt_template = refl_prompt,
    model_name = 'llama3',
)

#### GPT-4o: OpenAI API

In [None]:
# codes to annotate w/ GPT ('frtn' currently excluded)

gpt_codes = ['afrm', 'agnt', 'fitt', 'just', 'rbnd', 'refl']
#gpt_codes = ['afrm', 'agnt', 'fitt', 'frtn', 'just', 'rbnd', 'refl']

# one GPT prompt per code, built from the YAML config

prompts_per_code = build_prompts_per_code(
    config,
    gpt_codes,
    backend = 'gpt',
    )

# annotate df

d = code_texts_deductively_gpt(d, prompts_per_code)

In [None]:
# inspect: preview the first 10 rows of d after annotation

#print(d)
d.head(10)

In [None]:
# export

#d.to_excel('data/outputs/d_cycle_3_sjs_gpt.xlsx')

**Inter-coder reliability: Cohen's $\kappa$**

In [None]:
# optional: reload a saved GPT-annotated export instead of re-running the annotation cell

#d = pd.read_excel(
#    'data/outputs/d_cycle_3_sjs_gpt.xlsx',
#    index_col = [0],
#    )

#print(d.columns)

# drop rows w/ NaN in any GPT annotation column ('frtn_gpt' excluded)

gpt_annotation_cols = [
    f'{code}_gpt'
    for code in ('afrm', 'agnt', 'fitt', 'just', 'rbnd', 'refl')
    ]

d = d.dropna(subset = gpt_annotation_cols)

# inspect

d.info()
d.head(3)

In [None]:
# define kappa fx

def calculate_kappa(d, col1, col2):
    """Return Cohen's kappa between columns `col1` and `col2` of `d`.

    Both columns are passed unchanged to `cohen_kappa_score`.
    """
    first_ratings = d[col1]
    second_ratings = d[col2]
    return cohen_kappa_score(first_ratings, second_ratings)

# human column paired w/ its '_gpt' counterpart, one pair per code
# (the 'brdn' / 'frtn_gpt' pair is left out)

col_pairs = [
    (code, f'{code}_gpt')
    for code in ('afrm', 'agnt', 'fitt', 'just', 'rbnd', 'refl')
    ]

# initialize dict

kappa_results = dict()

# % agreement loop

#agreement = d['afrm'] == d['afrm_gpt']
#percent_agreement = (agreement.sum() / len(d)) * 100
#print("Percent Agreement:", percent_agreement)

# function to compute percent agreement

def calculate_percent_agreement(df, col_pairs):
    """Return percent agreement for each column pair in `col_pairs`.

    The result maps "<col1> & <col2>" to the percentage (0-100) of rows in
    `df` whose values in the two columns compare equal.
    """
    n_rows = len(df)
    return {
        f"{left} & {right}": ((df[left] == df[right]).sum() / n_rows) * 100
        for left, right in col_pairs
        }

# % agreement per column pair: compute, then print

percent_agreement_results = calculate_percent_agreement(d, col_pairs)

for pair, percent in percent_agreement_results.items():
    print(f"Percent agreement for {pair}: {percent:.2f}%")

# Cohen's kappa per column pair: store in kappa_results, then print

for human_col, gpt_col in col_pairs:
    kappa_results[f'{human_col} and {gpt_col}'] = calculate_kappa(d, human_col, gpt_col)

for pair, kappa in kappa_results.items():
    print(f"Cohen's Kappa for {pair}: {kappa:.2f}")


In [None]:
# load each annotator's sheet; suffix every column w/ the annotator's initials

d_dal = pd.read_excel('data/outputs/d_cycle_3_dal.xlsx', index_col = [0])
d_sjs = pd.read_excel('data/outputs/d_cycle_3_sjs.xlsx', index_col = [0])

d_dal.columns = [f'{col}_dal' for col in d_dal.columns]
d_sjs.columns = [f'{col}_sjs' for col in d_sjs.columns]

# merge on index

d = pd.merge(d_dal, d_sjs, left_index = True, right_index = True)

# code columns for both annotators ('dmnd', 'frtn' left out)

target_codes = ['afrm', 'agnt', 'fitt', 'just', 'prbl', 'rbnd', 'refl']

targets = [
    f'{code}_{annotator}'
    for code in target_codes
    for annotator in ('dal', 'sjs')
    ]

# cast to numeric (non-numeric values become NaN), then replace NaN w/ 0

d[targets] = d[targets].apply(pd.to_numeric, errors = 'coerce').fillna(0)

d.head(5)

In [None]:
# convert relevant columns to numeric type before calculating kappa

numeric_cols = [
    'afrm_dal', 'afrm_sjs',
    'agnt_dal', 'agnt_sjs',
    'brdn_dal', 'brdn_sjs',
    'fitt_dal', 'fitt_sjs',
    'just_dal', 'just_sjs',
    'rbnd_dal', 'rbnd_sjs',
    'refl_dal', 'refl_sjs',
    ]

for col in numeric_cols:
    d[col] = pd.to_numeric(d[col], errors='coerce')  # 'coerce' handles non-numeric values by setting them to NaN

col_pairs = [
    ('afrm_dal', 'afrm_sjs'),
    ('agnt_dal', 'agnt_sjs'),
    ('fitt_dal', 'fitt_sjs'),
#    ('frtn_dal', 'frtn_sjs'),
    ('just_dal', 'just_sjs'),
    ('rbnd_dal', 'rbnd_sjs'),
    ('refl_dal', 'refl_sjs'),
    ]

#d = d.dropna(subset=[col1, col2] for col1, col2 in col_pairs)

# compute % agreement

percent_agreement_results = calculate_percent_agreement(d, col_pairs)

# print results

for pair, percent in percent_agreement_results.items():
    print(f"Percent agreement for {pair}: {percent:.2f}%")

# start from an empty dict: kappa_results may still hold entries from an earlier
# comparison, which would otherwise be printed again alongside these pairs

kappa_results = {}

# kappa loop

for col1, col2 in col_pairs:
    kappa = calculate_kappa(d, col1, col2)
    kappa_results[f'{col1} and {col2}'] = kappa

for pair, kappa in kappa_results.items():
    print(f"Cohen's Kappa for {pair}: {kappa:.2f}")

**Flag disagreements** 

In [None]:
# flag disagreements Fx

def encode_disagreements(row):
    """Return 1 if the first two values of `row` differ, else 0.

    `row` is a two-column DataFrame row (as passed by `apply(axis = 1)`) or
    any plain sequence. Values are read by position: for a pandas object
    this goes through `.iloc`, since integer keys on a string-labelled
    Series are label lookups and positional fallback is deprecated.
    """
    values = getattr(row, 'iloc', row)
    return 1 if values[0] != values[1] else 0

# (human column, GPT column, output flag column) per code
# 'frtn' left out: it is excluded from the GPT-annotated codes, so no 'frtn_gpt' column is produced
# NOTE(review): d is rebound to the merged d_dal / d_sjs frame in an earlier cell; this cell needs
# the GPT-annotated frame ('<code>' and '<code>_gpt' columns) - confirm d before running

col_dis = [
    ('afrm', 'afrm_gpt', 'afrm_dis'),
    ('agnt', 'agnt_gpt', 'agnt_dis'),
    ('fitt', 'fitt_gpt', 'fitt_dis'),
    #('frtn', 'frtn_gpt', 'frtn_dis'),
    ('just', 'just_gpt', 'just_dis'),
    ('rbnd', 'rbnd_gpt', 'rbnd_dis'),
    ('refl', 'refl_gpt', 'refl_dis'),
  ]

# fail before modifying d if any input column is absent

required_cols = {col for col1, col2, _ in col_dis for col in (col1, col2)}
missing_cols = sorted(required_cols - set(d.columns))

if missing_cols:
    raise KeyError(f"Columns missing from d: {missing_cols}")

# 1 = human and GPT codes differ, 0 = they agree

for col1, col2, dis_col in col_dis:
    d[dis_col] = d[[col1, col2]].apply(
        encode_disagreements,
        axis = 1,
        )

# export

d.to_excel('data/outputs/d_pilot_coded_iaa.xlsx')