### Set file path, api key, model, and prompt version

In [None]:
import os

# Excel workbook that holds the questions to be coded.
input_file_path = "QAS_LLM_testing.xlsx"

# SECURITY: the OpenAI key must never be hard-coded in the notebook. It is
# read from the OPENAI_API_KEY environment variable instead (None when the
# variable is not set). The key that used to be committed on this line has to
# be treated as leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
model = "gpt-4o-mini"  # gpt-4o or gpt-4o-mini
prompt_version = 2  # version 1 or 2

# assistant_id = "asst_wjSEz6GDbt1WpTnpyNSdcES0"

### Self-defined functions

In [None]:
import re
import time
import pandas as pd
from openai import OpenAI
from openpyxl import load_workbook
from openpyxl.styles import Border, Side
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils import get_column_letter

# Load both prompt templates from UTF-8 text files in the working directory.
with open('prompt_v1.txt', 'r', encoding='utf-8') as file:
    prompt_v1 = file.read()

with open('prompt_v2.txt', 'r', encoding='utf-8') as file:
    prompt_v2 = file.read()

# Select the active prompt. An unsupported version is rejected here so that
# `prompt` is never left undefined (which would only show up later as a
# NameError when the assistant is created).
if prompt_version == 1:
    prompt = prompt_v1
elif prompt_version == 2:
    prompt = prompt_v2
else:
    raise ValueError(
        f"Unsupported prompt_version {prompt_version!r}; expected 1 or 2."
    )

def run_a_message(client, thread_id, assistant_id, content, poll_interval=1):
    """Post a user message to a thread, run the assistant, and return its reply.

    Args:
        client: Client object exposing the ``beta.threads`` messages/runs API.
        thread_id: ID of the thread the message is appended to.
        assistant_id: ID of the assistant that should answer.
        content: Text of the user message.
        poll_interval: Seconds to wait between two status polls of the run.

    Returns:
        The text value of the first content part of the first message returned
        by ``messages.list`` for the thread (the API lists newest-first by
        default, so this is expected to be the assistant's answer).

    Raises:
        RuntimeError: If the run ends in a state that can never become
            "completed" (failed, cancelled, expired, incomplete) or stops at
            "requires_action", which this helper cannot service.
    """
    client.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=content,
    )

    # NOTE(review): per the Assistants API reference, `instructions` on a run
    # overrides the assistant's own instructions for that run, whereas
    # `additional_instructions` appends to them — confirm which is intended.
    run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
        instructions="Please follow the QAS and provide an answer in the intended format.",
    )

    # Statuses from which the run will not reach "completed" on its own.
    # Without this check the loop below would spin forever.
    dead_end_statuses = {"failed", "cancelled", "expired", "incomplete", "requires_action"}

    while run.status != "completed":
        if run.status in dead_end_statuses:
            raise RuntimeError(
                f"Run {run.id} ended with status '{run.status}': "
                f"{getattr(run, 'last_error', None)}"
            )
        # Wait first, then re-poll, so no extra sleep happens after completion.
        time.sleep(poll_interval)
        run = client.beta.threads.runs.retrieve(thread_id=thread_id, run_id=run.id)

    thread_messages = client.beta.threads.messages.list(thread_id=thread_id).data
    return thread_messages[0].content[0].text.value


def process_response(response):
    """Parse one LLM coding response into a single-row, wide-format DataFrame.

    The response is scanned for items of the form ``<digits><letter>. <digits>``
    (for example ``1a. 0`` or ``10b. 2``) and for an ``Explanation:`` /
    ``Explanations:`` label.

    Args:
        response: Raw response text.

    Returns:
        A one-row DataFrame (index 0) with, in order: one column per code found
        (e.g. ``1a``), ``Explanations``, ``Sum-All`` (sum of all code values),
        and one ``Sum-<n>`` column per main category, where ``<n>`` is the
        numeric prefix of the code.
    """
    # 1. Extract variables and values using regular expressions.
    #    A code that appears twice keeps its last value.
    code_pairs = re.findall(r'(\d+[a-z])\. (\d+)', response)
    coding_results = {code: int(value) for code, value in code_pairs}

    # 2. Calculate the overall sum and the per-category sums.
    all_sum = sum(coding_results.values())

    category_totals = {}
    for code, value in coding_results.items():
        # The pattern above guarantees exactly one trailing letter, so
        # everything before it is the main category. Taking only the first
        # character would merge e.g. '10a' into category '1'.
        main_category = code[:-1]
        category_totals[main_category] = category_totals.get(main_category, 0) + value

    # 3. Extract the explanation. '.' does not match newlines here, so only the
    #    remainder of the line following the label is captured.
    explanation_match = re.search(r'Explanations?:\s*(.*)', response)
    explanation = explanation_match.group(1) if explanation_match else ""

    # 4. Create a DataFrame with a single row for the wide format.
    combined_results = pd.DataFrame(
        {**coding_results, 'Explanations': explanation, 'Sum-All': all_sum},
        index=[0],
    )
    for category, category_sum in category_totals.items():
        combined_results[f'Sum-{category}'] = category_sum

    return combined_results


def save_result_file(df, output_file_path):
    """Write ``df`` to an Excel file and apply the report formatting.

    The frame is first written with ``DataFrame.to_excel`` (no index), then the
    file is reopened with openpyxl to set borders, alignment, row heights,
    column widths, header styling and vertical separator lines, and saved back
    to the same path.

    Args:
        df: DataFrame to export.
        output_file_path: Destination path of the ``.xlsx`` file.
    """
    df.to_excel(output_file_path, index=False)

    wb = load_workbook(output_file_path)
    ws = wb.active

    # Baseline: no borders, everything centered.
    blank_side = Side(border_style=None)
    no_border = Border(left=blank_side, right=blank_side, top=blank_side, bottom=blank_side)
    centered = Alignment(horizontal="center", vertical="center")
    for row in ws.iter_rows():
        for cell in row:
            cell.border = no_border
            cell.alignment = centered

    # Data rows (everything below the header) get a fixed height.
    for row in range(2, ws.max_row + 1):
        ws.row_dimensions[row].height = 120

    # Column widths and per-column alignment, keyed on the header text.
    wrapped_top_left = Alignment(horizontal="left", vertical="top", wrap_text=True)
    unwrapped_left = Alignment(horizontal="left", vertical="center", wrap_text=False)
    for col in ws.iter_cols(min_col=1, max_col=ws.max_column):
        col_name = col[0].value
        col_letter = get_column_letter(col[0].column)
        if col_name in ['Variable Name', 'LLM Model', 'Prompt Version', 'Testing Time']:
            ws.column_dimensions[col_letter].width = 15
        elif col_name in ['Full Question Wording', 'Explanations']:
            ws.column_dimensions[col_letter].width = 50
            for cell in col:
                cell.alignment = wrapped_top_left
        elif col_name == 'LLM Coding Results':
            ws.column_dimensions[col_letter].width = 20
            for cell in col:
                cell.alignment = unwrapped_left
        else:
            ws.column_dimensions[col_letter].width = 8

    # Header formatting. Iterate over the cells that actually exist in row 1
    # instead of a fixed "A1:AQ1" range: a fixed range creates empty styled
    # cells on narrower sheets and leaves headers beyond AQ unstyled.
    header_font = Font(bold=True)
    header_fill = PatternFill("solid", fgColor="00C0C0C0")
    for cell in next(ws.iter_rows(min_row=1, max_row=1)):
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = centered

    # Thin vertical separators to the right of fixed layout columns. Columns
    # that are not present in this sheet are skipped, because addressing them
    # would create new empty cells.
    existing_letters = {get_column_letter(i) for i in range(1, ws.max_column + 1)}
    thin_right = Border(right=Side(style="thin"))
    for col_letter in ("B", "F", "AH"):
        if col_letter not in existing_letters:
            continue
        for row in range(1, ws.max_row + 1):
            ws[f'{col_letter}{row}'].border = thin_right

    wb.save(output_file_path)

### Run the main coding process

In [None]:
# 1. Read the excel file as input
df = pd.read_excel(input_file_path, engine='openpyxl')
# Text sent to the assistant for each question: "<variable name>: <wording>".
df['Combined'] = df['Variable Name'] + ': ' + df['Full Question Wording']

# 2. Create an assistant
client = OpenAI(api_key=api_key)
assistant = client.beta.assistants.create(
    name="QAS Coder",
    instructions=prompt,
    tools=[{"type": "file_search"}],
    model=model,
)

# 3. Create a thread (i.e., start a new conversation), seeded with the prompt
#    as its first user message. All questions below share this one thread.
thread = client.beta.threads.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ]
)

# 4. Only run the questions in rows m to n-1 (0-based positions).
#    .copy() makes the subset an independent frame, so the per-cell writes
#    below do not operate on a slice of the full table.
df = df.iloc[3:6].copy()

# 5. Process each question and store the results in the DataFrame
print(f"Total number of questions: {len(df)}\nStarting the coding process...", end='')
for index, row in df.iterrows():
    print(index+1, end='.')
    question = row['Combined']
    response = run_a_message(client, thread.id, assistant.id, question)
    print('..', end='')

    df.at[index, 'LLM Model'] = model
    df.at[index, 'Prompt Version'] = prompt_version
    df.at[index, 'Testing Time'] = time.strftime("%Y-%m-%d %H:%M")
    df.at[index, 'LLM Coding Results'] = response

    # Put the processed response after the end of this row. The parsed result
    # is a one-row frame, so take its single scalar per column rather than
    # assigning a one-element array into a cell.
    processed_response = process_response(response)
    for col in processed_response.columns:
        df.at[index, col] = processed_response[col].iloc[0]

# The helper column is not part of the report.
df.drop(columns=['Combined'], inplace=True)

# 6. Save the results to a new Excel file
output_file_path = input_file_path.replace(".xlsx", "_results.xlsx")
save_result_file(df, output_file_path)
print(f'"{output_file_path}" Saved!')