## Add question_id column to the three main question dataframes

Add question ids to the three main question csvs for 2024:

* Actual questions on 13 transcripts: `2024_all_questions_coherence_labeled_Meta-Llama-3.1-70B-Instruct.csv`
* Llama generated questions on 13 transcripts: `2024_llm_questions_Meta-Llama-3.1-70B-Instruct.csv`
* GPT4o generated questions on 13 transcripts: `2024_llm_questions_gpt-4o-2024-08-06.csv`

In [1]:
import hashlib
import os

import pandas as pd

In [2]:
def generate_question_id(row):
    """Build a short, deterministic question id for one row.

    The id is ``q_`` followed by the first 8 hex characters of the MD5 digest
    of ``"<transcript_id>_<question_text>"``, so the same transcript/question
    pair always yields the same id.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the
            ``'transcript_id'`` and ``'question_text'`` keys.

    Returns:
        The id string, ``'q_'`` plus 8 hex characters.
    """
    transcript_id = row['transcript_id']
    question_text = row['question_text']
    digest = hashlib.md5(f"{transcript_id}_{question_text}".encode()).hexdigest()
    # Keep only the first 8 hex characters so the id stays short.
    return 'q_' + digest[:8]

def add_question_id_to_df(df):
    """Add a ``question_id`` column as the first column of ``df``.

    Each row's id is computed by ``generate_question_id``. The dataframe is
    modified in place and the same object is returned.

    Args:
        df: DataFrame whose rows are passed to ``generate_question_id``.

    Returns:
        The same ``df``, with ``question_id`` moved to position 0.
    """
    df['question_id'] = df.apply(generate_question_id, axis=1)
    # Pull the freshly computed column back out and re-insert it up front.
    id_column = df.pop('question_id')
    df.insert(loc=0, column='question_id', value=id_column)
    return df

In [3]:
# File locations: source CSVs are read from `input_dir`, and the copies with a
# question_id column are written to `output_dir`. Only the Llama-generated
# questions are active; the commented-out paths are the other two 2024 files.
input_dir = '../datasets/2024_questions'
output_dir = '../datasets/2024_questions/with_qid'

# input_fp_actual = input_dir + '/2024_all_questions_coherence_labeled_Meta-Llama-3.1-70B-Instruct.csv'
input_fp_llama = input_dir + '/2024_llm_questions_Meta-Llama-3.1-70B-Instruct.csv'
# input_fp_gpt = input_dir + '/2024_llm_questions_gpt-4o-2024-08-06.csv'

# out_fp_actual = output_dir + '/2024_all_questions_coherence_labeled_Meta-Llama-3.1-70B-Instruct_qid.csv'
out_fp_llama = output_dir + '/2024_llm_questions_Meta-Llama-3.1-70B-Instruct_qid.csv'
# out_fp_gpt = output_dir + '/2024_llm_questions_gpt-4o-2024-08-06_qid.csv'


In [5]:
# Read each question CSV and prepend its question_id column.
# The actual-question and GPT-4o files are processed the same way but are
# currently disabled.
# df_actual_questions = add_question_id_to_df(pd.read_csv(input_fp_actual))
df_llama_questions = add_question_id_to_df(pd.read_csv(input_fp_llama))
# df_gpt_questions = add_question_id_to_df(pd.read_csv(input_fp_gpt))

In [6]:
# save
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(out_fp_llama) or '.', exist_ok=True)
# NOTE(review): to_csv writes the dataframe index as an unnamed first column by
# default; pass index=False if downstream readers do not expect it — confirm.
# df_actual_questions.to_csv(out_fp_actual)
df_llama_questions.to_csv(out_fp_llama)
# df_gpt_questions.to_csv(out_fp_gpt)