## Reshape the DataFrame of LLM-generated questions into one row per question for easier processing

In [1]:
import json
import os

import pandas as pd

In [2]:
# Directory that holds the raw outputs of the question-generation step.
input_dir = '../datasets/llm_outputs/generate_questions'

# Alternative input without a model suffix in the file name; swap it in for
# the active `input_fp` assignment below to reformat that file instead.
# input_fp = f'{input_dir}/2024_full_text_sotomayor_alito_questions.csv'
input_fp = f'{input_dir}/2024_full_text_sotomayor_alito_questions_gpt-4o-2024-08-06.csv'

# Wide table: one row per transcript, one `questions_<justice>_<addressee>`
# column per justice/side combination.
df_llm_questions = pd.read_csv(filepath_or_buffer=input_fp)

# Preview the first five rows to sanity-check the load.
df_llm_questions.head(n=5)

Unnamed: 0.1,Unnamed: 0,transcript_id,petitioner_opening_text,petitioner_full_text,respondent_opening_statement,respondent_full_text,questions_sotomayor_petitioner,questions_sotomayor_respondent,questions_alito_petitioner,questions_alito_respondent
0,0,2024.23-621-t01,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Anthony A. Yang</speaker><text> Mr. C...,<speaker>Anthony A. Yang</speaker><text> Mr. C...,"[""Can you elaborate on why a preliminary injun...","[""Can you clarify whether there are situations...","[""How do you reconcile your argument with case...","[""Can you clarify the relationship between int..."
1,1,2024.23-365 -t01,<speaker>Lisa S. Blatt</speaker><text> Thank y...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Easha Anand</speaker><text> Mr. Chief...,<speaker>Easha Anand</speaker><text> Mr. Chief...,"[""Can you clarify how you distinguish between ...","[""Could you elaborate on why the phrase \""dama...","[""How do you reconcile your interpretation of ...","[""How do you reconcile the usage of \""damages ..."
2,2,2024.23-852-t01,<speaker>Elizabeth B. Prelogar</speaker><text>...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Peter A. Patterson</speaker><text> Mr...,<speaker>Peter A. Patterson</speaker><text> Mr...,"[""Ms. Prelogar, could you elucidate on how the...","[""Mr. Patterson, how do you reconcile your int...","[""How do you respond to the argument that requ...","[""How do you interpret Congress's decision to ..."
3,3,2024.23-980-t01,<speaker>Kannon K. Shanmugam</speaker><text> T...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Kevin K. Russell</speaker><text> Mr. ...,<speaker>Kevin K. Russell</speaker><text> Mr. ...,"[""Could you explain why you believe the Ninth ...","[""Can you elaborate on the significance of mat...","[""Can you clarify how adopting the Ninth Circu...","[""Can you clarify what specific criteria the p..."
4,4,2024.23-191 -t01,<speaker>Adam G. Unikowsky</speaker><text> Mr....,"<speaker>John G. Roberts, Jr.</speaker><text> ...","<speaker>Edmund G. LaCour, Jr.</speaker><text>...","<speaker>Edmund G. LaCour, Jr.</speaker><text>...","[""How do you distinguish between a neutral pro...","[""How does the doctrine of preemption apply in...","[""How do you respond to the argument that Alab...","[""Can you clarify how the neutral jurisdiction..."


In [3]:
# Convert the wide table (one row per transcript, one questions column per
# justice/addressee pair) into a long table with one row per question.

# Justice-major ordering, so output rows appear in the same order as nested
# `for justice ... for addressee ...` loops would produce them.
justice_addressee_pairs = [
    (justice, addressee)
    for justice in ['sotomayor', 'alito']
    for addressee in ['petitioner', 'respondent']
]

rows = []

for _, transcript_row in df_llm_questions.iterrows():
    transcript_id = transcript_row['transcript_id']

    for justice, addressee in justice_addressee_pairs:
        # Each questions cell is parsed as a JSON-encoded list of questions.
        parsed_questions = json.loads(transcript_row[f'questions_{justice}_{addressee}'])

        for question_text in parsed_questions:
            # The two sides use different column suffixes for the opening
            # statement (`_opening_text` vs `_opening_statement`), so the
            # respondent column has to be named explicitly.
            if addressee == 'petitioner':
                opening_statement = transcript_row[f'{addressee}_opening_text']
            else:
                opening_statement = transcript_row['respondent_opening_statement']

            # One output record per question, carrying the addressee's texts.
            record = {
                'transcript_id': transcript_id,
                'question_addressee': addressee,
                'justice': justice,
                'question_text': question_text,
                'opening_statement': opening_statement,
                'full_text': transcript_row[f'{addressee}_full_text'],
            }
            rows.append(record)

# Build the frame once from the accumulated records.
output_df = pd.DataFrame(rows)

In [None]:
# Persist the long-format questions table as a CSV named after the model
# that generated the questions.
out_dir = '../datasets/2024_questions'
# model_name = 'Meta-Llama-3.1-70B-Instruct'
model_name = 'gpt-4o-2024-08-06'

# Make sure the destination exists so `to_csv` does not fail on a fresh checkout.
os.makedirs(out_dir, exist_ok=True)

# The path must include `out_dir`; without it the file is written to the
# notebook's working directory rather than the datasets folder.
out_fp = f'{out_dir}/2024_llm_questions_{model_name}.csv'
output_df.to_csv(out_fp, index=False)

#### Exploration

In [4]:
# Preview the first rows of the reshaped table (one row per generated question).
output_df.head()

Unnamed: 0,transcript_id,question_addressee,justice,question_text,opening_statement,full_text
0,2024.23-621-t01,petitioner,sotomayor,Can you explain why you believe the Court shou...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
1,2024.23-621-t01,petitioner,sotomayor,How do you respond to the concern that your pr...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
2,2024.23-621-t01,petitioner,sotomayor,You rely on legal dictionaries from the time o...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
3,2024.23-621-t01,petitioner,sotomayor,Don't some of this Court's precedents suggest ...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
4,2024.23-621-t01,petitioner,sotomayor,Can you think of any situations in which a pre...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."


In [5]:
# Print the row count, column dtypes, and non-null counts of the reshaped table.
output_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   transcript_id       311 non-null    object
 1   question_addressee  311 non-null    object
 2   justice             311 non-null    object
 3   question_text       311 non-null    object
 4   opening_statement   311 non-null    object
 5   full_text           311 non-null    object
dtypes: object(6)
memory usage: 14.7+ KB
