## Generate csv for question coherence judgments

In [2]:
import pandas as pd

In [None]:
# Load the two 2024 inputs: the questions table and the full-text transcripts table.
questions_csv_path = '2024_all_questions.csv'
full_text_csv_path = '2024_full_text_transcripts.csv'
all_qs_24_df = pd.read_csv(questions_csv_path)
full_text_df = pd.read_csv(full_text_csv_path)

Merge the following two csvs for 2024 transcripts to get a single csv for easy coherence judgment generation:
* `2024_all_questions.csv` (questions)
* `2024_full_text_transcripts.csv` (full_text)

In the merged csv, each row corresponds to a single question and we have the following columns:

* `transcript_id` - same as questions/full_text
* `question_addressee` - same as questions
* `justice` - same as questions
* `question_text` - same as questions
* `opening_statement` - `petitioner_opening_text` or `respondent_opening_statement` from full_text, chosen to match the row's `question_addressee`
* `full_text` - `petitioner_full_text` or `respondent_full_text` from full_text, chosen to match the row's `question_addressee`


In [21]:
# Read the questions CSV and the full-text transcripts CSV (in that order)
# into the two frames used by the merge below.
all_qs_24_df, full_text_df = (
    pd.read_csv(csv_name)
    for csv_name in ('2024_all_questions.csv', '2024_full_text_transcripts.csv')
)

In [54]:
# Attach each transcript's text columns to every question from that transcript.
# how="inner" is pandas' default, spelled out here: questions whose
# transcript_id has no match in full_text_df are dropped from the result.
# validate="many_to_one" makes pandas raise MergeError if a transcript_id
# appears more than once in full_text_df; without it such duplicates would
# silently multiply question rows and break the one-row-per-question layout.
merged_df = all_qs_24_df.merge(
    full_text_df,
    on="transcript_id",
    how="inner",
    validate="many_to_one",
)
merged_df.head()

Unnamed: 0,Unnamed: 0_x,transcript_id,question_addressee,justice,question_text,Unnamed: 0_y,petitioner_opening_text,petitioner_full_text,respondent_opening_statement,respondent_full_text
0,0,2024.23-621-t01,petitioner,Clarence Thomas,You --can a consent decree or a default judgm...,0,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Anthony A. Yang</speaker><text> Mr. C...,<speaker>Anthony A. Yang</speaker><text> Mr. C...
1,1,2024.23-621-t01,petitioner,Clarence Thomas,But I thought your argument hinged on a court...,0,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Anthony A. Yang</speaker><text> Mr. C...,<speaker>Anthony A. Yang</speaker><text> Mr. C...
2,2,2024.23-621-t01,petitioner,"John G. Roberts, Jr.",What do you do with the formulation by your f...,0,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Anthony A. Yang</speaker><text> Mr. C...,<speaker>Anthony A. Yang</speaker><text> Mr. C...
3,3,2024.23-621-t01,petitioner,Elena Kagan,"Well, it's -- it's true that it's only a lik...",0,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Anthony A. Yang</speaker><text> Mr. C...,<speaker>Anthony A. Yang</speaker><text> Mr. C...
4,4,2024.23-621-t01,petitioner,Ketanji Brown Jackson,But it's not that determination that's making...,0,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ...",<speaker>Anthony A. Yang</speaker><text> Mr. C...,<speaker>Anthony A. Yang</speaker><text> Mr. C...


In [None]:
def get_full_text(row):
    """
    Return the full-text value for the party this question was addressed to.

    Looks at `row['question_addressee']`: 'petitioner' selects
    `row['petitioner_full_text']`, 'respondent' selects
    `row['respondent_full_text']`, and any other value yields None.
    Written to be used row-wise, e.g. `df.apply(get_full_text, axis=1)`.
    """
    source_column = {
        'petitioner': 'petitioner_full_text',
        'respondent': 'respondent_full_text',
    }.get(row['question_addressee'])
    # Unknown addressee: no column to read from.
    if source_column is None:
        return None
    return row[source_column]

def get_opening_text(row):
    """
    Return the opening statement for the party this question was addressed to.

    Looks at `row['question_addressee']`: 'petitioner' selects
    `row['petitioner_opening_text']`, 'respondent' selects
    `row['respondent_opening_statement']`, and any other value yields None.
    Written to be used row-wise, e.g. `df.apply(get_opening_text, axis=1)`.
    """
    source_column = {
        'petitioner': 'petitioner_opening_text',
        'respondent': 'respondent_opening_statement',
    }.get(row['question_addressee'])
    # Unknown addressee: no column to read from.
    if source_column is None:
        return None
    return row[source_column]

# Build the two addressee-specific columns row by row, opening statement first
# and full text second, storing each on merged_df as it is computed.
for new_column, pick_for_row in (
    ('opening_statement', get_opening_text),
    ('full_text', get_full_text),
):
    merged_df[new_column] = merged_df.apply(pick_for_row, axis=1)
merged_df.head()

In [62]:
# Keep only the six columns wanted in the output file, in this order.
columns = [
    'transcript_id',
    'question_addressee',
    'justice',
    'question_text',
    'opening_statement',
    'full_text',
]
merged_df_questions = merged_df.loc[:, columns]
merged_df_questions.head()

Unnamed: 0,transcript_id,question_addressee,justice,question_text,opening_statement,full_text
0,2024.23-621-t01,petitioner,Clarence Thomas,You --can a consent decree or a default judgm...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
1,2024.23-621-t01,petitioner,Clarence Thomas,But I thought your argument hinged on a court...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
2,2024.23-621-t01,petitioner,"John G. Roberts, Jr.",What do you do with the formulation by your f...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
3,2024.23-621-t01,petitioner,Elena Kagan,"Well, it's -- it's true that it's only a lik...",<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."
4,2024.23-621-t01,petitioner,Ketanji Brown Jackson,But it's not that determination that's making...,<speaker>Erika L. Maley</speaker><text> Mr. Ch...,"<speaker>John G. Roberts, Jr.</speaker><text> ..."


In [63]:
# Write the selected columns to disk; index=False keeps the row index out of the file.
out_fp = '2024_all_questions_full_text_merged.csv'
merged_df_questions.to_csv(path_or_buf=out_fp, index=False)

In [104]:
# Print a summary of the exported frame: row count plus per-column non-null counts and dtypes.
merged_df_questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 883 entries, 0 to 882
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   transcript_id       883 non-null    object
 1   question_addressee  883 non-null    object
 2   justice             883 non-null    object
 3   question_text       883 non-null    object
 4   opening_statement   883 non-null    object
 5   full_text           883 non-null    object
dtypes: object(6)
memory usage: 41.5+ KB
