# SCOTUS confirmation transcripts

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import altair as alt
from PyPDF2 import PdfFileWriter, PdfFileReader
import glob
import re
from collections import Counter

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = None

In [4]:
### https://www.govinfo.gov/collection/supreme-court-nomination-hearings

#### How many pages in each file? 

In [5]:
# path = "scotus_confirmation_transcripts/"
# all_files = glob.glob(path + "*.pdf")

# pages = []

# for filename in all_files:
#     output = PdfFileWriter()
#     input = PdfFileReader(open(f"{filename}", "rb"))
#     pages.append(
#         f"{filename.replace('scotus_confirmation_transcripts/', '').replace('.pdf', '')}: %d pages"
#         % input.getNumPages()
#     )
#     # !PDFtoText -layout {filename}

# pages

#### Clean up Jackson's individual transcripts

In [6]:
# !cat scotus_confirmation_transcripts/text/jackson*.txt > scotus_confirmation_transcripts/text/jackson.txt
# !mv scotus_confirmation_transcripts/text/jackson-*.txt scotus_confirmation_transcripts/text/jackson

#### How many words in each transcript?

In [7]:
text_files = []
words = 0

text_path = "scotus_confirmation_transcripts/text/"
text_files = glob.glob(text_path + "*.txt")

# loop to read, split and count word of each file
for f in text_files:
    file = open(f, "r", encoding="latin-1")
    read_data = file.read()
    per_word = read_data.split()
    words += len(per_word)
    print(
        f"{f.title().replace('Scotus_Confirmation_Transcripts/Text/', '').replace('.Txt', '')}:{words}"
    )

Barrett:212768
Jackson:428488
Gorsuch:699228
Kennedy:1071817
Alito:1398099
Sotomayor:1651744
Kagan:1886709
Scalia:2050488
Oconnor:2249106
Rehnquist-Chief:2718850
Rehnquist-Powell:3022196
Roberts:3317650
Thomas:4528112
Kavanaugh:5257068
Breyer:5587991
Souter:6091996
Ginsburg:6427427


#### Concatenate Thomas

In [8]:
# !cat scotus_confirmation_transcripts/text/thomas* >> scotus_confirmation_transcripts/text/thomas.txt

#### How many words in each file? 

In [9]:
listed_words = [
    "child pornography",
    "pornography",
    "predator",
    "trafficking",
    "marxist",
    "critical race theory",
    "race",
    "abortion",
    "roe",
    "transgender",
    "bathroom",
    "gender",
    "choice",
    "right to life",
    "taxes",
    "stare decisis",
    "federalism",
    "settled law",
    "founders",
    "laughter",
    "activism",
    "activist",
    "liberal",
    "conservative",
    "corporations",
    "ideologue",
    "right to privacy",
    "corporations",
    "crime",
    "rape",
    "hispanic",
    "book",
    "guantanamo",
    "gitmo",
    "woman",
]

In [10]:
text_path = "scotus_confirmation_transcripts/text/"
text_files = glob.glob(text_path + "*.txt")
listed_words_list = []

for listed_word in listed_words:
    for filename in text_files:
        file = open(f"{filename}", "rt", encoding="latin-1")
        count_specific_word = 0
        count_total_words = 0
        for line in file:
            words = line.lower().strip()
            words_clean = re.sub(r"[^\w\s]", "", words)
            count_total_words += len(words_clean)
            word_searched = listed_word
            count_specific_word += words_clean.count(word_searched)
            word_dict = {
                "word_searched": word_searched,
                "count_specific_word": count_specific_word,
                "count_total_words": count_total_words,
                "scotus_member": filename.title()
                .replace("Scotus_Confirmation_Transcripts/Text/", "")
                .replace(".Txt", ""),
            }
        listed_words_list.append(word_dict)
        file.close()
        # print(
        #     f"The word/phrase '{word_searched}' appears {count_specific_word} times out of {count_total_words} words in the {filename.title().replace('Scotus_Confirmation_Transcripts/Text/', '').replace('.Txt', '')} transcript."
        # )

In [11]:
src = pd.DataFrame(listed_words_list)

#### Word rates

In [12]:
src["rate_per_100000"] = (
    src["count_specific_word"] / src["count_total_words"]
) * 100000

In [13]:
hearing_years = {
    "Alito": "2006",
    "Barrett": "2020",
    "Breyer": "1994",
    "Ginsburg": "1993",
    "Gorsuch": "2017",
    "Jackson": "2022",
    "Kagan": "2010",
    "Kavanaugh": "2018",
    "Kennedy": "1987",
    "Oconnor": "1986",
    "Rehnquist-Chief": "1986",
    "Rehnquist-Powell": "1971",
    "Roberts": "2005",
    "Scalia": "1986",
    "Sotomayor": "2009",
    "Souter": "1990",
    "Stevens": "1975",
    "Thomas": "1991",
}

In [14]:
src["year"] = src["scotus_member"].map(hearing_years)

In [16]:
src[src["word_searched"].str.contains("gender")].sort_values(
    "rate_per_100000", ascending=False
)

Unnamed: 0,word_searched,count_specific_word,count_total_words,scotus_member,rate_per_100000,year
203,gender,230,1901236,Ginsburg,12.097393,1993
202,gender,112,2874966,Souter,3.895698,1990
190,gender,74,2110430,Kennedy,3.506394,1987
199,gender,220,6787590,Thomas,3.241209,1991
192,gender,43,1426628,Sotomayor,3.0141,2009
198,gender,36,1648775,Roberts,2.183439,2005
200,gender,71,4110358,Kavanaugh,1.727343,2018
189,gender,24,1451495,Gorsuch,1.653468,2017
188,gender,17,1195752,Jackson,1.421699,2022
196,gender,36,2598813,Rehnquist-Chief,1.385248,1986


In [17]:
alt.Chart(src[src["word_searched"] == "child pornography"]).mark_bar(
    color="#00d4d8"
).encode(
    x=alt.X(
        "scotus_member:N",
        sort=alt.EncodingSortField(field="year", order="ascending"),
    ),
    y="count_specific_word",
    facet=alt.Facet("word_searched", columns=5),
).properties(
    width=650, height=400
)

In [18]:
child_porn = src[src["word_searched"] == "child pornography"].sort_values(
    "year", ascending=False
)

In [19]:
child_porn["display"] = child_porn["scotus_member"] + " (" + child_porn["year"] + ")"

In [23]:
child_porn[["display", "count_specific_word", "count_total_words"]].to_csv(
    "data/processed/kbj-child-porn-confirmation-hearings.csv", index=False
)

In [22]:
alt.Chart(src[src["word_searched"] != "kavanaugh"]).mark_bar(color="#00d4d8").encode(
    y=alt.Y(
        "scotus_member:N",
        sort=alt.EncodingSortField(field="year", order="ascending"),
    ),
    x="count_specific_word",
    facet=alt.Facet("word_searched", columns=5),
).properties(width=150, height=150)