In [None]:
"""
Authors: Markela Zeneli & Mindy Ng
"""

In [129]:
from collections import Counter
import re
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv

from openai import OpenAI
import goodfire

load_dotenv()

True

In [2]:
df = pd.read_csv('justice.csv')

In [3]:
df.columns

Index(['Unnamed: 0', 'ID', 'name', 'href', 'docket', 'term', 'first_party',
       'second_party', 'facts', 'facts_len', 'majority_vote', 'minority_vote',
       'first_party_winner', 'decision_type', 'disposition', 'issue_area'],
      dtype='object')

In [4]:
df['decision_type'].value_counts()

decision_type
majority opinion                     2829
per curiam                            267
plurality opinion                     153
equally divided                        17
dismissal - rule 46                     9
dismissal - other                       8
dismissal - improvidently granted       6
dismissal - moot                        5
memorandum                              1
opinion of the court                    1
Name: count, dtype: int64

In [5]:
df['issue_area'].value_counts()

issue_area
Criminal Procedure      859
Civil Rights            568
Economic Activity       542
First Amendment         353
Judicial Power          342
Due Process             128
Federalism              125
Privacy                  70
Unions                   60
Federal Taxation         51
Attorneys                37
Miscellaneous            20
Private Action            4
Interstate Relations      2
Name: count, dtype: int64

In [6]:
majority_opinions = df[(df["decision_type"] == "majority opinion")&(df["facts_len"] >= 2500)][["name","term","facts","decision_type","first_party","second_party","first_party_winner","issue_area","facts_len"]]

In [8]:
majority_opinions["issue_area"].value_counts()

issue_area
Civil Rights          11
Economic Activity      9
Criminal Procedure     7
Judicial Power         6
Federalism             2
First Amendment        2
Federal Taxation       1
Unions                 1
Miscellaneous          1
Name: count, dtype: int64

In [8]:
facts = majority_opinions['facts']

In [None]:
facts = facts.str.replace('<p>','')
facts = facts.str.replace('<p dir="ltr">','')
facts = facts.str.replace('</p>','')
facts = facts.str.replace('/n','')
facts = facts.str.replace('<em>','')
facts = facts.str.replace('</em>','')

In [13]:
terms = majority_opinions['term'].astype(str)

In [17]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url="https://api.deepseek.com")

llm_reas_judge = []

for f,t in zip(facts,terms):
    persona = f""""
        Your task is to clean the text that proceeds the next appearing ':' character. 
        To clean it, you must remove all information that indicates the Supreme Court's decision. 
        Please note that there are other decisions and verdicts mentioned in the text, which should 
        be retained. It is only the Supreme Court decisions which should be ommitted. Additionally, 
        any Supreme Court decisions that were made prior to the year {t} (if specified), should be retained, not removed.
        This may require splitting a sentence and only removing the judgement part of the 
        sentence, whilst still retaining all other information in the sentence.
        The text to clean is:

    """
    content = persona + f

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content},
        ],
        stream=False
    )

    matches = re.search(r'\n\n---\n\n(.*?)\n\n---\n\n', str(response.choices[0].message.content), re.DOTALL)

    if matches:
        result = matches.group(1)
        llm_reas_judge.append(result)
    else:
        print("No content found between the specified substrings.")
        print(response.choices[0].message.content)
        llm_reas_judge.append(np.nan)

No content found between the specified substrings.
The text provided does not contain any explicit mention of a Supreme Court decision, so no cleaning is required based on the instructions given. However, if there were a Supreme Court decision mentioned after a colon, it would be removed unless it was made prior to 2011. Since there is no such instance in the text, the text remains unchanged. Here is the original text for reference:

---

Plaintiffs Stephen R. Chandler and Robert L. Pierce were the sole shareholders of Home Oil and Coal Company, Inc. In 1999, Pierce contemplated selling his share of the business and sought professional advice in an effort to minimize tax liability generated by the sale of his interest in Home Oil. Each of the taxpayers initiated short sales of United States Treasury Bonds for $7,472,405. They then transferred the proceeds from that sale to Home Concrete as capital contributions. Home Concrete then closed the short sales by purchasing and returning esse

In [19]:
majority_opinions["deepseek_cleaned"] = llm_reas_judge

In [20]:
majority_opinions.to_csv("deepseek_cleaned.csv", index=True)

In [10]:
facts_v2 = pd.read_csv('cleaned_facts_winner.csv')

In [11]:
facts_v2.index = facts_v2['Unnamed: 0']

In [12]:
majority_opinions['facts_cleaned'] = facts_v2['facts']

In [13]:
GOODFIRE_API_KEY = os.getenv("goodfire_key")

In [14]:
client = goodfire.Client(api_key=GOODFIRE_API_KEY)

# Instantiate a model variant. 
variant = goodfire.Variant("meta-llama/Llama-3.3-70B-Instruct")

In [15]:
year = majority_opinions['term']
first_party = majority_opinions['first_party']
second_party = majority_opinions['second_party']
facts = majority_opinions['facts_cleaned']

In [None]:
llama_one_pass = []
for y, f, s, fa in zip(year, first_party, second_party, facts):
    response = ""  # Initialize an empty string to collect response
    
    for token in client.chat.completions.create(
        [{"role": "user", "content": 
        f'''
        You are a supreme court justice delivering a decision on a case. 
        
        I will give you the year of the case, the first party, second party and facts of the case. 
        
        Please determine whether you will rule in favor of the first party.
        
        Return an answer of TRUE if you decide in favor of the first party, or FALSE if not. 
        
        FALSE is a catch-all response for any scenario other than a ruling in favor of the first party.

        In your response, please give your answer (either TRUE or FALSE) as the first part of a response, then
        a semicolon ";", and then follow with your reasoning for why you gave your answer.

        In your reasoning, do not use any information or examples from after the year of the case. 
         
        This is the case:
         
        Year: {y}
        First party: {f}
        Second Party: {s}
        Facts: {fa}
        '''
        }],
        model=variant,
        stream=True,
    ):
        response += token.choices[0].delta.content  # Append tokens to a string
    
    llama_one_pass.append(response)  # Append the full response instead of tokens

In [None]:
llama_multi_pass = []

for y, f, s, fa in zip(year, first_party, second_party, facts):
    case_pass = []

    for iteration in range(20):
        response = ""
        
        for token in client.chat.completions.create(
            [{"role": "user", "content": f"""
            
            You are a supreme court justice delivering a decision on a case. 
        
            I will give you the year of the case, the first party, second party and facts of the case. 
            
            Please determine whether you will rule in favor of the first party.
            
            Return an answer of TRUE if you decide in favor of the first party, or FALSE if not. 
            
            FALSE is a catch-all response for any scenario other than a ruling in favor of the first party.

            In your response, please give your answer (either TRUE or FALSE) as the first part of a response, then
            a semicolon ";", and then follow with your reasoning for why you gave your answer.

            In your reasoning, do not use any information or examples from after the year of the case. 
             
            This is the case:
            Year: {y}
            First party: {f}
            Second Party: {s}
            Facts: {fa}
            """}],
            model=variant,
            stream=True,
        ):
            response += token.choices[0].delta.content

        case_pass.append(response)

    llama_multi_pass.append(case_pass)

In [None]:
llama_reasonings = []
for i in llama_one_pass:
    if ";" in i:
        llama_reasonings.append(i.split(";")[1])
    else:
        llama_reasonings.append(i)

In [None]:
llama_predictions = []

for i in llama_one_pass:
    llama_predictions.append(i.split(";")[0])

In [None]:
llama_predictions_binary = []
for i in llama_predictions:
    if i == "TRUE":
        llama_predictions.append(1)
    else:
        llama_predictions.append(0)

In [35]:
majority_opinions["first_party_winner"].value_counts()

first_party_winner
True     28
False    12
Name: count, dtype: int64

In [41]:
first_party_winner_binary = []

for i in majority_opinions["first_party_winner"].tolist():
    if i == True:
        first_party_winner_binary.append(1)
    else:
        first_party_winner_binary.append(0)

In [None]:
majority_opinions["first_party_winner_binary"] = first_party_winner_binary
majority_opinions["llama_prediction"] = llama_predictions_binary

In [43]:
majority_opinions["first_party_winner_binary"].value_counts()

first_party_winner_binary
1    28
0    12
Name: count, dtype: int64

In [None]:
contingency_matrix = pd.crosstab(majority_opinions["llama_prediction"], majority_opinions["first_party_winner_binary"])

print(contingency_matrix)

first_party_winner_binary   0   1
llm_winner_prediction            
0                          10  19
1                           2   9


In [None]:
majority_opinions["llama_reasoning"] = llama_reasonings
majority_opinions.to_csv("single_pass_predictions_and_reasonings.csv",index=True)

In [None]:
multi_pass_answers = []
multi_pass_reasonings = []

for i in llama_multi_pass:
    case_answers = []
    case_reasonings = []
    for j in i:
        if ";" in j:
            case_answers.append(j.split(";")[0])
            case_reasonings.append(j.split(";")[1])
        else:
            case_answers.append("FALSE")
            case_reasonings.append(j)
    multi_pass_answers.append(case_answers)
    multi_pass_reasonings.append(case_reasonings)


In [None]:
consistency_scores = []

for i in multi_pass_answers:
    counter = Counter(i)
    consistency_scores.append((counter.most_common(1)[0][1]/20)*100)


[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]


In [132]:
majority_opinions["llama_consistency_scores"] = consistency_scores

In [133]:
majority_opinions.to_csv("llama_consistency_scores.csv", index=True)

In [134]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"), base_url="https://api.deepseek.com")

multi_pass_deepseek = []

for y, f, s, fa in zip(year, first_party, second_party, facts):
    case_pass = []

    for iteration in range(20):

        persona = f""""
            You are a Supreme Court Justice delivering a decision on a case. 
        
            I will give you the year of the case, the first party, second party and facts of the case. 
            
            Please determine whether you will rule in favor of the first party.
            
            Return an answer of TRUE if you decide in favor of the first party, or FALSE if not. 
            
            FALSE is a catch-all response for any scenario other than a ruling in favor of the first party.

            In your response, please give your answer (either TRUE or FALSE) as the first part of a response, then
            a semicolon ";", and then follow with your reasoning for why you gave your answer.

            In your reasoning, do not use any information or examples from after the year of the case. 
             
            This is the case:
            Year: {y}
            First party: {f}
            Second Party: {s}
            Facts: {fa}

        """
        content = persona + f

        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a Supreme Court Justice."},
                {"role": "user", "content": content},
            ],
            stream=False
        )

        case_pass.append(response)

    multi_pass_deepseek.append(case_pass)

In [143]:
deepseek_multi_pass_answers = []
deepseek_multi_pass_reasonings = []

for i in multi_pass_deepseek:
    case_answers = []
    case_reasonings = []
    for j in i:
        if ";" in j.choices[0].message.content:
            case_answers.append(j.choices[0].message.content.split(";")[0])
            case_reasonings.append(j.choices[0].message.content.split(";")[1])
        else:
            case_answers.append("FALSE")
            case_reasonings.append(j.choices[0].message.content)
    deepseek_multi_pass_answers.append(case_answers)
    deepseek_multi_pass_reasonings.append(case_reasonings)

In [144]:
consistency_scores_deepseek = []

for i in deepseek_multi_pass_answers:
    counter = Counter(i)
    consistency_scores_deepseek.append((counter.most_common(1)[0][1]/20)*100)

print(consistency_scores_deepseek)

[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 85.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 65.0, 100.0, 100.0, 90.0, 100.0, 100.0, 100.0]


In [187]:
majority_opinions["deepseek_consistency_scores"] = consistency_scores_deepseek

In [151]:
majority_opinions["llama_consistency_reasonings"] = multi_pass_reasonings
majority_opinions["deepseek_consistency_reasonings"] = deepseek_multi_pass_reasonings

In [142]:
deepseek_multi_pass_reasonings[0][0].choices[0].message.content

"FALSE; The reasoning for this decision is based on the facts presented and the legal framework of the Education of the Handicapped Act (now known as the Individuals with Disabilities Education Act, or IDEA). The Act mandates that state and local education agencies provide handicapped children with a free and appropriate public education (FAPE), which includes the development of an Individualized Education Program (IEP) tailored to the child's needs. In this case, the Massachusetts Department of Education’s Bureau of Special Education Appeals (BSEA) determined that the town’s proposed placement at Pine Glen School was inappropriate and that the Carroll School was better equipped to meet Michael Panico's needs. The BSEA ordered the town to pay for Michael’s tuition and transportation, as well as reimburse the Panicos for expenses already incurred.\n\nThe district court initially ruled in favor of the town, but the U.S. Court of Appeals for the First Circuit reversed this decision, findi

In [139]:
multi_pass_reasonings[1]

[' I rule in favor of Dethorne Graham because the facts of the case suggest that the police officers used excessive force in handling Graham, who was experiencing a medical emergency and posed no threat to the officers. The officers\' actions, including shoving Graham\'s head into the hood of the car and struggling to place him in the squad car, were not objectively reasonable given the circumstances. The Fourth Amendment\'s protection against unreasonable seizures requires that the force used by law enforcement be proportionate to the threat posed by the individual, and in this case, it appears that the officers\' use of force was excessive and violated Graham\'s rights. The "objective reasonableness" standard, which considers the totality of the circumstances from the perspective of a reasonable officer on the scene, is the appropriate test to apply in this case, rather than the four-part test from Johnson v. Glick.',
 ' I rule in favor of Dethorne Graham because the facts of the cas

In [145]:
deepseek_single_pass_answers = []
deepseek_single_pass_reasonings = []

for i in multi_pass_deepseek:
    if ";" in i[0].choices[0].message.content:
        deepseek_single_pass_answers.append(i[0].choices[0].message.content.split(";")[0])
        deepseek_single_pass_reasonings.append(i[0].choices[0].message.content.split(";")[1])
    else:
        deepseek_single_pass_answers.append("FALSE")
        deepseek_single_pass_reasonings.append(i[0].choices[0].message.content)

In [174]:
deepseek_single_pass_answers_binary = []

for i in deepseek_single_pass_answers:
    if i == "TRUE":
        deepseek_single_pass_answers_binary.append(1)
    else:
        deepseek_single_pass_answers_binary.append(0)

In [175]:
majority_opinions["deepseek_prediction"] = deepseek_single_pass_answers_binary
majority_opinions["deepseek_reasoning"] = deepseek_single_pass_reasonings

In [156]:
llama_consistency_predictions = []
for i in multi_pass_answers:
    case_answers = []
    for j in i:
        if j == "TRUE":
            case_answers.append(1)
        else:
            case_answers.append(0)
    llama_consistency_predictions.append(case_answers)

In [157]:
deepseek_consistency_predictions = []
for i in deepseek_multi_pass_answers:
    case_answers = []
    for j in i:
        if j == "TRUE":
            case_answers.append(1)
        else:
            case_answers.append(0)
    deepseek_consistency_predictions.append(case_answers)

In [158]:
majority_opinions["llama_consistency_predictions"] = llama_consistency_predictions
majority_opinions["deepseek_consistency_predictions"] = deepseek_consistency_predictions

In [188]:
all_results = majority_opinions.copy()

In [189]:
all_results.rename(columns={'llm_winner_prediction':'llama_prediction',
       'first_party_winner_reasonings':'llama_reasoning'}, inplace=True)

In [190]:
all_results.columns

Index(['name', 'term', 'facts', 'decision_type', 'first_party', 'second_party',
       'first_party_winner', 'issue_area', 'facts_len', 'facts_cleaned',
       'first_party_winner_binary', 'llama_prediction', 'llama_reasoning',
       'llama_consistency_scores', 'deepseek_consistency_scores',
       'deepseek_prediction', 'deepseek_reasoning',
       'llama_consistency_reasonings', 'deepseek_consistency_reasonings',
       'llama_consistency_predictions', 'deepseek_consistency_predictions'],
      dtype='object')

In [191]:
all_results = all_results[['name', 'term', 'facts', 'decision_type', 'first_party', 'second_party',
       'first_party_winner', 'issue_area', 'facts_len', 'facts_cleaned',
       'first_party_winner_binary', 'llama_prediction', 'llama_reasoning',
       'llama_consistency_predictions', 'llama_consistency_reasonings', 'llama_consistency_scores',
       'deepseek_prediction', 'deepseek_reasoning', 'deepseek_consistency_predictions','deepseek_consistency_reasonings', 'deepseek_consistency_scores']]

In [192]:
all_results.columns

Index(['name', 'term', 'facts', 'decision_type', 'first_party', 'second_party',
       'first_party_winner', 'issue_area', 'facts_len', 'facts_cleaned',
       'first_party_winner_binary', 'llama_prediction', 'llama_reasoning',
       'llama_consistency_predictions', 'llama_consistency_reasonings',
       'llama_consistency_scores', 'deepseek_prediction', 'deepseek_reasoning',
       'deepseek_consistency_predictions', 'deepseek_consistency_reasonings',
       'deepseek_consistency_scores'],
      dtype='object')

In [193]:
all_results.to_csv('final_results.csv', index=True)

In [207]:
all_results[all_results['deepseek_consistency_scores']<100][['first_party_winner_binary','deepseek_consistency_predictions', 'deepseek_consistency_reasonings', 'deepseek_consistency_scores']]

Unnamed: 0,first_party_winner_binary,deepseek_consistency_predictions,deepseek_consistency_reasonings,deepseek_consistency_scores
3086,1,"[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, ...",[ The Supreme Court ruled in favor of Fane Loz...,85.0
3218,1,"[0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ...",[ The Supreme Court would likely rule against ...,65.0
3264,1,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",[ The reasoning for this decision is based on ...,90.0


In [198]:
contingency_matrix = pd.crosstab(all_results["deepseek_prediction"], all_results["first_party_winner_binary"])

print(contingency_matrix)

first_party_winner_binary   0   1
deepseek_prediction              
0                          12  22
1                           0   6


In [201]:
for i in all_results.loc[3086, "deepseek_consistency_reasonings"]:
    print(i)
    print ("----------")
    print ("----------")

 The Supreme Court ruled in favor of Fane Lozman, holding that a plaintiff can pursue a First Amendment retaliatory arrest claim even if there was probable cause for the arrest. The Court found that the existence of probable cause does not automatically defeat a First Amendment retaliatory arrest claim. The Court emphasized that Lozman's case presented unique circumstances, including evidence that the City had a policy of targeting him for his speech, which warranted further examination of his claim. The Court remanded the case for further proceedings consistent with its opinion.
----------
----------
 The Supreme Court ruled in favor of Fane Lozman, holding that the existence of probable cause does not necessarily bar a First Amendment retaliatory arrest claim. The Court emphasized that Lozman's case presented unique circumstances where the arrest was allegedly motivated by retaliation for his protected speech, rather than solely based on probable cause. The Court found that Lozman ha

In [202]:
for i in all_results.loc[3218, "deepseek_consistency_reasonings"]:
    print(i)
    print ("----------")
    print ("----------")

 The Supreme Court would likely rule against Noris Babb based on the legal standards in place at the time of the case. Under Title VII of the Civil Rights Act of 1964 and the Age Discrimination in Employment Act of 1967 (ADEA), federal sector employees like Babb must demonstrate that discrimination or retaliation was a "but for" cause of the adverse employment action. The Eleventh Circuit, bound by precedent, affirmed the district court's decision that Babb failed to meet this burden. The Supreme Court, adhering to the established legal framework, would likely uphold the lower courts' rulings, as Babb did not provide sufficient evidence to prove that discrimination or retaliation was the decisive factor in the VA's decision-making process.
----------
----------
 The facts presented indicate that Noris Babb was subjected to potential gender-plus-age discrimination and retaliation for her participation in protected Equal Employment Opportunity (EEO) activities. The denial of opportunitie

In [203]:
for i in all_results.loc[3264, "deepseek_consistency_reasonings"]:
    print(i)
    print ("----------")
    print ("----------")

 The reasoning for this decision is based on the procedural posture and the legal principles at play in 2019. The case involves the expansion of exemptions to the contraceptive coverage requirement under the ACA, which was challenged by the Commonwealth of Pennsylvania and the State of New Jersey. The district court issued a nationwide injunction against the enforcement of the expanded exemptions, and the Third Circuit affirmed this decision. The Supreme Court, in considering whether to rule in favor of The Little Sisters of the Poor Saints Peter and Paul Home, would need to evaluate whether the expanded exemptions violate the Administrative Procedure Act (APA) and other relevant laws. Given that the lower courts found the states likely to succeed on their APA claim, and considering the procedural deference typically given to lower courts' findings, the Supreme Court would likely uphold the injunction and not rule in favor of the first party. Therefore, the decision would be FALSE.
---

In [206]:
all_results["deepseek_consistency_scores"].mean()

np.float64(98.5)