In [None]:
%%capture
!pip install openai

In [1]:
import numpy as np
# import openai
import os
import pandas as pd
from   pathlib import Path
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

from google.colab import auth
import gspread
from google.auth import default

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

Mounted at /content/drive


In [3]:
interp_dir = '/content/drive/MyDrive/legal_interpretation/code/generative_testing'
train_test_dir = os.path.join(interp_dir, 'train_test_splits', 'train_test_splits_2')
output_path = os.path.join(interp_dir, 'chain_of_thought_results')
generated_output_path = os.path.join(output_path, 'generations')
descriptive_errors_dir = os.path.join(output_path, 'errors')

In [4]:
worksheet = gc.open('final_cleaned_paragraphs').sheet1
rows = worksheet.get_all_values()
interpretation_df = pd.DataFrame(rows)

In [5]:
interpretation_df.columns = interpretation_df.iloc[0]
interpretation_df = interpretation_df.iloc[1:]

In [None]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff



In [None]:
openai.api_key = open(os.path.join(interp_dir, 'private', 'openai_key.txt')).read().strip()

In [None]:
# intro_statement = "Some paragraphs in court cases interpret statutes. Within interpretation, there are two types:  GRAND and FORMAL."
# formal_description = "FORMAL theory is a legal decision made according to a rule, often viewing the law as a closed and mechanical system. It screens the decision-maker off from the political, social, and economic choices involved in the decision. "
# grand_description = "GRAND theory is legal decision that views law as an open-ended and on-going enterprise for the production and improvement of decisions that make sense on their face and in light of political, social, and economic factors."
# none_description = "NONE is assigned to a passage or mode of reasoning that does not reflect either the Grand or Formal approaches. Note that this coding would include areas of substantive law outside of statutory interpretation, including procedural matters. "
# prompt_text = "Determine the type of legal interpretation in the following passage. Return a single choice from GRAND, FORMAL, or NONE."

prompt_text = "Some paragraphs in court cases interpret statutes. Within interpretation, there are two types:  grand and formal. \n\nGrand interpretation represents a legal decision that views law as an open-ended and on-going enterprise for the production and improvement of decisions that make sense on their face and in light of political, social, and economic factors.  \n\nFormal interpretation is a legal decision made according to a rule, often viewing the law as a closed and mechanical system. It screens the decision-maker off from the political, social, and economic choices involved in the decision.  \n\nLet's analyze the following passage step-by-step. First, determine if it interprets a statute. Second, if it interprets a statute, determine whether the interpretation is grand or formal. The first word in your response should label the passage with \"GRAND\", \"FORMAL\", or \"NONE\" and then explain why you chose that label. \n\n"

In [None]:
print(prompt_text)

Some paragraphs in court cases interpret statutes. Within interpretation, there are two types:  grand and formal. 

Grand interpretation represents a legal decision that views law as an open-ended and on-going enterprise for the production and improvement of decisions that make sense on their face and in light of political, social, and economic factors.  

Formal interpretation is a legal decision made according to a rule, often viewing the law as a closed and mechanical system. It screens the decision-maker off from the political, social, and economic choices involved in the decision.  

Let's analyze the following passage step-by-step. First, determine if it interprets a statute. Second, if it interprets a statute, determine whether the interpretation is grand or formal. The first word in your response should label the passage with "GRAND", "FORMAL", or "NONE" and then explain why you chose that label. 




In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_descriptive_interp(text, prompt_text):
  full_prompt = f"{prompt_text} ### \n\n Text: {text} \n\n ###"
  response = openai.ChatCompletion.create(
      model='gpt-4',
      max_tokens=2,
      messages = [{'role': 'user', 'content': full_prompt}]
  )
  return response['choices'][0]['message']['content'].strip().lower()


In [6]:
interpretation_df = interpretation_df[interpretation_df['class'].notna()]

In [7]:
interpretation_df.sample(5)

Unnamed: 0,Unnamed: 1,section_id,paragraph,annotator_name,old_class,class,low_confidence,week,houston,josiah,...,_task_hash,_view_id,config,accept,answer,_timestamp,_annotator_id,_session_id,user_input,flagged
2047,477,6142476_3,The Reno Indian Colony is composed of several ...,michael,,NONE,False,8,,,...,-1482884239.0,blocks,{'choice_style': 'multiple'},['NONE'],accept,1681695884.0,michael_sample_8-michael,michael_sample_8-michael,,
69,0,6931705_6,Nor can we accept respondents’ contention that...,,,NONE,True,6,NONE,NONE,...,,,,,,,,,,
640,642,547898_39,The Amalgamated Clothing Workers of America is...,houston,,NONE,False,6,,,...,-500839881.0,blocks,{'choice_style': 'multiple'},['NONE'],accept,1679938679.0,houston_sample_6-houston,houston_sample_6-houston,,
112,122,3573000_16,"Counsel agreed, while adding that Rita’s poor ...",josiah,,NONE,False,5,,,...,-461658340.0,blocks,{'choice_style': 'multiple'},['NONE'],accept,1679362600.0,josiah_sample_5-josiah,josiah_sample_5-josiah,,
1511,666,3382397_11,"But the possession then- rightful may, by the ...",sarah,,FORMAL,False,7,,,...,-1015963461.0,blocks,{'choice_style': 'multiple'},['FORMAL'],accept,1681180739.0,sarah_sample_7-sarah,sarah_sample_7-sarah,,


In [8]:
macro_f1_l = []
macro_precision_l = []
macro_recall_l = []

weighted_f1_l = []
weighted_precision_l = []
weighted_recall_l = []

grand_f1_l = []
grand_precision_l = []
grand_recall_l = []

formal_f1_l = []
formal_precision_l = []
formal_recall_l = []

none_f1_l = []
none_precision_l = []
none_recall_l = []

In [9]:
full_df = pd.DataFrame()

In [None]:
for split in range(0, 5): # limit to first five splits
  start_time = time.time()

  split_id_file = os.path.join(train_test_dir, f'split_{split}')

  with open(split_id_file, 'r') as file:
      train_ids = file.read().split("\n")

  interpretation_train_df = interpretation_df[interpretation_df["section_id"].isin(train_ids)]
  interpretation_test_df = interpretation_df[~interpretation_df["section_id"].isin(train_ids)]


  X_test = interpretation_test_df["paragraph"].to_list()
  y_test = interpretation_test_df["class"].to_list()

  total = len(X_test)

  predicted_labels = []
  for i, text in enumerate(X_test):
    prediction = get_descriptive_interp(text, prompt_text).upper()
    predicted_labels.append(prediction)

    if i % 50 == 0:
      precent = round((i/total)*100, 2)
      print(f"{precent}% through processing.")

  with open(os.path.join(generated_output_path, f'predictions_{split}.txt'), 'w') as file:
     for label in predicted_labels:
        file.write(f"{label}\n")

  predictions_df = pd.DataFrame(
  {'section_id': interpretation_test_df["section_id"].tolist(),
    'gold': y_test,
    'predicted': predicted_labels,
    'text': X_test
  })
  errors_df = predictions_df.query('gold != predicted')
  errors_df.to_csv(os.path.join(descriptive_errors_dir, f"{split}_errors.csv"))

  class_report = classification_report(y_test, predicted_labels, output_dict=True)

  sample_dict = {
      "model": "descriptive_generative",
      "split": split,

      "macro_f1": round(class_report["macro avg"]["f1-score"], 3),
      "macro_precision": round(class_report["macro avg"]["precision"], 3),
      "macro_recall": round(class_report["macro avg"]["recall"], 3),

      "weighted_f1": round(class_report["weighted avg"]["f1-score"], 3),
      "weighted_precision": round(class_report["weighted avg"]["precision"], 3),
      "weighted_recall": round(class_report["weighted avg"]["recall"], 3),

      "grand_f1": round(class_report["GRAND"]["f1-score"], 3),
      "grand_precision": round(class_report["GRAND"]["precision"], 3),
      "grand_recall": round(class_report["GRAND"]["recall"], 3),

      "formal_f1": round(class_report["FORMAL"]["f1-score"], 3),
      "formal_precision": round(class_report["FORMAL"]["precision"], 3),
      "formal_recall": round(class_report["FORMAL"]["recall"], 3),

      "none_f1": round(class_report["NONE"]["f1-score"], 3),
      "none_precision": round(class_report["NONE"]["precision"], 3),
      "none_recall": round(class_report["NONE"]["recall"], 3),
  }

  new_row = pd.DataFrame(sample_dict, index = [0])
  full_df = pd.concat([full_df, new_row])


  macro_f1_l.append(class_report["macro avg"]["f1-score"])
  macro_precision_l.append(class_report["macro avg"]["precision"])
  macro_recall_l.append(class_report["macro avg"]["recall"])

  weighted_f1_l.append(class_report["weighted avg"]["f1-score"])
  weighted_precision_l.append(class_report["weighted avg"]["precision"])
  weighted_recall_l.append(class_report["weighted avg"]["recall"])

  grand_f1_l.append(class_report["GRAND"]["f1-score"])
  grand_precision_l.append(class_report["GRAND"]["precision"])
  grand_recall_l.append(class_report["GRAND"]["recall"])

  formal_f1_l.append(class_report["FORMAL"]["f1-score"])
  formal_precision_l.append(class_report["FORMAL"]["precision"])
  formal_recall_l.append(class_report["FORMAL"]["recall"])

  none_f1_l.append(class_report["NONE"]["f1-score"])
  none_precision_l.append(class_report["NONE"]["precision"])
  none_recall_l.append(class_report["NONE"]["recall"])

  end_time = time.time()

  total_minutes = round((end_time - start_time) / 60, 2)
  print(f"Total time: {total_minutes} minutes.")

0.0% through processing.
7.28% through processing.
14.56% through processing.
21.83% through processing.
29.11% through processing.
36.39% through processing.
43.67% through processing.
50.95% through processing.
58.22% through processing.
65.5% through processing.
72.78% through processing.
80.06% through processing.
87.34% through processing.
94.61% through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 10.13 minutes.
0.0% through processing.
7.28% through processing.
14.56% through processing.
21.83% through processing.
29.11% through processing.
36.39% through processing.
43.67% through processing.
50.95% through processing.
58.22% through processing.
65.5% through processing.
72.78% through processing.
80.06% through processing.
87.34% through processing.
94.61% through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 10.1 minutes.
0.0% through processing.
7.28% through processing.
14.56% through processing.
21.83% through processing.
29.11% through processing.
36.39% through processing.
43.67% through processing.
50.95% through processing.
58.22% through processing.
65.5% through processing.
72.78% through processing.
80.06% through processing.
87.34% through processing.
94.61% through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 8.47 minutes.
0.0% through processing.
7.28% through processing.
14.56% through processing.
21.83% through processing.
29.11% through processing.
36.39% through processing.
43.67% through processing.
50.95% through processing.
58.22% through processing.
65.5% through processing.
72.78% through processing.
80.06% through processing.
87.34% through processing.
94.61% through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 8.47 minutes.
0.0% through processing.
7.28% through processing.
14.56% through processing.
21.83% through processing.
29.11% through processing.
36.39% through processing.
43.67% through processing.
50.95% through processing.
58.22% through processing.
65.5% through processing.
72.78% through processing.
80.06% through processing.
87.34% through processing.
94.61% through processing.
Total time: 8.75 minutes.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
import string
for split in range(0, 5):

  split_id_file = os.path.join(train_test_dir, f'split_{split}')

  with open(split_id_file, 'r') as file:
      train_ids = file.read().split("\n")

  interpretation_train_df = interpretation_df[interpretation_df["section_id"].isin(train_ids)]
  interpretation_test_df = interpretation_df[~interpretation_df["section_id"].isin(train_ids)]


  X_test = interpretation_test_df["paragraph"].to_list()
  y_test = interpretation_test_df["class"].to_list()

  with open(os.path.join(generated_output_path, f'predictions_{split}.txt'), 'r') as file:
    predicted_labels = [line.rstrip() for line in file]

  predicted_labels = [label.translate(str.maketrans('', '', string.punctuation)) for label in predicted_labels]

  class_report = classification_report(y_test, predicted_labels, output_dict=True)

  sample_dict = {
      "model": "class_examples",
      "split": split,

      "macro_f1": round(class_report["macro avg"]["f1-score"], 3),
      "macro_precision": round(class_report["macro avg"]["precision"], 3),
      "macro_recall": round(class_report["macro avg"]["recall"], 3),

      "weighted_f1": round(class_report["weighted avg"]["f1-score"], 3),
      "weighted_precision": round(class_report["weighted avg"]["precision"], 3),
      "weighted_recall": round(class_report["weighted avg"]["recall"], 3),

      "grand_f1": round(class_report["GRAND"]["f1-score"], 3),
      "grand_precision": round(class_report["GRAND"]["precision"], 3),
      "grand_recall": round(class_report["GRAND"]["recall"], 3),

      "formal_f1": round(class_report["FORMAL"]["f1-score"], 3),
      "formal_precision": round(class_report["FORMAL"]["precision"], 3),
      "formal_recall": round(class_report["FORMAL"]["recall"], 3),

      "none_f1": round(class_report["NONE"]["f1-score"], 3),
      "none_precision": round(class_report["NONE"]["precision"], 3),
      "none_recall": round(class_report["NONE"]["recall"], 3),

  }

  new_row = pd.DataFrame(sample_dict, index = [0])
  full_df = pd.concat([full_df, new_row])

  macro_f1_l.append(class_report["macro avg"]["f1-score"])
  macro_precision_l.append(class_report["macro avg"]["precision"])
  macro_recall_l.append(class_report["macro avg"]["recall"])

  weighted_f1_l.append(class_report["weighted avg"]["f1-score"])
  weighted_precision_l.append(class_report["weighted avg"]["precision"])
  weighted_recall_l.append(class_report["weighted avg"]["recall"])

  grand_f1_l.append(class_report["GRAND"]["f1-score"])
  grand_precision_l.append(class_report["GRAND"]["precision"])
  grand_recall_l.append(class_report["GRAND"]["recall"])

  formal_f1_l.append(class_report["FORMAL"]["f1-score"])
  formal_precision_l.append(class_report["FORMAL"]["precision"])
  formal_recall_l.append(class_report["FORMAL"]["recall"])

  none_f1_l.append(class_report["NONE"]["f1-score"])
  none_precision_l.append(class_report["NONE"]["precision"])
  none_recall_l.append(class_report["NONE"]["recall"])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
macro_f1 = sum(macro_f1_l) / len(macro_f1_l)
macro_precision = sum(macro_precision_l) / len(macro_precision_l)
macro_recall = sum(macro_recall_l) / len(macro_recall_l)

weighted_f1 = sum(weighted_f1_l) / len(weighted_f1_l)
weighted_precision = sum(weighted_precision_l) / len(weighted_precision_l)
weighted_recall = sum(weighted_recall_l) / len(weighted_recall_l)

grand_f1 = sum(grand_f1_l) / len(grand_f1_l)
grand_precision = sum(grand_precision_l) / len(grand_precision_l)
grand_recall = sum(grand_recall_l) / len(grand_recall_l)

formal_f1 = sum(formal_f1_l) / len(formal_f1_l)
formal_precision = sum(formal_precision_l) / len(formal_precision_l)
formal_recall = sum(formal_recall_l) / len(formal_recall_l)

none_f1 = sum(none_f1_l) / len(none_f1_l)
none_precision = sum(none_precision_l) / len(none_precision_l)
none_recall = sum(none_recall_l) / len(none_recall_l)



In [12]:
model_dict = {
    "model": "descriptive_generative",
    "split": "averages",

    "macro_f1": round(macro_f1, 3),
    "macro_precision": round(macro_precision, 3),
    "macro_recall": round(macro_recall, 3),

    "weighted_f1": round(weighted_f1, 3),
    "weighted_precision": round(weighted_precision, 3),
    "weighted_recall": round(weighted_recall, 3),

    "grand_f1": round(grand_f1, 3),
    "grand_precision": round(grand_precision, 3),
    "grand_recall": round(grand_recall, 3),

    "formal_f1": round(formal_f1, 3),
    "formal_precision": round(formal_precision, 3),
    "formal_recall": round(formal_recall, 3),

    "none_f1": round(none_f1, 3),
    "none_precision": round(none_precision, 3),
    "none_recall": round(none_recall, 3),
}

new_row = pd.DataFrame(model_dict, index = [0])
full_df = pd.concat([full_df, new_row])

# full_df.to_csv(os.path.join(output_path, 'gpt_generated_results.csv'))



In [13]:
averages_df = full_df[full_df["split"] == "averages"]

In [14]:
averages_df = averages_df.drop(columns = ["weighted_f1", "weighted_precision", "weighted_recall"])

In [15]:
print(averages_df.to_latex(
                  formatters={"name": str.upper},
                  float_format="{:.2f}".format,
                  index = False
                  ))

\begin{tabular}{llrrrrrrrrrrrr}
\toprule
                 model &    split &  macro\_f1 &  macro\_precision &  macro\_recall &  grand\_f1 &  grand\_precision &  grand\_recall &  formal\_f1 &  formal\_precision &  formal\_recall &  none\_f1 &  none\_precision &  none\_recall \\
\midrule
descriptive\_generative & averages &      0.34 &             0.37 &          0.37 &      0.25 &             0.50 &          0.17 &       0.43 &              0.32 &           0.67 &     0.78 &            0.80 &         0.76 \\
\bottomrule
\end{tabular}



  print(averages_df.to_latex(
