In [None]:
%%capture
!pip install transformers

In [1]:
import numpy as np
import os
import pandas as pd
from   pathlib import Path
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import time

In [2]:

from google.colab import drive
drive.mount('/content/drive')

from google.colab import auth
import gspread
from google.auth import default

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

Mounted at /content/drive


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "google/flan-t5-large"

In [3]:
interp_dir = '/content/drive/MyDrive/legal_interpretation/code/generative_testing'
train_test_dir = os.path.join(interp_dir, 'train_test_splits', 'train_test_splits_2')
output_path = os.path.join(interp_dir, 'flan_interpretation_results')
generated_output_path = os.path.join(output_path, 'generations')
descriptive_errors_dir = os.path.join(output_path, 'errors')

In [4]:
worksheet = gc.open('final_cleaned_paragraphs').sheet1
rows = worksheet.get_all_values()
interpretation_df = pd.DataFrame(rows)

In [5]:
interpretation_df.columns = interpretation_df.iloc[0]
interpretation_df = interpretation_df.iloc[1:]

In [None]:
prompt_text = "Some paragraphs in court cases interpret statutes. In this type of paragraph, there is an analysis of a statute and a claim made about its meaning. \n\nIn the following paragraph, determine if legal interpretation occurs. If yes, respond with \”interpretation\” and if not, respond with \”no interpretation\”"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)okenizer_config.json: 0.00B [00:00, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)cial_tokens_map.json: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [21]:

macro_f1_l = []
macro_precision_l = []
macro_recall_l = []

weighted_f1_l = []
weighted_precision_l = []
weighted_recall_l = []

one_f1_l = []
one_precision_l = []
one_recall_l = []

zero_f1_l = []
zero_precision_l = []
zero_recall_l = []

In [22]:
full_df = pd.DataFrame()

In [23]:
interpretation_df = interpretation_df[interpretation_df['class'].notna()]
interpretation_df["interpretation"] = np.where(interpretation_df["class"].isin(["FORMAL", "GRAND"]), "INTERPRETATION", "NO INTERPRETATION")

In [None]:
for split in range(0, 5): # limit to first five splits
  start_time = time.time()

  split_id_file = os.path.join(train_test_dir, f'split_{split}')

  with open(split_id_file, 'r') as file:
      train_ids = file.read().split("\n")

  interpretation_train_df = interpretation_df[interpretation_df["section_id"].isin(train_ids)]
  interpretation_test_df = interpretation_df[~interpretation_df["section_id"].isin(train_ids)]


  X_test = interpretation_test_df["paragraph"].to_list()
  y_test = interpretation_test_df["interpretation"].to_list()

  total = len(X_test)

  predicted_labels = []
  for i, text in enumerate(X_test):
    full_prompt = f"{prompt_text} ### \n\n Text: {text} \n\n ###"

    input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
    prediction = tokenizer.decode(model.generate(input_ids)[0], skip_special_tokens=True).upper()
    predicted_labels.append(prediction)

    if i % 50 == 0:
      precent = round((i/total)*100, 2)
      print(f"{precent}% through processing.")

  with open(os.path.join(generated_output_path, f'predictions_{split}.txt'), 'w') as file:
     for label in predicted_labels:
        file.write(f"{label}\n")

  predictions_df = pd.DataFrame(
  {'section_id': interpretation_test_df["section_id"].tolist(),
    'gold': y_test,
    'predicted': predicted_labels,
    'text': X_test
  })
  errors_df = predictions_df.query('gold != predicted')
  errors_df.to_csv(os.path.join(descriptive_errors_dir, f"{split}_errors.csv"))

  class_report = classification_report(y_test, predicted_labels, output_dict=True)


  sample_dict = {
      "model": "flan_interpretation",
      "split": split,

      "macro_f1": round(class_report["macro avg"]["f1-score"], 3),
      "macro_precision": round(class_report["macro avg"]["precision"], 3),
      "macro_recall": round(class_report["macro avg"]["recall"], 3),

      "weighted_f1": round(class_report["weighted avg"]["f1-score"], 3),
      "weighted_precision": round(class_report["weighted avg"]["precision"], 3),
      "weighted_recall": round(class_report["weighted avg"]["recall"], 3),

      "1_f1": round(class_report["interpretation"]["f1-score"], 3),
      "1_precision": round(class_report["interpretation"]["precision"], 3),
      "1_recall": round(class_report["interpretation"]["recall"], 3),

      "0_f1": round(class_report["no interpretation"]["f1-score"], 3),
      "0_precision": round(class_report["no interpretation"]["precision"], 3),
      "0_recall": round(class_report["no interpretation"]["recall"], 3),

  }

  new_row = pd.DataFrame(sample_dict, index = [0])
  full_df = pd.concat([full_df, new_row])

  macro_f1_l.append(class_report["macro avg"]["f1-score"])
  macro_precision_l.append(class_report["macro avg"]["precision"])
  macro_recall_l.append(class_report["macro avg"]["recall"])

  weighted_f1_l.append(class_report["weighted avg"]["f1-score"])
  weighted_precision_l.append(class_report["weighted avg"]["precision"])
  weighted_recall_l.append(class_report["weighted avg"]["recall"])

  one_f1_l.append(class_report["interpretation"]["f1-score"])
  one_precision_l.append(class_report["interpretation"]["precision"])
  one_recall_l.append(class_report["interpretation"]["recall"])

  zero_f1_l.append(class_report["no interpretation"]["f1-score"])
  zero_precision_l.append(class_report["no interpretation"]["precision"])
  zero_recall_l.append(class_report["no interpretation"]["recall"])

  end_time = time.time()

  total_minutes = round((end_time - start_time) / 60, 2)
  print(f"Total time: {total_minutes} minutes.")



0.0% through processing.
7.28% through processing.


Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors


14.56% through processing.
21.83% through processing.
29.11% through processing.
36.39% through processing.
43.67% through processing.
50.95% through processing.
58.22% through processing.
65.5% through processing.
72.78% through processing.
80.06% through processing.
87.34% through processing.
94.61% through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 41.21 minutes.




0.0% through processing.
7.28% through processing.
14.56% through processing.
21.83% through processing.
29.11% through processing.
36.39% through processing.
43.67% through processing.
50.95% through processing.
58.22% through processing.
65.5% through processing.
72.78% through processing.
80.06% through processing.
87.34% through processing.
94.61% through processing.
Total time: 41.59 minutes.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
for split in range(0, 5):

  split_id_file = os.path.join(train_test_dir, f'split_{split}')

  with open(split_id_file, 'r') as file:
      train_ids = file.read().split("\n")

  interpretation_train_df = interpretation_df[interpretation_df["section_id"].isin(train_ids)]
  interpretation_test_df = interpretation_df[~interpretation_df["section_id"].isin(train_ids)]


  X_test = interpretation_test_df["paragraph"].to_list()
  y_test = interpretation_test_df["interpretation"].to_list()

  with open(os.path.join(generated_output_path, f'predictions_{split}.txt'), 'r') as file:
    print(file)
    predicted_labels = [line.rstrip().upper() for line in file]

  print(y_test, predicted_labels)
  class_report = classification_report(y_test, predicted_labels, output_dict=True)

  sample_dict = {
      "model": "interpretation_generative",
      "split": split,

      "macro_f1": round(class_report["macro avg"]["f1-score"], 3),
      "macro_precision": round(class_report["macro avg"]["precision"], 3),
      "macro_recall": round(class_report["macro avg"]["recall"], 3),

      "weighted_f1": round(class_report["weighted avg"]["f1-score"], 3),
      "weighted_precision": round(class_report["weighted avg"]["precision"], 3),
      "weighted_recall": round(class_report["weighted avg"]["recall"], 3),

      "1_f1": round(class_report["INTERPRETATION"]["f1-score"], 3),
      "1_precision": round(class_report["INTERPRETATION"]["precision"], 3),
      "1_recall": round(class_report["INTERPRETATION"]["recall"], 3),

      "0_f1": round(class_report["NO INTERPRETATION"]["f1-score"], 3),
      "0_precision": round(class_report["NO INTERPRETATION"]["precision"], 3),
      "0_recall": round(class_report["NO INTERPRETATION"]["recall"], 3),

  }

  new_row = pd.DataFrame(sample_dict, index = [0])
  full_df = pd.concat([full_df, new_row])

  macro_f1_l.append(class_report["macro avg"]["f1-score"])
  macro_precision_l.append(class_report["macro avg"]["precision"])
  macro_recall_l.append(class_report["macro avg"]["recall"])

  weighted_f1_l.append(class_report["weighted avg"]["f1-score"])
  weighted_precision_l.append(class_report["weighted avg"]["precision"])
  weighted_recall_l.append(class_report["weighted avg"]["recall"])

  one_f1_l.append(class_report["INTERPRETATION"]["f1-score"])
  one_precision_l.append(class_report["INTERPRETATION"]["precision"])
  one_recall_l.append(class_report["INTERPRETATION"]["recall"])

  zero_f1_l.append(class_report["NO INTERPRETATION"]["f1-score"])
  zero_precision_l.append(class_report["NO INTERPRETATION"]["precision"])
  zero_recall_l.append(class_report["NO INTERPRETATION"]["recall"])

  # end_time = time.time()

  # total_minutes = round((end_time - start_time) / 60, 2)
  # print(f"Total time: {total_minutes} minutes.")

<_io.TextIOWrapper name='/content/drive/MyDrive/legal_interpretation/code/generative_testing/flan_interpretation_results/generations/predictions_0.txt' mode='r' encoding='UTF-8'>
['NO INTERPRETATION', 'INTERPRETATION', 'NO INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'NO INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION', 'NO INTERPRETATION'

In [25]:

macro_f1 = sum(macro_f1_l) / len(macro_f1_l)
macro_precision = sum(macro_precision_l) / len(macro_precision_l)
macro_recall = sum(macro_recall_l) / len(macro_recall_l)

weighted_f1 = sum(weighted_f1_l) / len(weighted_f1_l)
weighted_precision = sum(weighted_precision_l) / len(weighted_precision_l)
weighted_recall = sum(weighted_recall_l) / len(weighted_recall_l)

one_f1 = sum(one_f1_l) / len(one_f1_l)
one_precision = sum(one_precision_l) / len(one_precision_l)
one_recall = sum(one_recall_l) / len(one_recall_l)

zero_f1 = sum(zero_f1_l) / len(zero_f1_l)
zero_precision = sum(zero_precision_l) / len(zero_precision_l)
zero_recall = sum(zero_recall_l) / len(zero_recall_l)

In [26]:
model_dict = {
    "model": "flan_interpretation",
    "split": "averages",

    "macro_f1": round(macro_f1, 3),
    "macro_precision": round(macro_precision, 3),
    "macro_recall": round(macro_recall, 3),

    "weighted_f1": round(weighted_f1, 3),
    "weighted_precision": round(weighted_precision, 3),
    "weighted_recall": round(weighted_recall, 3),

    "1_f1": round(one_f1, 3),
    "1_precision": round(one_precision, 3),
    "1_recall": round(one_recall, 3),

    "0_f1": round(zero_f1, 3),
    "0_precision": round(zero_precision, 3),
    "0_recall": round(zero_recall, 3),

}

new_row = pd.DataFrame(model_dict, index = [0])
full_df = pd.concat([full_df, new_row])

# full_df.to_csv(os.path.join(output_path, 'flan_interpretation_results.csv'))


In [27]:
full_df

Unnamed: 0,model,split,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall,1_f1,1_precision,1_recall,0_f1,0_precision,0_recall
0,interpretation_generative,0,0.424,0.568,0.505,0.56,0.608,0.677,0.043,0.455,0.023,0.806,0.68,0.987
0,interpretation_generative,1,0.42,0.544,0.503,0.548,0.587,0.667,0.042,0.417,0.022,0.798,0.671,0.985
0,interpretation_generative,2,0.418,0.46,0.497,0.568,0.548,0.681,0.027,0.231,0.014,0.809,0.69,0.979
0,interpretation_generative,3,0.413,0.504,0.5,0.548,0.563,0.67,0.026,0.333,0.013,0.801,0.674,0.987
0,interpretation_generative,4,0.43,0.64,0.512,0.546,0.648,0.664,0.065,0.615,0.034,0.795,0.665,0.989
0,flan_interpretation,averages,0.421,0.543,0.503,0.554,0.591,0.672,0.04,0.41,0.021,0.802,0.676,0.985


In [28]:
averages_df = full_df[full_df["split"] == "averages"]

In [29]:
averages_df = averages_df.drop(columns = ["weighted_f1", "weighted_precision", "weighted_recall"])

In [30]:
print(averages_df.to_latex(
                  formatters={"name": str.upper},
                  float_format="{:.2f}".format,
                  index = False
                  ))

\begin{tabular}{llrrrrrrrrr}
\toprule
              model &    split &  macro\_f1 &  macro\_precision &  macro\_recall &  1\_f1 &  1\_precision &  1\_recall &  0\_f1 &  0\_precision &  0\_recall \\
\midrule
flan\_interpretation & averages &      0.42 &             0.54 &          0.50 &  0.04 &         0.41 &      0.02 &  0.80 &         0.68 &      0.98 \\
\bottomrule
\end{tabular}



  print(averages_df.to_latex(
