In [None]:
%%capture
!pip install openai

In [1]:
import numpy as np
import openai
import os
import pandas as pd
from   pathlib import Path
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

from google.colab import auth
import gspread
from google.auth import default

auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

Mounted at /content/drive


In [4]:
interp_dir = '/content/drive/MyDrive/legal_interpretation/code/generative_testing'
train_test_dir = os.path.join(interp_dir, 'train_test_splits', 'train_test_splits_2')
output_path = os.path.join(interp_dir, 'results')
generated_output_path = os.path.join(output_path, 'generations')
descriptive_errors_dir = os.path.join(output_path, 'errors')

In [5]:
worksheet = gc.open('final_cleaned_paragraphs').sheet1
rows = worksheet.get_all_values()
interpretation_df = pd.DataFrame(rows)

In [6]:
interpretation_df.columns = interpretation_df.iloc[0]
interpretation_df = interpretation_df.iloc[1:]

In [None]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff



In [None]:
openai.api_key = open(os.path.join(interp_dir, 'private', 'openai_key.txt')).read().strip()

In [None]:
intro_statement = "Some paragraphs in court cases interpret statutes. Within interpretation, there are two types:  GRAND and FORMAL."
formal_description = "FORMAL theory is a legal decision made according to a rule, often viewing the law as a closed and mechanical system. It screens the decision-maker off from the political, social, and economic choices involved in the decision. "
grand_description = "GRAND theory is legal decision that views law as an open-ended and on-going enterprise for the production and improvement of decisions that make sense on their face and in light of political, social, and economic factors."
none_description = "NONE is assigned to a passage or mode of reasoning that does not reflect either the Grand or Formal approaches. Note that this coding would include areas of substantive law outside of statutory interpretation, including procedural matters. "
prompt_text = "Determine the type of legal interpretation in the following passage. Return a single choice from GRAND, FORMAL, or NONE."

In [None]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_descriptive_interp(text, formal_description, grand_description, none_description):
  prompt = f"{intro_statement} \n\n {formal_description}\n\n {grand_description}\n\n {none_description}\n\n {prompt_text}\n\n"
  response = openai.ChatCompletion.create(
      model='gpt-4',
      max_tokens=2,
      messages = [{'role': 'user', 'content': prompt + text}]
  )
  return response['choices'][0]['message']['content'].strip().lower()


In [7]:
interpretation_df = interpretation_df[interpretation_df['class'].notna()]

In [8]:
interpretation_df.sample(5)

Unnamed: 0,Unnamed: 1,section_id,paragraph,annotator_name,old_class,class,low_confidence,week,houston,josiah,...,_task_hash,_view_id,config,accept,answer,_timestamp,_annotator_id,_session_id,user_input,flagged
602,461,1313773_24,It appears that the origin of the passage the ...,houston,,GRAND,False,6,,,...,-708765039.0,blocks,{'choice_style': 'multiple'},['GRAND'],accept,1679936928.0,houston_sample_6-houston,houston_sample_6-houston,,
728,520,6169982_14,"The Commission held that, although the Mech Fu...",josiah,,NONE,False,6,,,...,1454572437.0,blocks,{'choice_style': 'multiple'},['NONE'],accept,1679788253.0,josiah_sample_6-josiah,josiah_sample_6-josiah,,
99,30,11301135_39,That the cost and indeed the availability of ...,,,NONE,True,6,NONE,NONE,...,,,,,,,,,,
1131,746,372903_47,For general discussion of the scope of Part II...,josiah,,NONE,False,7,,,...,-341981856.0,blocks,{'choice_style': 'multiple'},['NONE'],accept,1680969386.0,josiah_sample_7-josiah,josiah_sample_7-josiah,,
1198,10,6161985_28,Hearings before the House Committee on Inters...,houston,,NONE,False,7,,,...,494730634.0,blocks,{'choice_style': 'multiple'},['NONE'],accept,1681178486.0,houston_sample_7-houston,houston_sample_7-houston,,


In [9]:
macro_f1_l = []
macro_precision_l = []
macro_recall_l = []

weighted_f1_l = []
weighted_precision_l = []
weighted_recall_l = []

grand_f1_l = []
grand_precision_l = []
grand_recall_l = []

formal_f1_l = []
formal_precision_l = []
formal_recall_l = []

none_f1_l = []
none_precision_l = []
none_recall_l = []

In [10]:
full_df = pd.DataFrame()

In [None]:
for split in range(0, 5): # limit to first five splits
  start_time = time.time()

  split_id_file = os.path.join(train_test_dir, f'split_{split}')

  with open(split_id_file, 'r') as file:
      train_ids = file.read().split("\n")

  interpretation_train_df = interpretation_df[interpretation_df["section_id"].isin(train_ids)]
  interpretation_test_df = interpretation_df[~interpretation_df["section_id"].isin(train_ids)]


  X_test = interpretation_test_df["paragraph"].to_list()
  y_test = interpretation_test_df["class"].to_list()

  total = len(X_test)

  predicted_labels = []
  for i, text in enumerate(X_test):
    prediction = get_descriptive_interp(text, formal_description, grand_description, none_description).upper()
    predicted_labels.append(prediction)

    if i % 50 == 0:
      precent = round((i/total)*100, 2)
      print(f"{precent}% through processing.")

  with open(os.path.join(generated_output_path, f'predictions_{split}.txt'), 'w') as file:
     for label in predicted_labels:
        file.write(f"{label}\n")

  predictions_df = pd.DataFrame(
  {'section_id': interpretation_test_df["section_id"].tolist(),
    'gold': y_test,
    'predicted': predicted_labels,
    'text': X_test
  })
  errors_df = predictions_df.query('gold != predicted')
  errors_df.to_csv(os.path.join(descriptive_errors_dir, f"{split}_errors.csv"))

  class_report = classification_report(y_test, predicted_labels, output_dict=True)

  sample_dict = {
      "model": "descriptive_generative",
      "split": split,

      "macro_f1": round(class_report["macro avg"]["f1-score"], 3),
      "macro_precision": round(class_report["macro avg"]["precision"], 3),
      "macro_recall": round(class_report["macro avg"]["recall"], 3),

      "weighted_f1": round(class_report["weighted avg"]["f1-score"], 3),
      "weighted_precision": round(class_report["weighted avg"]["precision"], 3),
      "weighted_recall": round(class_report["weighted avg"]["recall"], 3),

      "grand_f1": round(class_report["GRAND"]["f1-score"], 3),
      "grand_precision": round(class_report["GRAND"]["precision"], 3),
      "grand_recall": round(class_report["GRAND"]["recall"], 3),

      "formal_f1": round(class_report["FORMAL"]["f1-score"], 3),
      "formal_precision": round(class_report["FORMAL"]["precision"], 3),
      "formal_recall": round(class_report["FORMAL"]["recall"], 3),

      "none_f1": round(class_report["NONE"]["f1-score"], 3),
      "none_precision": round(class_report["NONE"]["precision"], 3),
      "none_recall": round(class_report["NONE"]["recall"], 3),
  }

  new_row = pd.DataFrame(sample_dict, index = [0])
  full_df = pd.concat([full_df, new_row])


  macro_f1_l.append(class_report["macro avg"]["f1-score"])
  macro_precision_l.append(class_report["macro avg"]["precision"])
  macro_recall_l.append(class_report["macro avg"]["recall"])

  weighted_f1_l.append(class_report["weighted avg"]["f1-score"])
  weighted_precision_l.append(class_report["weighted avg"]["precision"])
  weighted_recall_l.append(class_report["weighted avg"]["recall"])

  grand_f1_l.append(class_report["GRAND"]["f1-score"])
  grand_precision_l.append(class_report["GRAND"]["precision"])
  grand_recall_l.append(class_report["GRAND"]["recall"])

  formal_f1_l.append(class_report["FORMAL"]["f1-score"])
  formal_precision_l.append(class_report["FORMAL"]["precision"])
  formal_recall_l.append(class_report["FORMAL"]["recall"])

  none_f1_l.append(class_report["NONE"]["f1-score"])
  none_precision_l.append(class_report["NONE"]["precision"])
  none_recall_l.append(class_report["NONE"]["recall"])

  end_time = time.time()

  total_minutes = round((end_time - start_time) / 60, 2)
  print(f"Total time: {total_minutes} minutes.")

0.0 through processing.
7.278 through processing.
14.556 through processing.
21.834 through processing.
29.112 through processing.
36.39 through processing.
43.668 through processing.
50.946 through processing.
58.224 through processing.
65.502 through processing.
72.78 through processing.
80.058 through processing.
87.336 through processing.
94.614 through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 18.57 minutes.
0.0 through processing.
7.278 through processing.
14.556 through processing.
21.834 through processing.
29.112 through processing.
36.39 through processing.
43.668 through processing.
50.946 through processing.
58.224 through processing.
65.502 through processing.
72.78 through processing.
80.058 through processing.
87.336 through processing.
94.614 through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 13.94 minutes.
0.0 through processing.
7.278 through processing.
14.556 through processing.
21.834 through processing.
29.112 through processing.
36.39 through processing.
43.668 through processing.
50.946 through processing.
58.224 through processing.
65.502 through processing.
72.78 through processing.
80.058 through processing.
87.336 through processing.
94.614 through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 14.04 minutes.
0.0 through processing.
7.278 through processing.
14.556 through processing.
21.834 through processing.
29.112 through processing.
36.39 through processing.
43.668 through processing.
50.946 through processing.
58.224 through processing.
65.502 through processing.
72.78 through processing.
80.058 through processing.
87.336 through processing.
94.614 through processing.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Total time: 20.0 minutes.
0.0 through processing.
7.278 through processing.
14.556 through processing.
21.834 through processing.
29.112 through processing.
36.39 through processing.
43.668 through processing.
50.946 through processing.
58.224 through processing.
65.502 through processing.
72.78 through processing.
80.058 through processing.
87.336 through processing.
94.614 through processing.
Total time: 22.8 minutes.


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
for split in range(0, 5):

  split_id_file = os.path.join(train_test_dir, f'split_{split}')

  with open(split_id_file, 'r') as file:
      train_ids = file.read().split("\n")

  interpretation_train_df = interpretation_df[interpretation_df["section_id"].isin(train_ids)]
  interpretation_test_df = interpretation_df[~interpretation_df["section_id"].isin(train_ids)]


  X_test = interpretation_test_df["paragraph"].to_list()
  y_test = interpretation_test_df["class"].to_list()

  with open(os.path.join(generated_output_path, f'predictions_{split}.txt'), 'r') as file:
    print(file)
    predicted_labels = [line.rstrip() for line in file]

  print(y_test, predicted_labels)
  class_report = classification_report(y_test, predicted_labels, output_dict=True)

  sample_dict = {
      "model": "descriptive_generative",
      "split": split,

      "macro_f1": round(class_report["macro avg"]["f1-score"], 3),
      "macro_precision": round(class_report["macro avg"]["precision"], 3),
      "macro_recall": round(class_report["macro avg"]["recall"], 3),

      "weighted_f1": round(class_report["weighted avg"]["f1-score"], 3),
      "weighted_precision": round(class_report["weighted avg"]["precision"], 3),
      "weighted_recall": round(class_report["weighted avg"]["recall"], 3),

      "grand_f1": round(class_report["GRAND"]["f1-score"], 3),
      "grand_precision": round(class_report["GRAND"]["precision"], 3),
      "grand_recall": round(class_report["GRAND"]["recall"], 3),

      "formal_f1": round(class_report["FORMAL"]["f1-score"], 3),
      "formal_precision": round(class_report["FORMAL"]["precision"], 3),
      "formal_recall": round(class_report["FORMAL"]["recall"], 3),

      "none_f1": round(class_report["NONE"]["f1-score"], 3),
      "none_precision": round(class_report["NONE"]["precision"], 3),
      "none_recall": round(class_report["NONE"]["recall"], 3),

  }

  new_row = pd.DataFrame(sample_dict, index = [0])
  full_df = pd.concat([full_df, new_row])

  macro_f1_l.append(class_report["macro avg"]["f1-score"])
  macro_precision_l.append(class_report["macro avg"]["precision"])
  macro_recall_l.append(class_report["macro avg"]["recall"])

  weighted_f1_l.append(class_report["weighted avg"]["f1-score"])
  weighted_precision_l.append(class_report["weighted avg"]["precision"])
  weighted_recall_l.append(class_report["weighted avg"]["recall"])

  grand_f1_l.append(class_report["GRAND"]["f1-score"])
  grand_precision_l.append(class_report["GRAND"]["precision"])
  grand_recall_l.append(class_report["GRAND"]["recall"])

  formal_f1_l.append(class_report["FORMAL"]["f1-score"])
  formal_precision_l.append(class_report["FORMAL"]["precision"])
  formal_recall_l.append(class_report["FORMAL"]["recall"])

  none_f1_l.append(class_report["NONE"]["f1-score"])
  none_precision_l.append(class_report["NONE"]["precision"])
  none_recall_l.append(class_report["NONE"]["recall"])

<_io.TextIOWrapper name='/content/drive/MyDrive/legal_interpretation/code/generative_testing/results/generations/predictions_0.txt' mode='r' encoding='UTF-8'>
['NONE', 'GRAND', 'NONE', 'GRAND', 'FORMAL', 'FORMAL', 'GRAND', 'NONE', 'NONE', 'GRAND', 'FORMAL', 'GRAND', 'GRAND', 'GRAND', 'FORMAL', 'FORMAL', 'NONE', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'FORMAL', 'FORMAL', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'FORMAL', 'FORMAL', 'NONE', 'GRAND', 'NONE', 'GRAND', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'GRAND', 'GRAND', 'NONE', 'NONE', 'NONE', 'GRAND', 'GRAND', 'GRAND', 'NONE', 'GRAND', 'GRAND', 'NONE', 'FORMAL', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'N

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<_io.TextIOWrapper name='/content/drive/MyDrive/legal_interpretation/code/generative_testing/results/generations/predictions_1.txt' mode='r' encoding='UTF-8'>
['FORMAL', 'FORMAL', 'NONE', 'NONE', 'GRAND', 'FORMAL', 'FORMAL', 'NONE', 'FORMAL', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'FORMAL', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<_io.TextIOWrapper name='/content/drive/MyDrive/legal_interpretation/code/generative_testing/results/generations/predictions_2.txt' mode='r' encoding='UTF-8'>
['FORMAL', 'NONE', 'FORMAL', 'GRAND', 'GRAND', 'GRAND', 'GRAND', 'GRAND', 'FORMAL', 'NONE', 'FORMAL', 'GRAND', 'FORMAL', 'FORMAL', 'GRAND', 'NONE', 'NONE', 'FORMAL', 'NONE', 'GRAND', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'GRAND', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'FORMAL', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'FORMAL', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'GRAND', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'GR

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<_io.TextIOWrapper name='/content/drive/MyDrive/legal_interpretation/code/generative_testing/results/generations/predictions_3.txt' mode='r' encoding='UTF-8'>
['FORMAL', 'FORMAL', 'FORMAL', 'FORMAL', 'FORMAL', 'FORMAL', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'FORMAL', 'FORMAL', 'FORMAL', 'FORMAL', 'FORMAL', 'GRAND', 'FORMAL', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'GRAND', 'GRAND', 'GRAND', 'GRAND', 'FORMAL', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'GRAND', 'NONE', 'NONE', 'GRAND', 'NONE', 'GRAND', 'NONE', 'NONE', 'FORMAL', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'GRAND', 'NONE', 'FORMAL', 'NONE', 'GRAND', 'GRAND', 'NONE', 'FORMAL', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE',

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<_io.TextIOWrapper name='/content/drive/MyDrive/legal_interpretation/code/generative_testing/results/generations/predictions_4.txt' mode='r' encoding='UTF-8'>
['FORMAL', 'NONE', 'NONE', 'FORMAL', 'GRAND', 'FORMAL', 'GRAND', 'GRAND', 'FORMAL', 'FORMAL', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'GRAND', 'FORMAL', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'GRAND', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'FORMAL', 'NONE', 'NONE', 'NONE', 'GRAND', 'GRAND', 'NONE', 'NONE', 'NONE', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'FORMAL', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'NONE', 'NONE', 'NONE', 'GRAND', 'NONE', 'FORMAL', 'NONE', 'GRAND', 'GRAND', 'NONE', 'NONE', 'GRAND', 'NONE', 'FORM

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
macro_f1 = sum(macro_f1_l) / len(macro_f1_l)
macro_precision = sum(macro_precision_l) / len(macro_precision_l)
macro_recall = sum(macro_recall_l) / len(macro_recall_l)

weighted_f1 = sum(weighted_f1_l) / len(weighted_f1_l)
weighted_precision = sum(weighted_precision_l) / len(weighted_precision_l)
weighted_recall = sum(weighted_recall_l) / len(weighted_recall_l)

grand_f1 = sum(grand_f1_l) / len(grand_f1_l)
grand_precision = sum(grand_precision_l) / len(grand_precision_l)
grand_recall = sum(grand_recall_l) / len(grand_recall_l)

formal_f1 = sum(formal_f1_l) / len(formal_f1_l)
formal_precision = sum(formal_precision_l) / len(formal_precision_l)
formal_recall = sum(formal_recall_l) / len(formal_recall_l)

none_f1 = sum(none_f1_l) / len(none_f1_l)
none_precision = sum(none_precision_l) / len(none_precision_l)
none_recall = sum(none_recall_l) / len(none_recall_l)



In [15]:
model_dict = {
    "model": "descriptive_generative",
    "split": "averages",

    "macro_f1": round(macro_f1, 3),
    "macro_precision": round(macro_precision, 3),
    "macro_recall": round(macro_recall, 3),

    "weighted_f1": round(weighted_f1, 3),
    "weighted_precision": round(weighted_precision, 3),
    "weighted_recall": round(weighted_recall, 3),

    "grand_f1": round(grand_f1, 3),
    "grand_precision": round(grand_precision, 3),
    "grand_recall": round(grand_recall, 3),

    "formal_f1": round(formal_f1, 3),
    "formal_precision": round(formal_precision, 3),
    "formal_recall": round(formal_recall, 3),

    "none_f1": round(none_f1, 3),
    "none_precision": round(none_precision, 3),
    "none_recall": round(none_recall, 3),
}

new_row = pd.DataFrame(model_dict, index = [0])
full_df = pd.concat([full_df, new_row])

# full_df.to_csv(os.path.join(output_path, 'gpt_generated_results.csv'))



In [16]:
full_df

Unnamed: 0,model,split,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall,grand_f1,grand_precision,grand_recall,formal_f1,formal_precision,formal_recall,none_f1,none_precision,none_recall
0,descriptive_generative,0,0.233,0.258,0.29,0.539,0.738,0.498,0.45,0.389,0.536,0.352,0.229,0.765,0.598,0.932,0.44
0,descriptive_generative,1,0.168,0.187,0.21,0.519,0.71,0.477,0.412,0.379,0.453,0.346,0.221,0.797,0.582,0.9,0.43
0,descriptive_generative,2,0.177,0.194,0.227,0.532,0.742,0.496,0.451,0.378,0.561,0.383,0.248,0.838,0.579,0.926,0.421
0,descriptive_generative,3,0.279,0.311,0.353,0.525,0.73,0.489,0.447,0.396,0.514,0.372,0.24,0.833,0.576,0.919,0.419
0,descriptive_generative,4,0.223,0.255,0.285,0.498,0.718,0.464,0.426,0.375,0.493,0.363,0.232,0.829,0.547,0.921,0.389
0,descriptive_generative,averages,0.216,0.241,0.273,0.522,0.727,0.485,0.437,0.383,0.511,0.363,0.234,0.813,0.576,0.92,0.42


In [17]:
averages_df = full_df[full_df["split"] == "averages"]

In [18]:
averages_df = averages_df.drop(columns = ["weighted_f1", "weighted_precision", "weighted_recall"])

In [19]:
print(averages_df.to_latex(
                  formatters={"name": str.upper},
                  float_format="{:.2f}".format,
                  index = False
                  ))

\begin{tabular}{llrrrrrrrrrrrr}
\toprule
                 model &    split &  macro\_f1 &  macro\_precision &  macro\_recall &  grand\_f1 &  grand\_precision &  grand\_recall &  formal\_f1 &  formal\_precision &  formal\_recall &  none\_f1 &  none\_precision &  none\_recall \\
\midrule
descriptive\_generative & averages &      0.22 &             0.24 &          0.27 &      0.44 &             0.38 &          0.51 &       0.36 &              0.23 &           0.81 &     0.58 &            0.92 &         0.42 \\
\bottomrule
\end{tabular}



  print(averages_df.to_latex(
