In [67]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv

In [68]:
import json

# Path to your JSONL file
file_path = 'train_data.json'

with open(file_path, 'r') as file:
    data_list = json.load(file)

In [69]:
train_data, test_data = data_list[:700], data_list[701:]

In [70]:
# Open a file in write mode
with open('train_data.jsonl', 'w') as outfile:
    for data in train_data:
        # Combine tokens and ner_tags into space-separated strings
        prompt = ' '.join(data['tokens'])
        completion = ' '.join(data['ner_tags'])
        
        # Create the JSONL entry
        jsonl_entry = {
            "messages": [
                {"role": "system", "content": "You are performing Named Entity Recognition (NER) to extract legal entities such as violation, violation by, violation on, and law from the text.  You will return a list of IOB tags."},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": completion}
                ]}
        
        # Write the JSONL entry to the file
        json.dump(jsonl_entry, outfile)
        outfile.write('\n')

In [71]:
# Open a file in write mode
with open('eval_data.jsonl', 'w') as outfile:
    for data in test_data:
        # Combine tokens and ner_tags into space-separated strings
        prompt = ' '.join(data['tokens'])
        completion = ' '.join(data['ner_tags'])
        
        # Create the JSONL entry
        jsonl_entry = {
            "messages": [
                {"role": "system", "content": "You are performing Named Entity Recognition (NER) to extract legal entities such as violation, violation by, violation on, and law from the text.  You will return a list of IOB tags."},
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": completion}
                ]}
        # Write the JSONL entry to the file
        json.dump(jsonl_entry, outfile)
        outfile.write('\n')

In [72]:
load_dotenv()

True

In [73]:
import os
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [74]:
client.files.create(
  file=open("train_data.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-WIru8EaROpmFFgbahYdJEsiP', bytes=656999, created_at=1724716608, filename='train_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [75]:
client.files.create(
  file=open("eval_data.jsonl", "rb"),
  purpose="fine-tune"
)

FileObject(id='file-9ZOez4FskUrENFCChyWzScsV', bytes=258376, created_at=1724716617, filename='eval_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [76]:
client.fine_tuning.jobs.create(
  training_file="file-WIru8EaROpmFFgbahYdJEsiP", 
  model="gpt-4o-2024-08-06",
  validation_file="file-9ZOez4FskUrENFCChyWzScsV"
)

FineTuningJob(id='ftjob-MReXReh8762EVcIQXlckSB6z', created_at=1724716652, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-Qxm4Gb8DM4gPh1hlxL8eCrsO', result_files=[], seed=1908658738, status='validating_files', trained_tokens=None, training_file='file-WIru8EaROpmFFgbahYdJEsiP', validation_file='file-9ZOez4FskUrENFCChyWzScsV', estimated_finish=None, integrations=[], user_provided_suffix=None)

In [77]:
fine_tune_job = client.fine_tuning.jobs.retrieve("ftjob-MReXReh8762EVcIQXlckSB6z")
print(fine_tune_job)

FineTuningJob(id='ftjob-MReXReh8762EVcIQXlckSB6z', created_at=1724716652, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-2024-08-06:georgian::A0eqm7lq', finished_at=1724720391, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-Qxm4Gb8DM4gPh1hlxL8eCrsO', result_files=['file-9EcuXYpvXxkRqnMVhib50rU5'], seed=1908658738, status='succeeded', trained_tokens=509676, training_file='file-WIru8EaROpmFFgbahYdJEsiP', validation_file='file-9ZOez4FskUrENFCChyWzScsV', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [79]:
df = pd.read_excel("NER_test_set.xlsx")
df

Unnamed: 0,id,tokens
0,14243437,"[""a"",""class"",""action"",""lawsuit"",""has"",""been"",""..."
1,18871551,"[""a"",""media"",""company"",""recently"",""came"",""unde..."
2,51514749,"[""a"",""national"",""bank"",""was"",""recently"",""held""..."
3,99676183,"[""a"",""recent"",""case"",""has"",""come"",""to"",""light""..."
4,14188948,"[""a"",""recent"",""incident"",""has"",""come"",""to"",""li..."
...,...,...
375,24828975,"[""the"",""defendants"",""have"",""been"",""accused"",""o..."
376,69861119,"[""the"",""plaintiff"",""has"",""been"",""subjected"",""t..."
377,97047066,"[""the"",""plaintiff"",""has"",""raised"",""serious"",""a..."
378,90651371,"[""the"",""plaintiffs,"",""representing"",""a"",""group..."


In [80]:
def model_inference(tokens: list[str]) -> list[str]:
    input_str = ' '.join(tokens)
    
    output = client.chat.completions.create(
        model="ft:gpt-4o-2024-08-06:georgian::A0eqm7lq",
        messages=[
            {"role": "system", "content": "You are performing Named Entity Recognition (NER) to detect legal violations on a provided list of tokens.  You will return a list of IOB tags."},
            {"role": "user", "content": input_str}
        ]
    )
    
    output_str = output.choices[0].message.content
    
    ner_tags = output_str.split()
    
    return ner_tags

In [81]:
df["ner_tags"] = df["tokens"].apply(model_inference)

In [82]:
df.to_csv("NER_test_set_results_gpt4o_finetuned.csv")

In [105]:
df = pd.read_csv("/Users/royalsequiera/Projects/Darrow-LegalLens-NER/NER_Task_Submission/predictions_NERLens.csv")

In [106]:
df.head()

Unnamed: 0,id,tokens,predicted_tags
0,14243437,"[""a"",""class"",""action"",""lawsuit"",""has"",""been"",""...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
1,18871551,"[""a"",""media"",""company"",""recently"",""came"",""unde...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,51514749,"[""a"",""national"",""bank"",""was"",""recently"",""held""...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,99676183,"[""a"",""recent"",""case"",""has"",""come"",""to"",""light""...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,14188948,"[""a"",""recent"",""incident"",""has"",""come"",""to"",""li...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [118]:
import ast

def format_token_tags_from_df(df):
    formatted_data = []

    # Iterate through each row in the DataFrame
    for _, row in df.iterrows():
        # Safely evaluate strings that look like lists
        tokens = row['tokens']
        predicted_tags = row['predicted_tags']
        
        if isinstance(tokens, str):
            tokens = ast.literal_eval(tokens)
        if isinstance(predicted_tags, str):
            predicted_tags = ast.literal_eval(predicted_tags)
        
        token_tag_pairs = [f"{token}/{tag}" for token, tag in zip(tokens, predicted_tags)]
        formatted_string = ', '.join(token_tag_pairs)
        formatted_data.append({'id': row['id'], 'formatted': formatted_string})

    return formatted_data

In [124]:
format_token_tags_from_df(df.sample(2))

[{'id': 40410727,
  'formatted': 'the/O, lead/O, plaintiff/O, alleges/O, that/O, the/O, app/O, has/O, been/O, issuing/B-VIOLATION, false/I-VIOLATION, and/I-VIOLATION, misleading/I-VIOLATION, financial/I-VIOLATION, statements,/I-VIOLATION, thereby/I-VIOLATION, artificially/I-VIOLATION, inflating/I-VIOLATION, the/I-VIOLATION, price/I-VIOLATION, of/I-VIOLATION, its/I-VIOLATION, securities/I-VIOLATION, ./O, this/O, unethical/O, behavior/O, was/O, orchestrated/O, by/O, the/O, apps/O, lead/O, developer/O, and/O, has/O, raised/O, serious/O, doubts/O, about/O, the/O, apps/O, reliability/O, ./O'},
 {'id': 61979338,
  'formatted': 'a/O, restaurant/O, was/O, found/O, guilty/O, of/O, violating/O, the/O, fair/B-LAW, labor/I-LAW, standards/I-LAW, act/I-LAW, ./O, the/O, court/O, found/O, that/O, the/B-VIOLATED BY, restaurant/I-VIOLATED BY, management/I-VIOLATED BY, had/O, failed/O, to/O, comply/O, with/O, the/O, strict/O, notification/I-LAW, requirements/I-LAW, for/O, claiming/O, a/O, tip/O, credit,/

In [128]:
import pandas as pd

def check_ner_format(predictions_file_path, test_file_path):
    """
    Check the format of the NER prediction file.
    The file should be in CSV format with columns: id, tokens, ner_tags
    """
    try:
        df = pd.read_csv(predictions_file_path)

    except Exception as e:
        return False, f"Error reading predictions CSV file: {e}"
    
    try:
        test_df = pd.read_csv(test_file_path)
    except Exception as e:
        return False, f"Error reading test CSV file: {e}"
    
    # Check expected columns
    expected_columns = ['id', 'tokens', 'ner_tags']
    pred_columns = list(df.columns)
    for expected_col in expected_columns:
        if expected_col not in pred_columns:
            return False, f"Incorrect columns. Expected: {expected_columns}, Found: {pred_columns}"
    
    # Check number of rows
    expected_ner_num_rows = len(test_df)
    predictions_ner_num_rows = len(df)
    if predictions_ner_num_rows != expected_ner_num_rows:
        return False, f"Incorrect number of predictions. Expected: {expected_ner_num_rows}, Found: {predictions_ner_num_rows}"

    return True, "NER prediction file format is correct."

def check_nli_format(predictions_file_path, test_file_path):
    """
    Check the format of the NLI prediction file.
    The file should be in CSV format with columns: Premise, hypothesis, label
    """
    try:
        df = pd.read_csv(predictions_file_path)
    except Exception as e:
        return False, f"Error reading predictions CSV file: {e}"
    
    try:
        test_df = pd.read_csv(test_file_path)
    except Exception as e:
        return False, f"Error reading test CSV file: {e}"
    
    # Check expected columns
    expected_columns = ['premise', 'hypothesis', 'label']
    pred_columns = list(df.columns)
    for expected_col in expected_columns:
        if expected_col not in pred_columns:
            return False, f"Incorrect columns. Expected: {expected_columns}, Found: {pred_columns}"
    
    # Check number of rows
    expected_nli_num_rows = len(test_df)
    predictions_nli_num_rows = len(df)
    if predictions_nli_num_rows != expected_nli_num_rows:
        return False, f"Incorrect number of predictions. Expected: {expected_nli_num_rows}, Found: {predictions_nli_num_rows}"
    
    return True, "NLI prediction file format is correct."

# Check NER prediction file
ner_predictions_file_path = '/Users/royalsequiera/Projects/Darrow-LegalLens-NER/NER_Task_Submission/predictions_NERLens.csv' # replace with file path
ner_test_file_path = '/Users/royalsequiera/Projects/Darrow-LegalLens-NER/NER_Task_Submission/test_NERLens.csv' # replace with file path
is_valid, message = check_ner_format(ner_predictions_file_path, ner_test_file_path)
print(f"NER File Check: {message}")

# # Check NLI prediction file
# nli_predictions_file_path = 'predictions_NLILens.csv' # replace with file path
# nli_test_file_path = 'predictions_NLILens.csv' # replace with file path
# is_valid, message = check_nli_format(nli_predictions_file_path, nli_test_file_path)
# print(f"NLI File Check: {message}")

NER File Check: NER prediction file format is correct.


In [126]:
import pandas as pd

# Path to the Excel file
excel_file_path = '/Users/royalsequiera/Projects/Darrow-LegalLens-NER/NER_Task_Submission/NER_test_set.xlsx'

# Read the Excel file into a DataFrame
df = pd.read_excel(excel_file_path)

# Path to the CSV file
csv_file_path = 'output_file.csv'

# Save the DataFrame as a CSV file
df.to_csv(csv_file_path, index=False)
