# Prep Data for Finetuning

In [2]:
import json
import os
import pandas as pd
from collections import deque

TRANSCRIPTS_DIR = "../transcripts_up_to_2024/"      # directory of raw JSONs of oral arguments
OUT_DIR = "../datasets/finetune"

def save_jsonl(df, filename):
    '''
    Write a DataFrame to disk in JSON Lines format (one record per line).

    @param df -- pandas DataFrame whose rows become the JSON records
    @param filename -- Destination path of the .jsonl file
    '''
    jsonl_options = {"orient": "records", "lines": True}
    df.to_json(filename, **jsonl_options)

def read_jsonl(filename):
    '''
    Load a JSON Lines file into a list of parsed records.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped instead of
    raising a JSONDecodeError. The file is always decoded as UTF-8 so the result does
    not depend on the platform's default encoding.

    @param filename -- Path of the .jsonl file to read
    @return -- List with one parsed JSON value per non-blank line, in file order
    '''
    with open(filename, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

## Prep training samples for finetuning in the following format:
```
training_samples = [
    {
        "transcript_id": "2022.21-376-t01",
        "chunk_id": 0,
        "content": [
            {"role": "advocate", "content": "Advocate gives their opening statement"},
            {"role": "justice_sonia_sotomayor", "content": "Sotomayor asks a question based on advocate's opening statement."},
            {"role": "advocate", "content": "Advocate says something in response to Sotomayor"},
            {"role": "justice_samuel_a_alito_jr", "content": "Something Justice Alito said in transcript."},
        ]
    },
...
]
```


In [3]:
MAX_CHARS = 5000  # Maximum characters per chunk

def _speaker_is_scotus_justice(roles):
    '''
    Report whether a speaker's role data marks them as a Supreme Court justice.

    Only the entries the pipeline has always looked at are inspected: the first
    element when `roles` is a list, or the entries under keys '2' and 0 when
    `roles` is a dict.

    @param roles -- Value of turn["speaker"]["roles"]; may be empty/None, a list, or a dict
    @return -- True if an inspected entry has "type" == "scotus_justice", else False
    '''
    if not roles:
        return False

    if isinstance(roles, dict):
        candidates = [roles.get('2'), roles.get(0)]
    else:
        candidates = [roles[0]]

    return any(
        isinstance(entry, dict) and entry.get("type") == "scotus_justice"
        for entry in candidates
    )

def get_formatted_text_of_turn(turn):
    '''
    Convert a single raw speaker turn into a chat-style message dict.

    Justices are labelled "justice_<identifier>"; every other speaker (including
    speakers with no roles or with a non-justice role) is labelled "advocate".
    Previously a non-empty, non-justice role list left the role unassigned and
    raised UnboundLocalError.

    @param turn -- JSON representing a single speaker turn
    @return -- Dict with keys "role", "content", or None if the turn has no speaker
    '''
    speaker = turn["speaker"]
    if not speaker:  # skip turns that have no speaker like "Laughter"
        return None

    if _speaker_is_scotus_justice(speaker.get("roles")):
        identifier = f'justice_{speaker["identifier"]}'
    else:
        identifier = "advocate"

    # A turn's text is split into blocks; join them into one string
    text = " ".join(block["text"] for block in turn["text_blocks"])

    return {
        "role": identifier,
        "content": text
    }

def chunk_transcript_content(transcript_id, formatted_turns, max_chars=MAX_CHARS):
    '''
    Build one sample per turn: the turn itself, preceded by as many of the
    immediately preceding turns as fit in the character budget.

    Length is measured over each turn's "content" only. The newest turn is always
    kept, so a single turn longer than max_chars yields a one-turn chunk that
    exceeds the budget.

    @param transcript_id -- The ID of the transcript
    @param formatted_turns -- List of formatted speaker turns (dicts with "role", "content")
    @param max_chars -- Maximum characters allowed per chunk
    @return -- List of dicts with keys "transcript_id", "chunk_id", "content"
    '''
    samples = []
    window = deque()   # turns currently in the sliding window, oldest first
    window_chars = 0   # total "content" characters across the window

    for sample_index, incoming in enumerate(formatted_turns):
        incoming_chars = len(incoming["content"])

        # Evict the oldest turns until the incoming one fits or the window is empty
        while window and window_chars + incoming_chars > max_chars:
            evicted = window.popleft()
            window_chars -= len(evicted["content"])

        window.append(incoming)
        window_chars += incoming_chars

        # Snapshot the window: the list is new, the turn dicts are shared
        samples.append({
            "transcript_id": transcript_id,
            "chunk_id": sample_index,
            "content": list(window)
        })

    return samples

def get_transcript_data(json_file_name):
    '''
    Load one oral-argument JSON file and convert it into chunked samples.

    Only the first two sections of the transcript are used. A transcript with fewer
    than two sections, or with no transcript/sections at all, contributes whatever
    turns it has instead of raising IndexError/TypeError.

    @param json_file_name -- Name of the oral argument JSON file inside TRANSCRIPTS_DIR
    @return -- List of chunked transcript samples (empty if there are no usable turns)
    '''
    transcript_file_path = os.path.join(TRANSCRIPTS_DIR, json_file_name)
    with open(transcript_file_path, 'r', encoding='utf-8') as json_file:
        transcript_json = json.load(json_file)

    # The transcript ID is the file name without its extension
    transcript_id = os.path.splitext(json_file_name)[0]

    transcript = transcript_json.get("transcript") or {}
    sections = transcript.get("sections") or []

    formatted_turns = []
    for section in sections[:2]:
        for turn in section.get("turns") or []:
            formatted_turn = get_formatted_text_of_turn(turn)
            if formatted_turn:  # drop turns without a speaker (returned as None)
                formatted_turns.append(formatted_turn)

    return chunk_transcript_content(transcript_id, formatted_turns, MAX_CHARS)

# Accumulate the chunked samples of every transcript JSON found in TRANSCRIPTS_DIR
data_transcripts = []
cases_dir = os.fsencode(TRANSCRIPTS_DIR)  # not used by the loop below
for json_file_name in os.listdir(TRANSCRIPTS_DIR):
    if not json_file_name.endswith('.json'):
        continue  # ignore anything that is not a transcript JSON
    data_transcripts.extend(get_transcript_data(json_file_name))

In [5]:
# Total number of chunked samples collected across all transcripts
len(data_transcripts)

455031

In [6]:
# Inspect the last sample as a spot check of the chunk structure
data_transcripts[-1]

{'transcript_id': '2009.08-1008-t01',
 'chunk_id': 164,
 'content': [{'role': 'advocate',
   'content': "--Well, Your Honor, I think that would raise some interesting questions about New York's power to--"},
  {'role': 'justice_john_g_roberts_jr',
   'content': 'What it would do, it seems to me, is make it clear that was not a substantive decision, but, instead, a procedural decision.'},
  {'role': 'advocate',
   'content': "--Correct, Your Honor. That's right. And, again -- and, again--"},
  {'role': 'justice_ruth_bader_ginsburg',
   'content': "But it could be -- it could be, as I -- the example of the statute of limitations. We create a claim. It has a certain life. It's dead after that time. That's New York law. A sister State may say, we create the same claim, but we think it has a longer life. New York would say, that's fine. Bring that claim in your own State. Don't clutter up our courts with out-of-State claims when we would not hear the identical claim under our own law. There

**Question**: Should we clean these samples, e.g. by filtering out turns that are inaudible or too short? I initially thought yes, but it may be fine to leave them in.

## Sanity test on chat template

Sample input json:

```
chat = [
    {"role": "advocate", "content": "Advocate gives their opening statement"},
    {"role": "justice_sonia_sotomayor", "content": "Sotomayor asks a question based on advocate's opening statement."},
    {"role": "advocate", "content": "Advocate says something in response to Sotomayor"},
    {"role": "justice_samuel_a_alito_jr", "content": "Something Justice Alito said in transcript."},
]
```

This should map to something like the following after applying the chat template (newlines added for readability):
```
<|begin_of_text|>
<|start_header_id|>
advocate
<|end_header_id|>
Advocate gives their opening statement
<|eot_id|>
<|start_header_id|>
justice_sonia_sotomayor
<|end_header_id|>
Sotomayor asks a question based on advocate's opening statement.
<|eot_id|>
<|start_header_id|>
advocate
<|end_header_id|>
Advocate says something in response to Sotomayor
<|eot_id|>
<|start_header_id|>
justice_samuel_a_alito_jr
<|end_header_id|>
Something Justice Alito said in transcript.
<|eot_id|>
```

In [None]:
from unsloth import FastLanguageModel

In [2]:
# Checkpoint to load; the alternatives are kept commented out for quick switching.
# MODEL_NAME = "Llama-3.3-70B-Instruct-bnb-4bit"
MODEL_NAME = "Meta-Llama-3.1-8B-Instruct-bnb-4bit"
# MODEL_NAME = "Qwen2.5-32B-bnb-4bit"

# NOTE(review): absolute, user-specific cache path -- adjust or make configurable when running elsewhere.
model_name = f"/scratch/gpfs/nnadeem/transformer_cache/{MODEL_NAME}/"
max_seq_length = 65536  # forwarded to from_pretrained as max_seq_length
dtype = None  # presumably lets Unsloth choose the dtype automatically -- TODO confirm
load_in_4bit = True  # forwarded to from_pretrained as load_in_4bit
# Load the model together with its tokenizer; only the tokenizer is needed for the template check below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.1.7: Fast Llama patching. Transformers: 4.48.1.
   \\   /|    GPU: NVIDIA A100 80GB PCIe MIG 1g.10gb. Max memory: 9.5 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1. CUDA: 8.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [16]:
def set_chat_template():
    '''
    Build the minimal Jinja chat template used for the transcript samples.

    Despite the name, this only returns the template string; the caller assigns it
    to the tokenizer. Each message is rendered as a header holding its role, a
    blank line, its content, and an end-of-turn token. No system prompt is added.

    @return -- Jinja template string
    '''
    message_header = "<|start_header_id|>{{ message['role'] }}<|end_header_id|>\n\n"
    message_body = "{{ message['content'] }}<|eot_id|>"
    return (
        "<|begin_of_text|>"
        "{%- for message in messages %}"
        + message_header
        + message_body
        + "{%- endfor %}"
    )
# Replace the tokenizer's chat template with the minimal template returned by set_chat_template()
tokenizer.chat_template = set_chat_template()

In [22]:
# Toy conversation using the same role naming as the training samples
chat = [
    {"role": "advocate", "content": "Advocate gives their opening statement"},
    {"role": "justice_sonia_sotomayor", "content": "Sotomayor asks a question based on advocate's opening statement."},
    {"role": "advocate", "content": "Advocate says something in response to Sotomayor"},
    {"role": "justice_samuel_a_alito_jr", "content": "Something Justice Alito said in transcript."},
]

# tokenize=False is passed so the template is rendered without tokenizing and can be inspected
templated_chat = tokenizer.apply_chat_template(chat, tokenize=False)
# templated_chat  # uncomment to display the rendered result

**Note**: Rather than using the custom `set_chat_template()` function defined above, we could alternatively modify the model's original chat template so that it no longer inserts, by default, a system prompt containing today's date. For now I used a simpler template for clarity.

For reference, original chat template for `unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit` is:
```
{{- bos_token }}\n{%- if custom_tools is defined %}\n    {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n    {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n    {%- set date_string = "26 Jul 2024" %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0][\'role\'] == \'system\' %}\n    {%- set system_message = messages[0][\'content\']|trim %}\n    {%- set messages = messages[1:] %}\n{%- else %}\n    {%- set system_message = "" %}\n{%- endif %}\n\n{#- System message + builtin tools #}\n{{- "<|start_header_id|>system<|end_header_id|>\\n\\n" }}\n{%- if builtin_tools is defined or tools is not none %}\n    {{- "Environment: ipython\\n" }}\n{%- endif %}\n{%- if builtin_tools is defined %}\n    {{- "Tools: " + builtin_tools | reject(\'equalto\', \'code_interpreter\') | join(", ") + "\\n\\n"}}\n{%- endif %}\n{{- "Cutting Knowledge Date: December 2023\\n" }}\n{{- "Today Date: " + date_string + "\\n\\n" }}\n{%- if tools is not none and not tools_in_user_message %}\n    {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." 
}}\n    {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n    {{- "Do not use variables.\\n\\n" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- "\\n\\n" }}\n    {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- "<|eot_id|>" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n    {#- Extract the first user message so we can plug it in here #}\n    {%- if messages | length != 0 %}\n        {%- set first_user_message = messages[0][\'content\']|trim %}\n        {%- set messages = messages[1:] %}\n    {%- else %}\n        {{- raise_exception("Cannot put tools in the first user message when there\'s no first user message!") }}\n{%- endif %}\n    {{- \'<|start_header_id|>user<|end_header_id|>\\n\\n\' -}}\n    {{- "Given the following functions, please respond with a JSON for a function call " }}\n    {{- "with its proper arguments that best answers the given prompt.\\n\\n" }}\n    {{- \'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.\' }}\n    {{- "Do not use variables.\\n\\n" }}\n    {%- for t in tools %}\n        {{- t | tojson(indent=4) }}\n        {{- "\\n\\n" }}\n    {%- endfor %}\n    {{- first_user_message + "<|eot_id|>"}}\n{%- endif %}\n\n{%- for message in messages %}\n    {%- if not (message.role == \'ipython\' or message.role == \'tool\' or \'tool_calls\' in message) %}\n        {{- \'<|start_header_id|>\' + message[\'role\'] + \'<|end_header_id|>\\n\\n\'+ message[\'content\'] | trim + \'<|eot_id|>\' }}\n    {%- elif \'tool_calls\' in message %}\n        {%- if not message.tool_calls|length == 1 %}\n            {{- raise_exception("This model only supports single tool-calls at once!") }}\n        {%- endif %}\n        {%- set tool_call = message.tool_calls[0].function %}\n        {%- if builtin_tools is defined and 
tool_call.name in builtin_tools %}\n            {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n            {{- "<|python_tag|>" + tool_call.name + ".call(" }}\n            {%- for arg_name, arg_val in tool_call.arguments | items %}\n                {{- arg_name + \'="\' + arg_val + \'"\' }}\n                {%- if not loop.last %}\n                    {{- ", " }}\n                {%- endif %}\n                {%- endfor %}\n            {{- ")" }}\n        {%- else  %}\n            {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' -}}\n            {{- \'{"name": "\' + tool_call.name + \'", \' }}\n            {{- \'"parameters": \' }}\n            {{- tool_call.arguments | tojson }}\n            {{- "}" }}\n        {%- endif %}\n        {%- if builtin_tools is defined %}\n            {#- This means we\'re in ipython mode #}\n            {{- "<|eom_id|>" }}\n        {%- else %}\n            {{- "<|eot_id|>" }}\n        {%- endif %}\n    {%- elif message.role == "tool" or message.role == "ipython" %}\n        {{- "<|start_header_id|>ipython<|end_header_id|>\\n\\n" }}\n        {%- if message.content is mapping or message.content is iterable %}\n            {{- message.content | tojson }}\n        {%- else %}\n            {{- message.content }}\n        {%- endif %}\n        {{- "<|eot_id|>" }}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- \'<|start_header_id|>assistant<|end_header_id|>\\n\\n\' }}\n{%- endif %}\n
```