## Cluster metacog labels using the Della-Inference API

In [3]:
import requests
import json
import os
import pandas as pd

In [4]:
# port = 12257 # CHANGE THIS TO THE PORT WHERE DELLA API SSH PORT FORWARDING IS SET UP
port = 59967
url = f"http://localhost:{port}/v1/chat/completions"

headers={
    "Content-Type": "application/json",
    "Authorization": "token-abc123"
}

model_name = 'meta-llama/Meta-Llama-3.1-70B-Instruct'

In [5]:
def get_metacog_cluster_prompt(properties):
    system_prompt = """I have a list of properties describing questions asked during Supreme Court oral arguments. Your task is to group similar properties into broader categories and provide a descriptive name for each category. The category name should be lower case letters only. If the category name has multiple words, join them with an underscore. Basically, you should be able to use the category name as a dictionary key in python.
    The output should be in JSON format for easy parsing. Each category should contain its name, a brief description and a list of the grouped properties in that category.
    
      ### Instructions:
        1. Group similar properties together based on shared themes or purposes.
        2. Name each category with a descriptive label.
        3. Return the result in JSON format structured as follows:

        {
            "category_name": "<Your Category Name>",
            "description": "<A brief description of the category>",
            "properties_in_category": [
                "property_1",
                "property_2",
                ...
            ],
            "category_name": "<Your Category Name>",
            "description": "<A brief description of the category>",
            "properties_in_category": [
                "property_1",
                "property_2",
                ...
            ],
        }
    """
    
    user_prompt = f"""### Your Task:
        Properties: 
        {properties}

        ### Output:
    """

    messages = [
            {
                "role": "system",
                "content": system_prompt,
            },
            {"role": "user", "content": user_prompt}
        ]
    return messages

In [6]:
def get_model_response(messages):

    payload = {
        "model": model_name,
        "messages": messages
    }

    response = requests.post(url, data=json.dumps(payload), headers=headers)
    return response

def parse_response(response):
    decoded = response.content.decode('utf-8')
    response_data = json.loads(decoded)
    content = response_data['choices'][0]['message']['content']

    # try:
    #     tags = json.dumps(content)
    # except Exception as e:
    #     print(f"Unable to jsonify response, saving string itself. ERROR: {e}")
    #     tags = str(content)
    # return tags
    return content

## Call model

In [7]:
def generate_clusters(property_list):
    messages = get_metacog_cluster_prompt(property_list)
    response = get_model_response(messages)
    tags = parse_response(response)
    return tags

In [21]:
def process_json_files(input_dir, out_dir):
    for file_name in os.listdir(input_dir):
        input_fp = os.path.join(input_dir, file_name)

        with open(input_fp, 'r') as f:
            properties = json.load(f)
        print(f"Read and processing response for {file_name}")


        # print(type(properties))
        # model_response = ['']
        model_response = generate_clusters(properties)


        out_fp = os.path.join(out_dir, f"cluster_raw_{file_name}")
        with open(out_fp, 'w') as f:
            json.dump(model_response, f, indent=4)

        print(f"Processed and saved response to {out_fp}")



In [22]:
input_directory = f'../analysis/metacog/compiled_chunks/'
output_directory = "../analysis/metacog/clustering_responses/"

# Run the processing
process_json_files(input_directory, output_directory)


Read and processing response for chunk_13.json
Processed and saved response to ../analysis/metacog/clustering_responses/cluster_raw_chunk_13.json
Read and processing response for chunk_11.json
Processed and saved response to ../analysis/metacog/clustering_responses/cluster_raw_chunk_11.json
Read and processing response for chunk_26.json
Processed and saved response to ../analysis/metacog/clustering_responses/cluster_raw_chunk_26.json
Read and processing response for chunk_29.json
Processed and saved response to ../analysis/metacog/clustering_responses/cluster_raw_chunk_29.json
Read and processing response for chunk_12.json
Processed and saved response to ../analysis/metacog/clustering_responses/cluster_raw_chunk_12.json
Read and processing response for chunk_2.json
Processed and saved response to ../analysis/metacog/clustering_responses/cluster_raw_chunk_2.json
Read and processing response for chunk_20.json
Processed and saved response to ../analysis/metacog/clustering_responses/cluste

## Cluster chunk level clustered labels

In [9]:
input_fp = '../analysis/metacog/metacog_clusters_compiled.json'
with open(input_fp, 'r') as f:
    properties = json.load(f)

In [10]:
model_response = generate_clusters(properties)

In [12]:
out_fp = '../analysis/metacog/metacog_clusters_all_data_raw.json'

In [13]:
with open(out_fp, 'w') as f:
    json.dump(model_response, f, indent=4)