# Fine Tuning with LLaMA CPP

## Create Inventory Instance Training Set

In [1]:
import datetime
import json
import random
import string

from copy import deepcopy
from folioclient import FolioClient

folio_client = FolioClient(
    'https://okapi-bugfest-quesnelia.int.aws.folio.org',
    'fs09000000',
    'folio',
    'folio'
)

In [6]:
def _expand_property(**kwargs):
    instance: dict = kwargs["instance"]
    prop_name: str = kwargs["property_name"]
    type_id: str = kwargs["property_type_id"]
    type_lookups: dict = kwargs["type_lookups"]

    type_id_text = type_id.replace("Id", "Text")
    
    for row in instance.get(prop_name, []):
        if type_id not in row:
            continue
        instance_prop_type = row.pop(type_id)
        row[type_id_text] = type_lookups[type_id].get(instance_prop_type, "Unknown") 
        
    
def expand_references(instance: dict, ref_data_lookups: dict) -> list:
    for row in [
        ("contributors", "contributorNameTypeId"),
        ("contributors", "contributorTypeId"),
        ("classifications", "classificationTypeId"),
        ("identifiers", "identifierTypeId"),
        ("notes", "instanceNoteTypeId")]:
        _expand_property(
            instance=instance,
            property_name=row[0],
            property_type_id=row[1],
            type_lookups=ref_data_lookups)
    for row in ["instanceTypeId", "modeOfIssuanceId"]:
        if row in instance:
            prop_id = instance.pop(row)
            prop_text = row.replace("Id", "Text")
            instance[prop_text] = ref_data_lookups[row].get(prop_id, "Unknown")
    return instance


def folio_id_lookups(folio_client: FolioClient) -> dict:
    lookups = {}
    for row in [
        ("/classification-types", "classificationTypes", "classificationTypeId"),
        ("/contributor-name-types", "contributorNameTypes", "contributorNameTypeId"),
        ("/contributor-types", "contributorTypes", "contributorTypeId"),
        ("/identifier-types", "identifierTypes", "identifierTypeId"),
        ("/instance-note-types", "instanceNoteTypes", "instanceNoteTypeId"),
        ("/instance-statuses", "instanceStatuses", "statusId"),
        ("/instance-types", "instanceTypes", "instanceTypeId"),
        ("/modes-of-issuance", "issuanceModes", "modeOfIssuanceId")
    ]:
        folio_result_list = folio_client.folio_get(row[0], key=row[1], query_params={"limit": 500})
        lookups[row[2]] = {}
        for result in folio_result_list:
            lookups[row[2]][result["id"]] = result["name"]
    return lookups

keep_properties = ['classifications',
 'contributors',
 'identifiers',
 'instanceTypeId',
 'instanceType',
 'modeOfIssuanceId',
 'publication',
 'subjects',
 'title'
]

def normalize(instance, folio_lookups):
    mod_instance = deepcopy(instance)
    # Filter on keep properties
    for key,value in instance.items():
        if key not in keep_properties:
            mod_instance.pop(key)
    mod_instance = expand_references(mod_instance, folio_lookups)
    # Remove authorityIds
    for prop_name in ["contributors", "subjects"]:
        for row in mod_instance[prop_name]:
            if "authorityId" in row:
                row.pop("authorityId")
    return mod_instance

def create_prompt(instance, extra_properties=[]):
    prompt = f"For {instance['title']}"
    creators, editors_publishers, contributors = [], [], []
    for contributor in instance.get("contributors", []):
        if contributor['contributorTypeText'] in ["Artist", "Author", "Contributor"]:
            creators.append(contributor["name"])
            contributors.append(contributor)
        if contributor['contributorTypeText'] in ["Editor", "Narrator", "Publisher"]:
            editors_publishers.append(f"{contributor['contributorTypeText']} {contributor["name"]}")
            contributors.append(contributor)
    if len(creators) > 0:
        creator_string = ", ".join(creators)
        prompt = f"{prompt} by {creator_string}."
    if len(editors_publishers) > 0:
        editor_publisher_string = ", ".join(editors_publishers)
        prompt = f"{prompt}{editor_publisher_string}. "
    instance["contributors"] = contributors
    # Use first Publisher
    publication = instance.get("publication", [])
    if len(publication) > 0:
        publication_string = "Published"
        first_publisher = publication[0]
        if first_publisher['dateOfPublication'] is not None:
            publication_string += f"in {first_publisher['dateOfPublication']}"
        if first_publisher['publisher'] is not None:
            publication_string += f" by {first_publisher['publisher']}"
        if first_publisher.get('place') is not None:
            place = first_publisher.get('place','').translate(str.maketrans('', '', string.punctuation))
            publication_string += f", {place}"
        prompt = f"{prompt} {publication_string}"

    for prop in extra_properties:
        instance_properties = instance.get(prop['name'], [])
        if len(instance_properties) < 1:
            continue
        extra_prop_string = f"{prop['name'].capitalize()} {prop['predicate']} "
        extra_prop_values = []
        prefix = prop.get("prefix")
        for row in instance_properties:
            row_str = ""
            if prefix:
                row_str = f"{row.get(prefix)} "
            value_key = prop.get("value", "value")
            row_str += f"{row.get(value_key)}"
            extra_prop_values.append(row_str)
        extra_prop_string += ', '.join(extra_prop_values)
        prompt = f"{prompt} {extra_prop_string}."
    return prompt

In [3]:
lookups = folio_id_lookups(folio_client)

In [90]:
print(folio_client.folio_get("/inventory/instances").get("totalRecords"))

8328626


In [111]:
lookups["contributorTypeId"].get("d1249dbe-5f1c-4dc3-9192-239621c71bf4")

'Interviewee'

## Create 1k Sample

In [93]:
start = datetime.datetime.now(datetime.UTC)
print(f"Starting 1k sample for fine-tuning at {start}")
original_records = []
for i in range(1_000):
    offset = random.randint(1, 8328626) - 1
    instance = folio_client.folio_get("/inventory/instances", key="instances", query_params={ "limit": 1, "offset": offset})
    original_records.append(instance[0])
    if not i%100:
        print(f"{i}", end="")
    if not i%10 and i > 0:
        print(".", end="")
with open("sample-02024-09-16-original.jsonl", "w+") as fo:
    for record in original_records:
        fo.write(f"{json.dumps(record)}\n")

Starting 1k sample for fine-tuning at 2024-09-16 20:58:46.268627+00:00
0.........100..........200..........300..........400..........500..........600..........700..........800..........900..........Starting Sample for Training


TypeError: normalize() missing 1 required positional argument: 'folio_lookups'

In [95]:
unique_uuids = set([r['id'] for r in original_records])

In [4]:
with open("sample-02024-09-16-original.jsonl") as fo:
    original_records = [json.loads(line) for line in fo.readlines()]

In [7]:
start = datetime.datetime.now(datetime.UTC)
print(f"Starting Sample for Training at {start}")
with open("training-02024-09-16.jsonl", "w+") as fo:
    for record in original_records:
        record = normalize(record, lookups)
        extra_properties = []
        # For 40% of the time include subjects
        if "subjects" in record and random.random() >= .6:
            extra_properties.append(
                {
                    "name": "subjects",
                    "predicate": "are"
                }
            )
        else:
            # Remove subjects
            if "subjects" in record:
                record.pop("subjects")
        # For 20% of the time include identifiers
        if "identifiers" in record and random.random() >= .8:
            extra_properties.append(
                {
                    "name": "identifiers",
                    "predicate": "are",
                    "prefix": "identifierTypeText"
                }
            )
        else:
            if "identifiers" in record:
                record.pop("identifiers")
        # For 30% of the time include classification
        if "classifications" in record and random.random() >= .7:
            extra_properties.append(
                {
                    "name": "classifications",
                    "predicate": "are",
                    "prefix": "classificationTypeText",
                    "value": "classificationNumber"
                }
            )
        else:
            if "classifications" in record:
                record.pop("classifications")
        prompt = create_prompt(record, extra_properties)
        sample = { "prompt": prompt,
                   "record": record }
        fo.write(f"{json.dumps(sample)}\n")
end = datetime.datetime.now(datetime.UTC)
print(f"Finished at {end}, total time for 1k sample is {(end-start).seconds / 60.} minutes")

Starting Sample for Training at 2024-09-16 23:48:37.553512+00:00
Finished at 2024-09-16 23:48:37.619868+00:00, total time for 1k sample is 0.0 minutes


In [8]:
with open("training-02024-09-16.jsonl") as fo:
    training_records = [json.loads(line) for line in fo.readlines()]

In [122]:
"ISSN".capitalize()

'Issn'

In [109]:
for i,record in enumerate(training_records):
    for contributor in record.get("contributors",[]):
        if contributor['contributorTypeText'].startswith("Author"):
            print(i, end=" ")

In [114]:
frequency_contributors = { None: 0 }
for record in original_records:
    for contributor in record.get("contributors", []):
        contributor_type_id = contributor.get("contributorTypeId")
        if contributor_type_id is None:
            frequency_contributors[None] += 1
        contributor_type_text = lookups["contributorTypeId"].get(contributor_type_id)
        if contributor_type_text in frequency_contributors:
            frequency_contributors[contributor_type_text] += 1
        else:
            frequency_contributors[contributor_type_text] = 1

In [115]:
frequency_contributors

{None: 3004,
 'Interviewee': 1,
 'Interviewer': 1,
 'Compiler': 1,
 'Author': 78,
 'Addressee': 1,
 'Editor': 42,
 'Performer': 11,
 'Artist': 2,
 'Host institution': 8,
 'Issuing body': 7,
 'Composer': 2,
 'Lyricist': 1,
 'Conductor': 1,
 'Respondent': 1,
 'Other': 3,
 'Illustrator': 1,
 'Printer': 2,
 'Editor of compilation': 2,
 'Librettist': 2,
 'Translator': 2,
 'Instrumentalist': 1}

## Create training text

In [10]:
with open("training-02024-09-16.txt", "w+") as fo:
    for record in training_records:
        fo.write(
            f"""<SFT><s>[INST] <<SYS>\n You an expert cataloger. From this prompt, create a FOLIO instance JSON record.
<</SYS>>

{record['prompt']} [/INST] {record['record']} \n""")