# ⚙️ Generating files for fine-tuning from SPARQL endpoints

Using OpenAI JSONL schema: https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset

Questions:
* Which info can I get from the UniProt endpoint?
* Give me examples of queries to retrieve ...

In [1]:
import json
import re

from bs4 import BeautifulSoup
from SPARQLWrapper import JSON, SPARQLWrapper

# Earlier variants of the system prompt, kept for comparison between runs:
# system_prompt = "Expasy is an assistant that helps you query the databases from the Swiss Institute of Bioinformatics, such as UniProt or Bgee."
# system_prompt = "Expasy is an assistant that helps you query the databases from the Swiss Institute of Bioinformatics, such as UniProt or Bgee. It provides high-quality SPARQL queries to retrieve information from these databases, and that all prefixes are well defined"
system_prompt = "Expasy is an assistant that helps you query the databases from the Swiss Institute of Bioinformatics, such as UniProt or Bgee. It learns how to answer questions by using the questions/queries pairs provided from a catalog of examples."

# SPARQL endpoints whose example-query catalogs are turned into training data.
endpoints = [
    "https://sparql.uniprot.org/sparql/",
    "https://www.bgee.org/sparql/",
]

# Lists every example query (sh:select) together with its human-readable
# description (rdfs:comment) published by the endpoint.
get_queries = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?comment ?query
WHERE
{
    ?sq a sh:SPARQLExecutable ;
        rdfs:comment ?comment ;
        sh:select ?query .
}"""

# Lists the prefix -> namespace declarations published by the endpoint.
get_prefixes = """PREFIX sh: <http://www.w3.org/ns/shacl#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?prefix ?namespace
WHERE {
    [] sh:namespace ?namespace ;
        sh:prefix ?prefix .
} ORDER BY ?prefix"""


def remove_a_tags(html_text: str) -> str:
    """Return the plain text of a query description, with <a> tags unwrapped.

    Args:
        html_text: The description as returned by the endpoint; it may
            contain HTML markup such as links.

    Returns:
        The text content of ``html_text`` with every ``<a>`` element
        replaced by its link text.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    for a_tag in soup.find_all("a"):
        a_tag.replace_with(a_tag.text)
    return soup.get_text()


def add_missing_prefixes(query: str, prefix_map: dict) -> str:
    """Prepend a PREFIX declaration for each prefix the query uses but lacks.

    Args:
        query: The SPARQL query text.
        prefix_map: Mapping of prefix label (without the colon) to its
            namespace IRI.

    Returns:
        ``query`` with one ``PREFIX label: <namespace>`` line prepended for
        every label of ``prefix_map`` that appears in the query as
        ``label:`` and is not already declared in it.
    """
    for prefix, namespace in prefix_map.items():
        # Labels and namespaces are data, not patterns: escape them so that
        # characters such as "." or "+" are matched literally.
        escaped = re.escape(prefix)
        # The PREFIX keyword is case-insensitive in SPARQL; a label the query
        # already declares must not be declared a second time.
        already_declared = re.search(
            rf"\bPREFIX\s+{escaped}:", query, re.IGNORECASE
        )
        # A use of the label is "label:" not glued to a preceding name
        # character, so it is also found after a tab, a newline, "^^", "(",
        # "/", "|" or at the very start of the query. Over-matching only adds
        # a harmless unused declaration; under-matching would leave the query
        # unusable.
        is_used = re.search(rf"(?<![\w-]){escaped}:", query)
        if is_used and not already_declared:
            query = f"PREFIX {prefix}: <{namespace}>\n{query}"
    return query


def fetch_bindings(sparql_endpoint, query: str) -> list:
    """Run ``query`` on ``sparql_endpoint`` and return its result bindings."""
    sparql_endpoint.setQuery(query)
    results = sparql_endpoint.query().convert()
    return results["results"]["bindings"]


def build_training_lines(endpoint_url: str) -> list:
    """Build one JSONL training line per example query of an endpoint.

    Args:
        endpoint_url: URL of the SPARQL endpoint to read the examples from.

    Returns:
        A list of newline-terminated JSON strings, each holding a
        system/user/assistant ``messages`` conversation.
    """
    sparql_endpoint = SPARQLWrapper(endpoint_url)
    sparql_endpoint.setReturnFormat(JSON)

    prefix_map = {
        row["prefix"]["value"]: row["namespace"]["value"]
        for row in fetch_bindings(sparql_endpoint, get_prefixes)
    }
    # print(f"Found {len(prefix_map)} prefixes")

    example_rows = fetch_bindings(sparql_endpoint, get_queries)
    print(f"Found {len(example_rows)} queries")

    lines = []
    for row in example_rows:
        query = add_missing_prefixes(row["query"]["value"], prefix_map)
        bot_resp = f"This question can be answered by executing the query below on the endpoint available at {endpoint_url}:\n\n```sparql\n{query}\n```"
        lines.append(
            json.dumps(
                {
                    "messages": [
                        {"role": "system", "content": system_prompt},
                        {
                            "role": "user",
                            "content": remove_a_tags(row["comment"]["value"]),
                        },
                        {"role": "assistant", "content": bot_resp},
                    ]
                }
            )
            + "\n"
        )
    return lines


# In a notebook cell or a script __name__ is "__main__", so this still runs;
# the guard only keeps the network calls out of a plain import of the helpers.
if __name__ == "__main__":
    jsonl_lines = []
    for endpoint_url in endpoints:
        print(f"Processing endpoint {endpoint_url}")
        jsonl_lines.extend(build_training_lines(endpoint_url))
    jsonl_str: str = "".join(jsonl_lines)
    # print(jsonl_str)

    with open("../data/finetuning_queries.jsonl", "w", encoding="utf-8") as f:
        f.write(jsonl_str)

Processing endpoint https://sparql.uniprot.org/sparql/
Found 51 queries
Processing endpoint https://www.bgee.org/sparql/
Found 0 queries


  soup = BeautifulSoup(html_text, "html.parser")


In [5]:
from openai import OpenAI

# Client used for the upload and the fine-tuning job below.
client = OpenAI()

# Safety switch: leave False to run the notebook without uploading data or
# starting a fine-tuning job.
do_fine_tune = False

if do_fine_tune:
    # Upload the training file; the context manager closes the local file
    # handle once the upload call returns.
    with open("../data/finetuning_queries.jsonl", "rb") as training_data:
        file = client.files.create(file=training_data, purpose="fine-tune")
    print(file.id)

    ft_job = client.fine_tuning.jobs.create(
        training_file=file.id,
        model="gpt-3.5-turbo-1106",
        # model="gpt-3.5-turbo-0125",
        hyperparameters={
            "n_epochs": 20,
            "batch_size": 1,
            # The fine-tuning API names this knob "learning_rate_multiplier";
            # "learning_rate" is not one of its hyperparameters.
            "learning_rate_multiplier": 0.1,
        },
    )
    print(ft_job)

file-6IH2oUWN9xTmCgtkmZe72GBy
