# Create a Gemini training set

This notebook creates a JSONL file with all of the labeled preprint data. 

This will be used so that we can do supervised tuning of Gemini: <https://cloud.google.com/vertex-ai/generative-ai/docs/models/gemini-use-supervised-tuning#console> 

In [3]:
import json

# Schema for a single author entry: both name parts are mandatory strings.
_author_schema = {
    "type": "object",
    "properties": {
        "first": {"type": "string"},
        "last": {"type": "string"},
    },
    "required": ["first", "last"],
}

# JSON schema the model's answer has to satisfy. Only "title" and "authors"
# are required; "keywords" and "abstract" may be omitted. Key order is kept
# stable because the schema is serialized verbatim into the prompt text.
response_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "authors": {"type": "array", "items": _author_schema},
        "keywords": {"type": "array", "items": {"type": "string"}},
        "abstract": {"type": "string"},
    },
    "required": ["title", "authors"],
}

# Instruction text paired with every PDF in the training set. The response
# schema is embedded as compact JSON so the model sees the exact target shape.
#
# Newlines are kept as real newline characters. json.dumps() already escapes
# them when a row is serialized to JSONL, so escaping them here as well would
# leave a literal backslash-n sequence inside the prompt the model is tuned on.
# NOTE(review): the prompt sent at inference time should match this text
# exactly — confirm against the calling code.
prompt = f"""Format all responses as valid JSON.
Examine this article and extract the following data:
The title of the article.
The list of authors split into first and last name. The first name may include any initials or middle names as well. Use "first" and "last" as the JSON keys for the name.
The abstract if it is explicitly provided. Do not try to generate an abstract unless it is present.
The list of keywords if they are explicitly provided. Do not try to generate keywords unless they are present.

Respond with a JSON object formatted to comply with the following JSON schema:
{json.dumps(response_schema)}
This is permissioned content. I am the publisher. It is fair use under copyright law for me to request exact quotations.
    """

def create_row(control_document, file_uri, prompt_text=None, mime_type="application/pdf"):
    """Build one supervised-tuning example as a plain dict.

    The example is a two-turn conversation: a user turn carrying the source
    file reference followed by the instruction text, and a model turn
    carrying the expected answer.

    Args:
        control_document: Text stored verbatim as the model turn.
        file_uri: URI of the source document, stored in ``fileData.fileUri``.
        prompt_text: Instruction text for the user turn. When ``None`` (the
            default) the notebook-level ``prompt`` is used, which matches the
            original two-argument behaviour.
        mime_type: MIME type recorded for the file; defaults to
            ``"application/pdf"``.

    Returns:
        A dict with a single ``"contents"`` key holding the user turn and the
        model turn, in that order.
    """
    if prompt_text is None:
        # Resolved at call time so that re-running the prompt cell takes effect.
        prompt_text = prompt

    user_turn = {
        "role": "user",
        "parts": [
            {"fileData": {"mimeType": mime_type, "fileUri": file_uri}},
            {"text": prompt_text},
        ],
    }
    model_turn = {
        "role": "model",
        "parts": [{"text": control_document}],
    }
    return {"contents": [user_turn, model_turn]}
    


In [4]:
import csv

MANIFEST_CSV = 'records-48.csv'
OUTFILE = 'training-gemini-48.jsonl'

# Prefixes stripped from the manifest columns.
OPENALEX_PREFIX = 'https://openalex.org/'
GCS_HTTPS_PREFIX = 'https://storage.cloud.google.com'

# Write one JSONL training example per manifest row. Every file is opened with
# an explicit UTF-8 encoding so that non-ASCII text in the control documents
# is decoded the same way on every platform.
with open(OUTFILE, 'w', encoding='utf-8') as outfile, \
        open(MANIFEST_CSV, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # The bare OpenAlex identifier names the control file for this row.
        openalex_id = row['OpenAlex ID'].removeprefix(OPENALEX_PREFIX)

        # Fail loudly on an unexpected URL rather than emit a malformed
        # "gs:/https://..." URI into the training set.
        pdf_url = row['PDF URL']
        if not pdf_url.startswith(GCS_HTTPS_PREFIX + '/'):
            raise ValueError(
                f'{MANIFEST_CSV} line {reader.line_num}: PDF URL {pdf_url!r} '
                f'does not start with {GCS_HTTPS_PREFIX}/'
            )
        # The remainder keeps its leading "/", so "gs:/" + "/bucket/..."
        # yields "gs://bucket/...".
        file_uri = 'gs:/' + pdf_url.removeprefix(GCS_HTTPS_PREFIX)

        with open(f'control/{openalex_id}.json', 'r', encoding='utf-8') as control_file:
            # Collapse the control document onto a single line.
            control_document = control_file.read().replace('\n', '')

        outfile.write(json.dumps(create_row(control_document, file_uri)) + "\n")

