In [1]:
%load_ext autoreload
%autoreload 2
    
import warnings
warnings.filterwarnings('ignore')

import json
import os

from gigasmol.gigachat_api.api_model import GigaChat

In [2]:
# Path of the JSON file that provides the 'client_id' and 'client_secret'
# values passed to GigaChat below. Set the GIGASMOL_CREDENTIALS environment
# variable to point at your own file; when it is unset or empty the original
# author's local path is used, so existing setups keep working unchanged.
credentials_path = (
    os.environ.get("GIGASMOL_CREDENTIALS")
    or '/Users/potemin/work_main/gigasmol/credentials.json'
)

# Read through a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open(credentials_path, encoding="utf-8") as credentials_file:
    credentials = json.load(credentials_file)

# Client used by every request further down in the notebook.
model = GigaChat(
    model_name="GigaChat-Max",
    # Alternative endpoint noted by the author:
    # "https://gigachat-preview.devices.sberbank.ru/api/v1/"
    api_endpoint="https://gigachat.devices.sberbank.ru/api/v1/",
    # NOTE(review): near-zero temperature together with top_p=0.0 presumably
    # aims at repeatable output - confirm against the GigaChat API docs.
    temperature=0.0000001,
    top_p=0.0,
    repetition_penalty=1.1,
    max_tokens=1024,
    profanity_check=False,
    client_id=credentials['client_id'],
    client_secret=credentials['client_secret'],
    auth_scope="GIGACHAT_API_CORP",
)

In [3]:
def _text_field(description):
    """Return a string-typed property carrying ``description``."""
    return {"type": "string", "description": description}


def _text_list(description):
    """Return an array-of-strings property carrying ``description``."""
    return {
        "type": "array",
        "description": description,
        "items": {"type": "string"},
    }


def _record_list(description, properties, required):
    """Return an array property whose items are objects with ``properties``.

    ``required`` lists the property names every item must contain.
    """
    return {
        "type": "array",
        "description": description,
        "items": {
            "type": "object",
            "properties": properties,
            "required": required,
        },
    }


# Function description handed to the model: three top-level arrays
# (persons, organizations, locations), each holding objects with one
# mandatory naming field and a few optional detail fields.
schema = {
    "name": "extract_complex_entities",
    "description": "Extracts complex, nested entity information from text.",
    "parameters": {
        "type": "object",
        "properties": {
            "persons": _record_list(
                "List of persons found in the text with nested properties.",
                {
                    "name": _text_field("Full name of the person"),
                    "birthPlace": _text_field("Birthplace of the person"),
                    "roles": _text_list("Roles or titles the person has held"),
                    "education": _text_list(
                        "List of institutions where the person studied"
                    ),
                },
                ["name"],
            ),
            "organizations": _record_list(
                "List of organizations with additional details.",
                {
                    "orgName": _text_field("Name of the organization"),
                    "orgType": _text_field(
                        "Type of organization (e.g., university, government, company)"
                    ),
                    "location": _text_field("Location of the organization"),
                },
                ["orgName"],
            ),
            "locations": _record_list(
                "List of location entities with details.",
                {
                    "placeName": _text_field("Name of the place"),
                    "country": _text_field("Country of the place"),
                },
                ["placeName"],
            ),
        },
        "required": ["persons", "organizations", "locations"],
    },
}

In [4]:
# System message: fixes the assistant's role and the JSON-only output format.
system = "You are a helpful assistant that extracts complex entity information from text and returns ONLY JSON."

# Sample passage to run the extraction on. It is kept verbatim, including
# the typographic quotes and the tab-indented second and third paragraphs.
text = """
“Bill Gates, the co-founder of Microsoft, was born in Seattle, Washington, in 1955. He initially enrolled at Harvard University to study pre-law, but he shifted his focus to mathematics and computer science before dropping out. Along with Paul Allen, he founded Microsoft in 1975, which rapidly grew into one of the world’s largest software companies. Over the years, Bill Gates became a notable philanthropist through the Bill & Melinda Gates Foundation, focusing on global health and education initiatives.

	Meanwhile, Mark Zuckerberg, born in 1984, is the co-founder and CEO of Facebook, now known as Meta. He developed the initial version of the platform while studying computer science at Harvard University, though he never completed his degree. Under his leadership, Facebook rapidly expanded into a worldwide social media giant. Zuckerberg also launched the Chan Zuckerberg Initiative in collaboration with his wife, Dr. Priscilla Chan, to tackle issues related to education, healthcare, and scientific research.

	On the other hand, Elon Musk, born in Pretoria, South Africa, in 1971, is known for founding SpaceX and co-founding Tesla. After moving to Canada, he later studied at the University of Pennsylvania in the United States, where he earned degrees in physics and economics. Musk’s ventures have often focused on emerging technologies and innovation—from electric vehicles and solar energy solutions at Tesla to space exploration and rocket technology at SpaceX. In recent years, he has also been actively involved in artificial intelligence research, among other futuristic endeavors.”
"""

# User message layout: heading, the passage, a bullet list of what to
# extract, a blank line, then the output-format rules as one paragraph.
prompt = (
    "The user provides the following text:\n\n"
    "{passage}\n"
    "Extract the following complex information:\n"
    "{wanted}\n\n"
    "{rules}"
).format(
    passage=text,
    # One bullet per entity family, newline-separated.
    wanted="\n".join([
        " - persons: with name, birthPlace, roles (array of strings), and education (array of schools),",
        " - organizations: with orgName, orgType, and location,",
        " - locations: with placeName, country.",
    ]),
    # Format rules, joined by single spaces into one paragraph.
    rules=" ".join([
        "Return ONLY JSON containing arrays for 'persons', 'organizations', and 'locations'.",
        "Each 'person' should have nested fields such as 'name', 'birthPlace', 'roles',",
        "and an array of 'education' items. Each 'organization' has 'orgName', 'orgType', 'location'.",
        "Each 'location' has 'placeName' and 'country'.",
    ]),
)

In [5]:
# Conversation sent to the model: the system instruction first, then the
# user prompt that embeds the passage.
messages = [
    {"role": speaker, "content": body}
    for speaker, body in (("system", system), ("user", prompt))
]

In [6]:
# Request a call to the extraction function. The forced function name is
# taken from the schema itself rather than repeated as a second string
# literal, so renaming the function in `schema` cannot leave this call
# pointing at a name that is no longer declared in `functions`.
response_complete = model.chat(
    messages=messages,
    functions=[schema],
    function_call={"name": schema["name"]},
)

In [7]:
# Show the arguments of the function call carried by the first choice.
first_choice = response_complete["response"]["choices"][0]
first_choice["message"]["function_call"]["arguments"]

{'locations': [{'country': 'United States',
   'placeName': 'Seattle, Washington'},
  {'country': 'United States', 'placeName': 'White Plains, New York'},
  {'country': 'South Africa', 'placeName': 'Pretoria, South Africa'},
  {'country': 'United States', 'placeName': 'Menlo Park, California'},
  {'country': 'United States', 'placeName': 'Hawthorne, California'},
  {'country': 'United States', 'placeName': 'Palo Alto, California'}],
 'organizations': [{'location': 'Seattle, Washington',
   'orgName': 'Microsoft',
   'orgType': 'software company'},
  {'location': 'Menlo Park, California',
   'orgName': 'Facebook (Meta)',
   'orgType': 'social media company'},
  {'location': 'Hawthorne, California',
   'orgName': 'SpaceX',
   'orgType': 'space exploration company'},
  {'location': 'Palo Alto, California',
   'orgName': 'Tesla',
   'orgType': 'electric vehicle and clean energy company'}],
 'persons': [{'birthPlace': 'Seattle, Washington',
   'education': ['Harvard University'],
   'name':