In [None]:
!pip install wikipedia
!pip insall pydantic
!pip install wikipedia-api


ERROR: unknown command "insall" - maybe you meant "install"
Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15384 sha256=b90ee00cf442ca157778af5c83d7614a2f43e2a653607e75ea92ac9ffe4bff4f
  Stored in directory: /root/.cache/pip/wheels/0b/0f/39/e8214ec038ccd5aeb8c82b957289f2f3ab2251febeae5c2860
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1


In [36]:
from pydantic import BaseModel, field_validator
import wikipediaapi
import re

class InstitutionInfo(BaseModel):
    name: str
    founder: str
    founded_year: int | None  # Use int or None to allow a missing value
    headquarters: str
    employees: str
    industry: str
    summary: str

    @field_validator('founded_year', mode='before')
    def parse_founded_year(cls, value):
        if isinstance(value, str):
            # Try to extract a valid year from the string (just the first 4 digits if present)
            match = re.search(r'\b(\d{4})\b', value)
            if match:
                return int(match.group(1))
        return None  # Return None if no valid year is found

def extract_info(content: str, keywords: list, default: str = "Not Available") -> str:
    """Search for keywords in text and return the relevant part of the sentence."""
    for keyword in keywords:
        match = re.search(rf"([^.]*{keyword}[^.]*\b\.)", content, re.IGNORECASE)  # Improved regex
        if match:
            return match.group(1).strip()
    return default

def fetch_institution_details(institution_name: str) -> InstitutionInfo:
    wiki = wikipediaapi.Wikipedia(
        language="en",
        user_agent="MyInstitutionFetcher/1.0 (contact@example.com)"
    )
    page = wiki.page(institution_name)

    if not page.exists():
        return InstitutionInfo(
            name=institution_name,
            founder="Not Available",
            founded_year=None,  # Return None when no valid year is found
            headquarters="Not Available",
            employees="Not Available",
            industry="Not Available",
            summary="No information found on Wikipedia."
        )

    content = page.text
    summary = " ".join(page.summary.split(".")[:2])  # Extract first two meaningful sentences

    # Improved Regex searches for better extraction
    founder = extract_info(content, ["founder", "founded by", "established by", "founders of"])
    founded_year = extract_info(content, ["founded", "established", "incorporated"])
    headquarters = extract_info(content, ["headquartered", "located in", "based in", "headquarters"])
    employees = extract_info(content, ["employees", "staff", "workforce", "team size", "company employees"])
    industry = extract_info(content, ["industry", "sector", "domain", "field of operation"])

    return InstitutionInfo(
        name=institution_name,
        founder=founder,
        founded_year=founded_year,
        headquarters=headquarters,
        employees=employees,
        industry=industry,
        summary=summary
    )

if __name__ == "__main__":
    institution_name = input("Enter Institution/Company Name: ")
    result = fetch_institution_details(institution_name)
    print(result.model_dump_json(indent=4))  # Using model_dump_json instead of json


Enter Institution/Company Name: Ramaiah Institute of Technology
{
    "name": "Ramaiah Institute of Technology",
    "founder": "Not Available",
    "founded_year": 1962,
    "headquarters": "Ramaiah Institute of Technology (MSRIT), is a private engineering college located in Bengaluru in the Indian state of Karnataka.",
    "employees": "Not Available",
    "industry": "Industry recognition\nKarnataka's first IBM Centre of Excellence was established on 19 March 2009 at RIT.",
    "summary": "Ramaiah Institute of Technology (RIT), formerly known as M S"
}
