In [1]:
pip install qwikidata pandas requests

Collecting qwikidata
  Downloading qwikidata-0.4.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... done
Collecting mypy-extensions (from qwikidata)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Building wheels for collected packages: qwikidata
  Building wheel for qwikidata (setup.py) ... done
  Created wheel for qwikidata: filename=qwikidata-0.4.2-py3-none-any.whl size=24867 sha256=b97779428e0130a1038b77211fff95b9b6b04548c24a51067abee49920085a2b
  Stored in directory: /root/.cache/pip/wheels/20/a2/85/3ca91fc8f95fa5be840fce552ac382bbcddaea6d2e31212ae5
Successfully built qwikidata
Installing collected packages: mypy-extensions, qwikidata
Successfully installed mypy-extensions-1.0.0 qwikidata-0.4.2


In [2]:
pip install googletrans==4.0.0-rc1


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... done
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.11.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->goog

In [None]:
import requests
from qwikidata.entity import WikidataItem
from qwikidata.linked_data_interface import get_entity_dict_from_api
import pandas as pd
from googletrans import Translator

# --- Configuration ---

# Wikidata Query Service endpoint that receives the SPARQL queries.
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"

# Occupation words searched for inside an entity description (extend as needed).
# The description matcher returns the first hit in list order, so the generic
# "scientist" entry is kept last.
OCCUPATION_KEYWORDS = [
    "physicist",
    "chemist",
    "mathematician",
    "biologist",
    "engineer",
    "astronomer",
    "scientist",
]

# Single googletrans client shared by every translation call in this notebook.
translator = Translator()

def execute_sparql_query(query, timeout=60):
    """Run a SPARQL query against the Wikidata Query Service.

    Args:
        query: SPARQL query text.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: The endpoint answered with a 4xx/5xx status
            (for example when the query is rejected or rate-limited).
        requests.Timeout: No response arrived within ``timeout`` seconds.
    """
    headers = {
        "User-Agent": "WikiDataQueryBot/0.1 (test@example.org)"
    }
    response = requests.get(
        WIKIDATA_SPARQL_URL,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of an opaque JSON decode error when
    # the service returns an error page.
    response.raise_for_status()
    return response.json()

def get_scientist_data(limit=10):
    """Query Wikidata for scientists together with their awards and education.

    The query selects humans (Q5) whose occupation is scientist (Q901) and who
    have a birth date. Death date, birth place, awards (P166) and educational
    institutions (P69) are optional; awards and institutions are each collapsed
    into a single "; "-separated string of English labels.

    Args:
        limit: Maximum number of result rows to request. Must be convertible to
            a non-negative integer.

    Returns:
        The list of result bindings from the SPARQL JSON response.

    Raises:
        ValueError: ``limit`` is not a non-negative integer.
    """
    # The limit is spliced into the query text, so make sure it really is a
    # plain non-negative integer before building the query.
    try:
        limit = int(limit)
    except (TypeError, ValueError) as exc:
        raise ValueError(f"limit must be a non-negative integer, got {limit!r}") from exc
    if limit < 0:
        raise ValueError(f"limit must be a non-negative integer, got {limit!r}")

    query = f"""
    SELECT ?scientist ?scientistLabel ?birthDate ?deathDate ?birthPlaceLabel
           (GROUP_CONCAT(DISTINCT ?awardLabel; SEPARATOR="; ") AS ?awards)
           (GROUP_CONCAT(DISTINCT ?educationInstitutionLabel; SEPARATOR="; ") AS ?educationInstitutions)
    WHERE {{
      ?scientist wdt:P31 wd:Q5 ;  # Entity must be a human (Q5)
                wdt:P106 wd:Q901 ;  # Must have the occupation of scientist (Q901)
                wdt:P569 ?birthDate .  # Birth date (P569)
      OPTIONAL {{ ?scientist wdt:P570 ?deathDate. }}  # Date of death (P570)
      OPTIONAL {{ ?scientist wdt:P19 ?birthPlace. }}

      # Fetch awards
      OPTIONAL {{
        ?scientist wdt:P166 ?award.
        ?award rdfs:label ?awardLabel.
        FILTER(LANG(?awardLabel) = "en")
      }}

      # Fetch educational institutions
      OPTIONAL {{
        ?scientist wdt:P69 ?educationInstitution.
        ?educationInstitution rdfs:label ?educationInstitutionLabel.
        FILTER(LANG(?educationInstitutionLabel) = "en")
      }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    GROUP BY ?scientist ?scientistLabel ?birthDate ?deathDate ?birthPlaceLabel
    LIMIT {limit}
    """
    data = execute_sparql_query(query)
    return data['results']['bindings']

def get_entity_details(qid):
    """Fetch a Wikidata entity by QID and return its English text fields.

    Args:
        qid: Wikidata entity identifier, passed to ``get_entity_dict_from_api``.

    Returns:
        A dict with the keys 'description', 'aliases' and 'label', each taken
        from the entity for language 'en'.
    """
    language = 'en'
    wikidata_item = WikidataItem(get_entity_dict_from_api(qid))

    details = {}
    details['description'] = wikidata_item.get_description(language)
    details['aliases'] = wikidata_item.get_aliases(language)
    details['label'] = wikidata_item.get_label(language)
    return details

def extract_occupation_from_description(description):
    """Guess an occupation from a free-text entity description.

    The description is lower-cased and scanned for each entry of
    ``OCCUPATION_KEYWORDS`` in list order; the first keyword found as a
    substring wins.

    Args:
        description: Description text. May be empty or ``None``.

    Returns:
        The matched keyword, capitalized, or "Unknown" when the description is
        empty/``None`` or contains none of the keywords.
    """
    # A missing description cannot match anything; avoid calling .lower() on None.
    if not description:
        return "Unknown"

    description_lower = description.lower()
    return next(
        (keyword.capitalize() for keyword in OCCUPATION_KEYWORDS if keyword in description_lower),
        "Unknown",
    )

def translate_to_hindi(text):
    """Translate ``text`` to Hindi with the shared googletrans client.

    Falsy values and the "Unknown" placeholder are returned untouched. If the
    translation call raises, the error is printed and the original text is
    returned, so one failed call does not abort the run.
    """
    # Nothing to translate: pass the value straight through.
    if not text or text == "Unknown":
        return text

    try:
        return translator.translate(text, dest='hi').text
    except Exception as err:
        print(f"Translation error: {err}")
        return text

# Maps the ASCII digits 0-9 to the Devanagari digits U+0966..U+096F.
_DEVANAGARI_DIGITS = {ord(str(digit)): chr(0x0966 + digit) for digit in range(10)}


def clean_date(date_string):
    """Return the year of an ISO-style date, written in Devanagari digits.

    The input is expected to look like ``YYYY-MM-DD`` optionally followed by a
    ``T...`` time part, e.g. ``1879-03-14T00:00:00Z``. Only the year is kept.
    A leading ``-`` or ``+`` sign on the year (as used for BCE dates) is
    preserved in front of the converted digits.

    Args:
        date_string: The date text, or the placeholder 'Unknown'.

    Returns:
        'Unknown' for the placeholder; the year in Devanagari digits for a
        well-formed date; otherwise the input value unchanged.
    """
    if date_string == 'Unknown':
        return 'Unknown'
    if not isinstance(date_string, str):
        return date_string

    # Drop the time component, keeping only the calendar date.
    date_part = date_string.split('T')[0]

    # Peel off an explicit sign first: otherwise the '-' of a negative year is
    # mistaken for a field separator and the date is rejected as malformed.
    sign = ''
    if date_part[:1] in ('-', '+'):
        sign, date_part = date_part[0], date_part[1:]

    parts = date_part.split('-')
    if len(parts) != 3 or not parts[0].isdigit():
        # Not a year-month-day date: hand the value back untouched.
        return date_string

    return sign + parts[0].translate(_DEVANAGARI_DIGITS)

def process_and_save_scientist_data(scientist_data):
    """Enrich, translate and export the scientists returned by the SPARQL query.

    For every result binding this looks up the entity's description and aliases,
    derives an occupation from the description, translates the text fields to
    Hindi, and builds template sentences. Two CSV files are written to the
    working directory:

    * ``scientists_data_hindi.csv`` - one row of fields per scientist.
    * ``scientists_description_hindi.csv`` - name plus generated sentences.

    Records are keyed by QID, so two different people who share a label do not
    overwrite each other. Several bindings for the same QID still collapse to
    one row, with the last one winning.

    Args:
        scientist_data: Iterable of SPARQL result bindings; each must contain
            'scientist' and 'scientistLabel', the other fields are optional.
    """
    def binding_value(binding, key):
        """Return the non-empty value bound to ``key``, or 'Unknown'."""
        # GROUP_CONCAT yields an empty string rather than an unbound variable
        # when nothing matched, so empty values count as missing too.
        return binding.get(key, {}).get('value') or 'Unknown'

    scientists_by_qid = {}
    descriptions_by_qid = {}

    for scientist in scientist_data:
        qid = scientist['scientist']['value'].split('/')[-1]
        label = scientist['scientistLabel']['value']

        # Clean birth and death dates
        birth_date = clean_date(binding_value(scientist, 'birthDate'))
        death_date = clean_date(binding_value(scientist, 'deathDate'))

        birth_place = binding_value(scientist, 'birthPlaceLabel')

        # Aggregated awards and educational institutions
        awards = binding_value(scientist, 'awards')
        education_institutions = binding_value(scientist, 'educationInstitutions')

        # Get additional data from the Wikidata API
        extra_details = get_entity_details(qid)

        # A missing description becomes the 'Unknown' placeholder so that no
        # empty description sentence is generated later on.
        description = extra_details['description'] or 'Unknown'
        occupation_from_description = extract_occupation_from_description(description)

        # Translate values to Hindi
        name_hindi = translate_to_hindi(label)
        birth_place_hindi = translate_to_hindi(birth_place)
        occupation_hindi = translate_to_hindi(occupation_from_description)
        description_hindi = translate_to_hindi(description)
        awards_hindi = translate_to_hindi(awards)
        education_institutions_hindi = translate_to_hindi(education_institutions)

        # Translate aliases to Hindi, skipping blank entries
        aliases_hindi = [
            translate_to_hindi(alias)
            for alias in (extra_details['aliases'] or [])
            if alias
        ]

        scientist_info = {
            'QID': qid,
            'Name': name_hindi,
            'BirthDate': birth_date,
            'DeathDate': death_date,
            'BirthPlace': birth_place_hindi,
            'Occupation': occupation_hindi,
            'Description': description_hindi,
            'Aliases': aliases_hindi,
            'Awards': awards_hindi,
            'EducationalInstitutions': education_institutions_hindi
        }
        scientists_by_qid[qid] = scientist_info

        # Generate and store template sentences
        descriptions_by_qid[qid] = {
            'Name': name_hindi,
            'TemplateSentences': generate_template_sentences(scientist_info)
        }

    # Save the scientist data to a CSV file
    df = pd.DataFrame.from_dict(scientists_by_qid, orient='index')
    df.to_csv("scientists_data_hindi.csv", index=False, encoding='utf-8')
    print("Scientist data saved to scientists_data_hindi.csv")

    # Save the template sentences to another CSV file
    df_desc = pd.DataFrame.from_dict(descriptions_by_qid, orient='index')
    df_desc.to_csv("scientists_description_hindi.csv", index=False, encoding='utf-8')
    print("Scientist description templates saved to scientists_description_hindi.csv")

def generate_template_sentences(scientist_info):
    """Build a short Hindi narrative from the known fields of one scientist.

    A sentence is produced for each field that carries real information; fields
    that are missing, blank, or equal to the 'Unknown' placeholder are skipped.
    The birth sentence needs both the birth date and the birth place.

    Args:
        scientist_info: Dict with the keys 'Name', 'BirthDate', 'BirthPlace',
            'DeathDate', 'Occupation', 'Description', 'Awards',
            'EducationalInstitutions' and 'Aliases' (a list of strings).

    Returns:
        A list holding one string (all sentences joined by spaces), or an empty
        list when no sentence could be generated.
    """
    def is_known(value):
        """True when ``value`` is a non-blank string other than 'Unknown'."""
        return isinstance(value, str) and value.strip() not in ('', 'Unknown')

    name = scientist_info['Name']
    sentences_parts = []

    if is_known(scientist_info['BirthDate']) and is_known(scientist_info['BirthPlace']):
        sentences_parts.append(
            f"{name} का जन्म {scientist_info['BirthDate']} में {scientist_info['BirthPlace']} में हुआ था।"
        )

    if is_known(scientist_info['DeathDate']):
        sentences_parts.append(f"उनका निधन {scientist_info['DeathDate']} में हुआ।")

    if is_known(scientist_info['Occupation']):
        sentences_parts.append(f"{name} एक प्रसिद्ध {scientist_info['Occupation']} थे।")

    if is_known(scientist_info['Description']):
        sentences_parts.append(f"{name} के बारे में कहा जाता है: {scientist_info['Description']}।")

    if is_known(scientist_info['Awards']):
        sentences_parts.append(f"उन्हें {scientist_info['Awards']} से सम्मानित किया गया।")

    if is_known(scientist_info['EducationalInstitutions']):
        sentences_parts.append(
            f"{name} ने {scientist_info['EducationalInstitutions']} से अपनी शिक्षा प्राप्त की।"
        )

    # Aliases: keep only usable entries and join them with a plain comma
    # (the Arabic comma U+060C does not belong in Devanagari text).
    aliases = [alias for alias in (scientist_info['Aliases'] or []) if is_known(alias)]
    if aliases:
        aliases_str = ", ".join(aliases)
        sentences_parts.append(f"{name} को अन्य नामों से भी जाना जाता है, जैसे {aliases_str}।")

    # Combine the sentences into a single narrative string.
    template_sentences = []
    if sentences_parts:
        template_sentences.append(" ".join(sentences_parts))

    return template_sentences

def main(limit=20):
    """Run the whole pipeline: query Wikidata, then process and save the results.

    Args:
        limit: Maximum number of scientists to request from Wikidata. Raise it
            to collect more data; every extra row costs additional API and
            translation calls.
    """
    print("Querying Wikidata for scientists...")
    scientist_data = get_scientist_data(limit=limit)
    process_and_save_scientist_data(scientist_data)
    print("Data collection and preprocessing complete.")


if __name__ == "__main__":
    main()

Querying Wikidata for scientists...
