In [3]:
pip install qwikidata pandas requests

Collecting qwikidata
  Downloading qwikidata-0.4.2.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mypy-extensions (from qwikidata)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Building wheels for collected packages: qwikidata
  Building wheel for qwikidata (setup.py) ... [?25l[?25hdone
  Created wheel for qwikidata: filename=qwikidata-0.4.2-py3-none-any.whl size=24867 sha256=13045007b9c11a6ae79a34f37e8f633236fa90fdfd8e0a434dd4f3902170b4c3
  Stored in directory: /root/.cache/pip/wheels/20/a2/85/3ca91fc8f95fa5be840fce552ac382bbcddaea6d2e31212ae5
Successfully built qwikidata
Installing collected packages: mypy-extensions, qwikidata
Successfully installed mypy-extensions-1.0.0 qwikidata-0.4.2


In [14]:
pip install googletrans==4.0.0-rc1


Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.9.1-py3-none-any.whl.metadata (2.1 kB)
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting rfc3986<2,>=1.3 (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googl

In [1]:
import requests
from qwikidata.entity import WikidataItem
from qwikidata.linked_data_interface import get_entity_dict_from_api
import pandas as pd
from googletrans import Translator

# SPARQL endpoint for Wikidata Query Service
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
translator = Translator()  # Initialize the Google Translate API

# List of possible occupations (you can expand this list)
OCCUPATION_KEYWORDS = ["physicist", "chemist", "mathematician", "biologist", "engineer", "astronomer", "scientist"]

# Function to execute a SPARQL query
def execute_sparql_query(query):
    headers = {
        "User-Agent": "WikiDataQueryBot/0.1 (test@example.org)"
    }
    response = requests.get(WIKIDATA_SPARQL_URL, params={'query': query, 'format': 'json'}, headers=headers)
    return response.json()

# Function to extract scientist data
def get_scientist_data(limit=10):
    query = f"""
    SELECT ?scientist ?scientistLabel ?birthDate ?birthPlaceLabel WHERE {{
      ?scientist wdt:P31 wd:Q5 ;  # Entity must be a human (Q5)
                wdt:P106 wd:Q901 ;  # Must have the occupation of scientist (Q901)
                wdt:P569 ?birthDate .
      OPTIONAL {{ ?scientist wdt:P19 ?birthPlace. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }} LIMIT {limit}
    """

    data = execute_sparql_query(query)
    return data['results']['bindings']

# Function to fetch additional details from Wikidata API
def get_entity_details(qid):
    entity_dict = get_entity_dict_from_api(qid)
    item = WikidataItem(entity_dict)
    return {
        'description': item.get_description('en'),
        'aliases': item.get_aliases('en'),
        'label': item.get_label('en')
    }

# Function to extract occupation from the description
def extract_occupation_from_description(description):
    description_lower = description.lower()  # Convert description to lowercase for comparison
    for keyword in OCCUPATION_KEYWORDS:
        if keyword in description_lower:
            return keyword.capitalize()  # Return the matched occupation
    return "Unknown"

# Function to translate text to Hindi using Google Translate
def translate_to_hindi(text):
    if text and text != "Unknown":
        translation = translator.translate(text, dest='hi')  # Translate to Hindi
        return translation.text
    return text

# Process and save data
def process_and_save_scientist_data(scientist_data):
    # Initialize a dictionary to store scientist details
    scientists_dict = {}

    for scientist in scientist_data:
        qid = scientist['scientist']['value'].split('/')[-1]
        label = scientist['scientistLabel']['value']
        birth_date = scientist.get('birthDate', {}).get('value', 'Unknown')
        birth_place = scientist.get('birthPlaceLabel', {}).get('value', 'Unknown')

        # Get additional data from Wikidata API
        extra_details = get_entity_details(qid)

        # Extract occupation from the description
        description = extra_details['description']
        occupation_from_description = extract_occupation_from_description(description)

        # Translate values to Hindi
        name_hindi = translate_to_hindi(label)
        birth_place_hindi = translate_to_hindi(birth_place)
        occupation_hindi = translate_to_hindi(occupation_from_description)
        description_hindi = translate_to_hindi(description)

        # Create a nested dictionary for each scientist
        scientists_dict[label] = {
            'QID': qid,
            'Name': name_hindi,  # Translated name
            'BirthDate': birth_date,
            'BirthPlace': birth_place_hindi,  # Translated birth place
            'Occupation': occupation_hindi,  # Translated occupation
            'Description': description_hindi,  # Translated description
            'Aliases': extra_details['aliases']
        }

    # Save the nested dictionary to a CSV file
    df = pd.DataFrame.from_dict(scientists_dict, orient='index')
    df.to_csv("scientists_data_hindi.csv", index=False)
    print("Data saved to scientists_data_hindi.csv")

# Main function to run the data collection and preprocessing process
def main():
    print("Querying Wikidata for scientists...")
    scientist_data = get_scientist_data(limit=20)  # You can increase the limit
    process_and_save_scientist_data(scientist_data)
    print("Data collection and preprocessing complete.")

if __name__ == "__main__":
    main()


Querying Wikidata for scientists...
Data saved to scientists_data_hindi.csv
Data collection and preprocessing complete.


In [2]:
import requests
from qwikidata.entity import WikidataItem
from qwikidata.linked_data_interface import get_entity_dict_from_api
import pandas as pd
from googletrans import Translator

# SPARQL endpoint for Wikidata Query Service
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
translator = Translator()  # Initialize the Google Translate API

# List of possible occupations (you can expand this list)
OCCUPATION_KEYWORDS = ["physicist", "chemist", "mathematician", "biologist", "engineer", "astronomer", "scientist"]

# Function to execute a SPARQL query
def execute_sparql_query(query):
    headers = {
        "User-Agent": "WikiDataQueryBot/0.1 (test@example.org)"
    }
    response = requests.get(WIKIDATA_SPARQL_URL, params={'query': query, 'format': 'json'}, headers=headers)
    return response.json()

# Function to extract scientist data
def get_scientist_data(limit=10):
    query = f"""
    SELECT ?scientist ?scientistLabel ?birthDate ?birthPlaceLabel WHERE {{
      ?scientist wdt:P31 wd:Q5 ;  # Entity must be a human (Q5)
                wdt:P106 wd:Q901 ;  # Must have the occupation of scientist (Q901)
                wdt:P569 ?birthDate .
      OPTIONAL {{ ?scientist wdt:P19 ?birthPlace. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }} LIMIT {limit}
    """

    data = execute_sparql_query(query)
    return data['results']['bindings']

# Function to fetch additional details from Wikidata API
def get_entity_details(qid):
    entity_dict = get_entity_dict_from_api(qid)
    item = WikidataItem(entity_dict)
    return {
        'description': item.get_description('en'),
        'aliases': item.get_aliases('en'),
        'label': item.get_label('en')
    }

# Function to extract occupation from the description
def extract_occupation_from_description(description):
    description_lower = description.lower()  # Convert description to lowercase for comparison
    for keyword in OCCUPATION_KEYWORDS:
        if keyword in description_lower:
            return keyword.capitalize()  # Return the matched occupation
    return "Unknown"

# Function to translate text to Hindi using Google Translate
def translate_to_hindi(text):
    if text and text != "Unknown":
        translation = translator.translate(text, dest='hi')  # Translate to Hindi
        return translation.text
    return text

# Function to generate template sentences for each scientist
def generate_template_sentences(scientist_info):
    # Template sentence generation
    template_sentences = []

    # Generate sentences based on the scientist info
    template_sentences.append(f"{scientist_info['Name']} का जन्म {scientist_info['BirthDate']} को {scientist_info['BirthPlace']} में हुआ था।")
    template_sentences.append(f"{scientist_info['Name']} एक प्रसिद्ध {scientist_info['Occupation']} थे।")
    template_sentences.append(f"{scientist_info['Name']} के बारे में कहा जाता है: {scientist_info['Description']}.")

    # Combine sentences to make a coherent narrative
    combined_sentence = (
        f"{scientist_info['Name']} का जन्म {scientist_info['BirthDate']} को {scientist_info['BirthPlace']} में हुआ था। "
        f"{scientist_info['Name']} एक प्रसिद्ध {scientist_info['Occupation']} थे। "
        f"{scientist_info['Name']} के बारे में कहा जाता है: {scientist_info['Description']}."
    )

    # Add combined sentence to the list
    template_sentences.append(combined_sentence)

    return template_sentences

# Process and save data
def process_and_save_scientist_data(scientist_data):
    # Initialize a dictionary to store scientist details
    scientists_dict = {}

    for scientist in scientist_data:
        qid = scientist['scientist']['value'].split('/')[-1]
        label = scientist['scientistLabel']['value']
        birth_date = scientist.get('birthDate', {}).get('value', 'Unknown')
        birth_place = scientist.get('birthPlaceLabel', {}).get('value', 'Unknown')

        # Get additional data from Wikidata API
        extra_details = get_entity_details(qid)

        # Extract occupation from the description
        description = extra_details['description']
        occupation_from_description = extract_occupation_from_description(description)

        # Translate values to Hindi
        name_hindi = translate_to_hindi(label)
        birth_place_hindi = translate_to_hindi(birth_place)
        occupation_hindi = translate_to_hindi(occupation_from_description)
        description_hindi = translate_to_hindi(description)

        # Create a nested dictionary for each scientist
        scientist_info = {
            'QID': qid,
            'Name': name_hindi,  # Translated name
            'BirthDate': birth_date,
            'BirthPlace': birth_place_hindi,  # Translated birth place
            'Occupation': occupation_hindi,  # Translated occupation
            'Description': description_hindi,  # Translated description
            'Aliases': extra_details['aliases']
        }

        # Generate template sentences for the scientist
        sentences = generate_template_sentences(scientist_info)
        scientists_dict[label] = {
            'Details': scientist_info,
            'TemplateSentences': sentences
        }

    # Save the nested dictionary to a CSV file
    df = pd.DataFrame.from_dict({k: v['Details'] for k, v in scientists_dict.items()}, orient='index')
    df.to_csv("scientists_data_hindi.csv", index=False)

    # Print template sentences for each scientist
    for scientist_name, data in scientists_dict.items():
        print(f"Template sentences for {scientist_name}:")
        for sentence in data['TemplateSentences']:
            print(sentence)
        print("\n")

    print("Data saved to scientists_data_hindi.csv")

# Main function to run the data collection and preprocessing process
def main():
    print("Querying Wikidata for scientists...")
    scientist_data = get_scientist_data(limit=20)  # You can increase the limit
    process_and_save_scientist_data(scientist_data)
    print("Data collection and preprocessing complete.")

if __name__ == "__main__":
    main()


Querying Wikidata for scientists...
Template sentences for Thomas Sprat:
थॉमस फ्लोर का जन्म 1635-01-01T00:00:00Z को बीमिन्स्टर में हुआ था।
थॉमस फ्लोर एक प्रसिद्ध Unknown थे।
थॉमस फ्लोर के बारे में कहा जाता है: अंग्रेजी चर्चमैन और लेखक (1635-1713).
थॉमस फ्लोर का जन्म 1635-01-01T00:00:00Z को बीमिन्स्टर में हुआ था। थॉमस फ्लोर एक प्रसिद्ध Unknown थे। थॉमस फ्लोर के बारे में कहा जाता है: अंग्रेजी चर्चमैन और लेखक (1635-1713).


Template sentences for Mary Somerville:
मैरी सोमरविले का जन्म 1780-12-26T00:00:00Z को जेडबर्ग में हुआ था।
मैरी सोमरविले एक प्रसिद्ध Unknown थे।
मैरी सोमरविले के बारे में कहा जाता है: ब्रिटिश विज्ञान लेखक और पॉलीमथ (1780-1872).
मैरी सोमरविले का जन्म 1780-12-26T00:00:00Z को जेडबर्ग में हुआ था। मैरी सोमरविले एक प्रसिद्ध Unknown थे। मैरी सोमरविले के बारे में कहा जाता है: ब्रिटिश विज्ञान लेखक और पॉलीमथ (1780-1872).


Template sentences for Virginia Henderson:
वर्जीनिया हेंडरसन का जन्म 1897-11-30T00:00:00Z को कैनसस सिटी में हुआ था।
वर्जीनिया हेंडरसन एक प्रसिद्ध Unknown थे।
व

In [5]:
import requests
from qwikidata.entity import WikidataItem
from qwikidata.linked_data_interface import get_entity_dict_from_api
import pandas as pd
from googletrans import Translator

# SPARQL endpoint for Wikidata Query Service
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
translator = Translator()  # Initialize the Google Translate API

# List of possible occupations (you can expand this list)
OCCUPATION_KEYWORDS = ["physicist", "chemist", "mathematician", "biologist", "engineer", "astronomer", "scientist"]

# Function to execute a SPARQL query
def execute_sparql_query(query):
    headers = {
        "User-Agent": "WikiDataQueryBot/0.1 (test@example.org)"
    }
    response = requests.get(WIKIDATA_SPARQL_URL, params={'query': query, 'format': 'json'}, headers=headers)
    return response.json()

# Function to extract scientist data
def get_scientist_data(limit=10):
    query = f"""
    SELECT ?scientist ?scientistLabel ?birthDate ?birthPlaceLabel WHERE {{
      ?scientist wdt:P31 wd:Q5 ;  # Entity must be a human (Q5)
                wdt:P106 wd:Q901 ;  # Must have the occupation of scientist (Q901)
                wdt:P569 ?birthDate .
      OPTIONAL {{ ?scientist wdt:P19 ?birthPlace. }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }} LIMIT {limit}
    """

    data = execute_sparql_query(query)
    return data['results']['bindings']

# Function to fetch additional details from Wikidata API
# Function to fetch additional details from Wikidata API
def get_entity_details(qid):
    entity_dict = get_entity_dict_from_api(qid)
    item = WikidataItem(entity_dict)

    # Access claims directly from the entity_dict
    claims = entity_dict.get('claims', {})

    return {
        'description': item.get_description('en'),
        'aliases': item.get_aliases('en'),
        'label': item.get_label('en'),
        'awards': claims.get('P166', []),  # Awards (P166)
        'degrees': claims.get('P512', []),  # Academic degree (P512)
        'doctoral_advisors': claims.get('P184', []),  # Doctoral advisor (P184)
        'doctoral_students': claims.get('P185', []),  # Doctoral student (P185)
        'gender': claims.get('P21', [])  # Gender (P21)
    }


# Function to extract occupation from the description
def extract_occupation_from_description(description):
    description_lower = description.lower()  # Convert description to lowercase for comparison
    for keyword in OCCUPATION_KEYWORDS:
        if keyword in description_lower:
            return keyword.capitalize()  # Return the matched occupation
    return "Unknown"

# Function to translate text to Hindi using Google Translate
def translate_to_hindi(text):
    if text and text != "Unknown":
        translation = translator.translate(text, dest='hi')  # Translate to Hindi
        return translation.text
    return text

# Function to determine gender and return appropriate endings
def determine_gender_and_endings(gender_claims):
    if gender_claims:
        # Gender QIDs for Male and Female
        male_qid = "Q6581097"  # Male
        female_qid = "Q6581072"  # Female
        gender_id = gender_claims[0]['mainsnak']['datavalue']['value']['id']

        if gender_id == male_qid:
            return 'थे', 'है'  # Past tense and present tense for males
        elif gender_id == female_qid:
            return 'थी', 'है'  # Past tense and present tense for females
    return 'थे', 'है'  # Default to male endings

# Function to generate template sentences for each scientist
def generate_template_sentences(scientist_info, gender_endings):
    # Template sentence generation
    template_sentences = []

    # Generate sentences based on the scientist info
    past_tense, present_tense = gender_endings

    template_sentences.append(f"{scientist_info['Name']} का जन्म {scientist_info['BirthDate']} को {scientist_info['BirthPlace']} में हुआ {past_tense}।")
    template_sentences.append(f"{scientist_info['Name']} एक प्रसिद्ध {scientist_info['Occupation']} {present_tense}।")
    template_sentences.append(f"{scientist_info['Name']} के बारे में कहा जाता है: {scientist_info['Description']}.")

    # Combine sentences to make a coherent narrative
    combined_sentence = (
        f"{scientist_info['Name']} का जन्म {scientist_info['BirthDate']} को {scientist_info['BirthPlace']} में हुआ {past_tense}। "
        f"{scientist_info['Name']} एक प्रसिद्ध {scientist_info['Occupation']} {present_tense}। "
        f"{scientist_info['Name']} के बारे में कहा जाता है: {scientist_info['Description']}."
    )

    # Add combined sentence to the list
    template_sentences.append(combined_sentence)

    return template_sentences

# Process and save data
def process_and_save_scientist_data(scientist_data):
    # Initialize a dictionary to store scientist details
    scientists_dict = {}

    for scientist in scientist_data:
        qid = scientist['scientist']['value'].split('/')[-1]
        label = scientist['scientistLabel']['value']
        birth_date = scientist.get('birthDate', {}).get('value', 'Unknown')
        birth_place = scientist.get('birthPlaceLabel', {}).get('value', 'Unknown')

        # Get additional data from Wikidata API
        extra_details = get_entity_details(qid)

        # Extract occupation from the description
        description = extra_details['description']
        occupation_from_description = extract_occupation_from_description(description)

        # Translate values to Hindi
        name_hindi = translate_to_hindi(label)
        birth_place_hindi = translate_to_hindi(birth_place)
        occupation_hindi = translate_to_hindi(occupation_from_description)
        description_hindi = translate_to_hindi(description)

        # Determine gender and get appropriate sentence endings
        gender_endings = determine_gender_and_endings(extra_details['gender'])

        # Create a nested dictionary for each scientist
        scientist_info = {
            'QID': qid,
            'Name': name_hindi,  # Translated name
            'BirthDate': birth_date,
            'BirthPlace': birth_place_hindi,  # Translated birth place
            'Occupation': occupation_hindi,  # Translated occupation
            'Description': description_hindi,  # Translated description
            'Aliases': extra_details['aliases'],
            'Awards': extra_details['awards'],  # Add awards to the scientist info
            'Degrees': extra_details['degrees'],  # Add degrees to the scientist info
            'Doctoral Advisors': extra_details['doctoral_advisors'],  # Add doctoral advisors
            'Doctoral Students': extra_details['doctoral_students']  # Add doctoral students
        }

        # Generate template sentences for the scientist
        sentences = generate_template_sentences(scientist_info, gender_endings)
        scientists_dict[label] = {
            'Details': scientist_info,
            'TemplateSentences': sentences
        }

    # Save the nested dictionary to a CSV file
    df = pd.DataFrame.from_dict({k: v['Details'] for k, v in scientists_dict.items()}, orient='index')
    df.to_csv("scientists_data_hindi.csv", index=False)

    # Print template sentences for each scientist
    for scientist_name, data in scientists_dict.items():
        print(f"Template sentences for {scientist_name}:")
        for sentence in data['TemplateSentences']:
            print(sentence)
        print("\n")

    print("Data saved to scientists_data_hindi.csv")

# Main function to run the data collection and preprocessing process
def main():
    print("Querying Wikidata for scientists...")
    scientist_data = get_scientist_data(limit=20)  # You can increase the limit
    process_and_save_scientist_data(scientist_data)
    print("Data collection and preprocessing complete.")

if __name__ == "__main__":
    main()


Querying Wikidata for scientists...
Template sentences for Thomas Sprat:
थॉमस फ्लोर का जन्म 1635-01-01T00:00:00Z को बीमिन्स्टर में हुआ थे।
थॉमस फ्लोर एक प्रसिद्ध Unknown है।
थॉमस फ्लोर के बारे में कहा जाता है: अंग्रेजी चर्चमैन और लेखक (1635-1713).
थॉमस फ्लोर का जन्म 1635-01-01T00:00:00Z को बीमिन्स्टर में हुआ थे। थॉमस फ्लोर एक प्रसिद्ध Unknown है। थॉमस फ्लोर के बारे में कहा जाता है: अंग्रेजी चर्चमैन और लेखक (1635-1713).


Template sentences for Mary Somerville:
मैरी सोमरविले का जन्म 1780-12-26T00:00:00Z को जेडबर्ग में हुआ थी।
मैरी सोमरविले एक प्रसिद्ध Unknown है।
मैरी सोमरविले के बारे में कहा जाता है: ब्रिटिश विज्ञान लेखक और पॉलीमथ (1780-1872).
मैरी सोमरविले का जन्म 1780-12-26T00:00:00Z को जेडबर्ग में हुआ थी। मैरी सोमरविले एक प्रसिद्ध Unknown है। मैरी सोमरविले के बारे में कहा जाता है: ब्रिटिश विज्ञान लेखक और पॉलीमथ (1780-1872).


Template sentences for Virginia Henderson:
वर्जीनिया हेंडरसन का जन्म 1897-11-30T00:00:00Z को कैनसस सिटी में हुआ थी।
वर्जीनिया हेंडरसन एक प्रसिद्ध Unknown है।
व