# Wiki Extractions

In [None]:
import requests
import time
import json
from tqdm.notebook import tqdm as tqdm
import sqlite3
import pandas as pd
from bs4 import BeautifulSoup
# from langchain.llms import Ollama
# from langchain import LLMChain, PromptTemplate

# -----------------------
# 1. Extract Website Data
# -----------------------
# The CSV must provide a 'Link' column with one article URL per row.
df = pd.read_csv("E:\\My Workspaces\\Langchain\\conditions_wiki.csv")

# URLs of the Wikipedia articles; rows with a missing link are skipped because
# requests cannot fetch NaN.
urls = df['Link'].dropna().tolist()

# Holds one {"title": ..., "sections": [...]} dict per successfully scraped article.
data_list = []

# Tags that open a new section, and the full set of tags the parser looks at.
HEADING_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']
CONTENT_TAGS = HEADING_TAGS + ['p']

# Seconds to wait for a server response before giving up on one URL.
REQUEST_TIMEOUT = 30


def _clean_text(tag):
    """Return the tag's text with every run of whitespace collapsed to one space.

    get_text(strip=True) strips each text node and joins them with no
    separator, which fuses words around inline tags such as links
    ("a parasitic disease" -> "aparasitic disease"). Taking the raw text and
    normalising the whitespace afterwards keeps the word boundaries.
    """
    return " ".join(tag.get_text().split())


for url in tqdm(urls):
    # Pause between requests to be polite to the server
    time.sleep(2)

    # A timeout or connection error on one URL must not abort the whole crawl.
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to retrieve the page. Error: {exc}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # Main article title and main content container
    title_tag = soup.find('h1', {'id': 'firstHeading'})
    content_div = soup.find('div', {'class': 'mw-parser-output'})
    if title_tag is None or content_div is None:
        # Without these two elements there is nothing to extract from the page.
        print(f"Skipping {url}: article title or content container not found.")
        continue

    main_title = _clean_text(title_tag)

    # Each section is a dict with a heading and its content. Text that appears
    # before the first heading is filed under the article title.
    sections = []
    current_section = {"heading": main_title, "content": ""}

    # Walk headings and paragraphs in document order
    for element in content_div.find_all(CONTENT_TAGS):
        if element.name in HEADING_TAGS:
            # Save the previous section before starting a new one
            if current_section["heading"] or current_section["content"]:
                sections.append(current_section)
            heading_text = element.get_text(separator=" ", strip=True)
            current_section = {"heading": heading_text, "content": ""}
            continue

        paragraph_text = _clean_text(element)
        if not paragraph_text:
            continue
        # Paragraphs of one section are separated by a newline
        if current_section["content"]:
            current_section["content"] += "\n" + paragraph_text
        else:
            current_section["content"] = paragraph_text

    # Append the final section
    if current_section["heading"] or current_section["content"]:
        sections.append(current_section)

    # Combine the extracted data into a dictionary
    extracted_data = {
        "title": main_title,
        "sections": sections
    }

    # Progress trace: one line per scraped article
    print(main_title)
    data_list.append(extracted_data)

    # ----------------------------------------------------------------
    # (Optional) 2. Cleaning Using an LLM (Commented Out)
    # ----------------------------------------------------------------
    # If you wish to perform cleaning on each section using the Ollama LLM,
    # uncomment and adjust the code below.
    #
    # section_prompt_template = PromptTemplate(
    #     input_variables=["section_data"],
    #     template="""
    # You are given raw section data in JSON format. Please perform simple cleaning and text corrections only.
    # Do not add any extra information or hallucinate details.
    # Your task is to fix typos, remove extra spaces, and ensure the text is grammatically correct while preserving the original content.
    # If multiple related headings can be combined as a nested section, do so, but do not lose any content.
    #
    # Return only the cleaned JSON in the following format (with literal braces escaped):
    # {{
    #   "heading": "<cleaned section heading>",
    #   "content": "<cleaned section content>"
    # }}
    #
    # Here is the raw section data:
    # {section_data}
    #
    # ONLY RETURN THE CLEANED JSON.
    # """
    # )
    #
    # llm = Ollama(model="phi4",  temperature=0)
    # section_chain = LLMChain(llm=llm, prompt=section_prompt_template)
    #
    # cleaned_sections = []
    # for section in extracted_data["sections"]:
    #     section_json = json.dumps(section, indent=2)
    #     try:
    #         cleaned_section_response = section_chain.run(section_data=section_json)
    #         if not cleaned_section_response.strip():
    #             cleaned_sections.append(section)
    #             continue
    #         try:
    #             cleaned_section = json.loads(cleaned_section_response)
    #             cleaned_sections.append(cleaned_section)
    #         except Exception as e:
    #             cleaned_sections.append(section)
    #     except Exception as e:
    #         cleaned_sections.append(section)
    #
    # # Combine the cleaned sections with the title
    # cleaned_data = {
    #     "title": main_title,
    #     "sections": cleaned_sections
    # }
    # print("\nCleaned Data from Ollama (per section):")
    # print(json.dumps(cleaned_data, indent=2))
    # # If using cleaned data, consider saving cleaned_data to data_list instead of extracted_data.
    # # data_list.append(cleaned_data)

# -----------------------
# 3. Create a Database and Save the Data
# -----------------------

# Create (or open) the SQLite database. try/finally guarantees the handle is
# released even when table creation or an insert fails.
conn = sqlite3.connect('wikipedia_articles_base.db')
try:
    # `with conn` runs the statements as one transaction: commit on success,
    # rollback on error, so a failure never leaves a partial set of articles.
    with conn:
        # The table stores the title and the sections (as a JSON string).
        conn.execute('''
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        sections TEXT
    )
''')

        # One row per scraped article; the sections list is serialised to JSON.
        # NOTE: rows are appended on every run, so re-running this cell against
        # an existing database file duplicates the articles.
        conn.executemany(
            'INSERT INTO articles (title, sections) VALUES (?, ?)',
            (
                (article['title'], json.dumps(article['sections']))
                for article in data_list
            ),
        )
finally:
    conn.close()

print("Database created and data inserted successfully.")


  0%|          | 0/70 [00:00<?, ?it/s]

Echinococcosis
Amoebiasis
Anthrax
Ascariasis
Bartonellosis
Schistosomiasis
Brucellosis
Burkholderia pseudomallei
Buruli ulcer
Chikungunya
Cholera
Coccidioidomycosis
Crimean–Congo hemorrhagic fever
Cryptosporidiosis
Cysticercosis
Dengue fever
Dermatophytosis
Diphtheria
Dracunculiasis
Ebola
Filariasis
Meningitis
Giardiasis
Dracunculiasis
Orthohantavirus
Helminthiasis
Hendra virus
Hepatitis
HIV/AIDS
Hookworm
Japanese encephalitis
Kaposi's sarcoma-associated herpesvirus
Lassa fever
Leishmaniasis
Leprosy
Leptospirosis
Loa loafilariasis
Lymphatic filariasis
Lymphogranuloma venereum
Malaria
Marburg virus
Measles
Melioidosis
Meningococcal disease
B virus
Mpox
Mycetoma
Norovirus
Onchocerciasis
Plague (disease)
Polio
Q fever
Rabies
Rift Valley fever
Rotavirus
Schistosomiasis
Scrub typhus
Strongyloidiasis
Syphilis
Tetanus
Toxoplasmosis
Trachoma
Tropical sprue
Trypanosomiasis
Tuberculosis
Typhoid fever
West Nile virus
Yaws
Yellow fever
Zika virus
Database created and data inserted successfully.


In [None]:
import sqlite3
import pandas as pd
import json

# Connect to the database; try/finally closes the connection even if the
# query fails (e.g. the table does not exist yet).
conn = sqlite3.connect('wikipedia_articles_base.db')
try:
    # Read the entire articles table into a DataFrame
    df = pd.read_sql_query("SELECT * FROM articles", conn)
finally:
    conn.close()

# The 'sections' column is stored as a JSON string; decode it back into a
# list of {"heading": ..., "content": ...} dicts. This needs no open connection.
df['sections'] = df['sections'].apply(json.loads)

# Display the first rows of the DataFrame
print(df.head())


   id           title                                           sections
0   1  Echinococcosis  [{'heading': 'Echinococcosis', 'content': 'Ech...
1   2      Amoebiasis  [{'heading': 'Amoebiasis', 'content': 'Amoebia...
2   3         Anthrax  [{'heading': 'Anthrax', 'content': 'Anthraxis ...
3   4      Ascariasis  [{'heading': 'Ascariasis', 'content': 'Ascaria...
4   5   Bartonellosis  [{'heading': 'Bartonellosis', 'content': 'Bart...


In [None]:
import sqlite3
import json
from langchain.llms import Ollama
from langchain import LLMChain, PromptTemplate

# -------------------------------------------
# 1. Set Up the LLM Tagging Chain
# -------------------------------------------
# Few-shot prompt: the model must answer with exactly one of the five tags,
# wrapped in double quotes. {section_content} is filled in per section.
TAGGING_PROMPT_TEXT = """
You are an advanced named entity recognition system. Given the following section content (provided as plain text), identify which of the following tags is applicable: "treatment", "prevention", "diagnosis", "cause", or "drop".

Use the "drop" tag if the content doesn't fit into any of the other four categories.
Each section must have exactly **one** tag. Do not assign multiple tags.

Return the result **strictly** as a single-word response in double quotes, like `"cause"`.

Examples:

1.
Text:
An adult worm resides in the small intestine of a definitive host. A single gravid proglottid releases eggs that are passed in the feces of the definitive host...
Tag:
"cause"

2.
Text:
The most common form found in humans is cystic echinococcosis...
Tag:
"diagnosis"

3.
Text:
Several different strategies are currently being used to prevent and control cystic echinococcosis (CE)...
Tag:
"prevention"

4.
Text:
A number of therapy options are presently available. Treatment with albendazole...
Tag:
"treatment"

Now, classify the following text:

Section content:
{section_content}
"""

tagging_prompt_template = PromptTemplate(
    template=TAGGING_PROMPT_TEXT,
    input_variables=["section_content"],
)

# Local Ollama model used as the tagger (adjust model and parameters as needed)
llm_tagger = Ollama(temperature=0.1, model="llama3.2:3b")

# Chain that renders the prompt and sends it to the tagger
tag_chain = LLMChain(prompt=tagging_prompt_template, llm=llm_tagger)

# Read the stored articles straight from the database, so this cell does not
# depend on a `df` left over from an earlier cell. The connection is only
# needed for this one query and is always closed afterwards.
conn = sqlite3.connect('wikipedia_articles_base.db')
try:
    rows = conn.execute(
        "SELECT id, title, sections FROM articles ORDER BY id"
    ).fetchall()
finally:
    conn.close()

# One flat record per (article, section) pair, in article/section order.
records = []

# Tag every section of every article
for _article_id, title, sections_json in rows:
    # 'sections' is stored as a JSON list of {"heading": ..., "content": ...}
    for section in json.loads(sections_json):
        heading = section.get("heading", "")
        content = section.get("content", "")

        # The heading is sent along with the content as extra context for the tagger.
        raw_tags = tag_chain.invoke({"section_content": heading + "\n" + content})

        records.append({
            "disease_name": title,             # from the article title
            "section": heading,                # section heading
            "section_tag": raw_tags['text'],   # tag exactly as returned by the LLM (quotes included)
            "content": content,                # original, unmodified section text
        })

# Results live only in `records`; nothing is written back to the database here.
print("Tagging complete.")


  llm_tagger = Ollama(model="llama3.2:3b", temperature=0.1)
  tag_chain = LLMChain(llm=llm_tagger, prompt=tagging_prompt_template)


Tagging complete and database updated.


In [None]:
# -------------------------------------------
# 3. Build the per-section DataFrame from the tagged records
# -------------------------------------------
# Every record dict contributes one row with the columns
# ['disease_name', 'section', 'section_tag', 'content'].
df_sections = pd.DataFrame(data=records)

# Preview the first five rows
preview = df_sections.head(n=5)
print(preview)

     disease_name             section   section_tag  \
0  Echinococcosis      Echinococcosis  "prevention"   
1  Echinococcosis  Signs and symptoms   "diagnosis"   
2  Echinococcosis               Cause       "cause"   
3  Echinococcosis               Hosts       "cause"   
4  Echinococcosis          Life cycle       "cause"   

                                             content  
0  Echinococcosisis aparasitic diseasecaused byta...  
1  In the human manifestation of the disease,E. g...  
2  Like many other parasite infections, the cours...  
3                                                     
4  An adult worm resides in the small intestine o...  


In [None]:
# Bare expression: the notebook renders the tagged-sections DataFrame as the cell output.
df_sections

Unnamed: 0,disease_name,section,section_tag,content
0,Echinococcosis,Echinococcosis,"""prevention""",Echinococcosisis aparasitic diseasecaused byta...
1,Echinococcosis,Signs and symptoms,"""diagnosis""","In the human manifestation of the disease,E. g..."
2,Echinococcosis,Cause,"""cause""","Like many other parasite infections, the cours..."
3,Echinococcosis,Hosts,"""cause""",
4,Echinococcosis,Life cycle,"""cause""",An adult worm resides in the small intestine o...
...,...,...,...,...
1125,Zika virus,"India, Bangladesh","""prevention""","On 22 March 2016, Reuters reported that Zika w..."
1126,Zika virus,East Asia,"""prevention""","Between August and November 2016, 455 cases of..."
1127,Zika virus,See also,"""drop""",
1128,Zika virus,References,"""drop""",This article incorporatespublic domain materia...


In [None]:
# Persist the tagged sections next to the notebook. The DataFrame index is
# written as the first (unnamed) CSV column.
output_csv_path = "disease_wiki.csv"
df_sections.to_csv(output_csv_path)