# Installing the module 

In [2]:
pip install wikipedia-api

Note: you may need to restart the kernel to use updated packages.


In [3]:
import wikipediaapi
import re
import json
import requests
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

**Wikipedia strongly encourages and, in some cases, requires a user agent, especially for higher usage. This helps Wikipedia track and manage resources effectively, allowing them to reach out if your script is causing issues or excessive traffic.**

In [4]:
# Initialize the Wikipedia API with a custom user agent
user_agent = "MyCountryInfoScript/1.0 (https://mywebsite.com; myemail@gmail.com)"
wiki_wiki = wikipediaapi.Wikipedia(language='en',user_agent = user_agent)

In [5]:
# Initialize the Wikipedia API with a custom user agent
user_agent = "MyCountryInfoScript/1.0 (https://mywebsite.com; myemail@example.com)"
wiki_wiki = wikipediaapi.Wikipedia(language='en', user_agent= user_agent)

def fetch_infobox_content(country_name):
    # Access the Wikipedia page for the country
    page = wiki_wiki.page(country_name)

    # Check if the page exists
    if not page.exists():
        print(f"The page for {country_name} does not exist.")
        return None
    
    # Get the HTML of the page
    url = page.fullurl
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the infobox table (it usually has class "infobox")
    infobox = soup.find('table', {'class': 'infobox'})
    if not infobox:
        print("Infobox not found.")
        return None

    # Extract plain text from each row in the infobox
    infobox_text = ""
    for row in infobox.find_all(['tr']):
        # Get each cell in the row
        cells = row.find_all(['th', 'td'])
        
        # Append each cell's text to the infobox_text string
        row_text = " ".join(cell.get_text(" ", strip=True) for cell in cells)
        infobox_text += row_text + "\n"
    
    return infobox_text

In [6]:
import unicodedata

def remove_accents(text):
    # Normalize the text to separate accents from letters
    normalized_text = unicodedata.normalize('NFD', text)
    # Filter out the accents and keep only ASCII characters
    ascii_text = ''.join(char for char in normalized_text if unicodedata.category(char) != 'Mn')
    return ascii_text

# Fetching the content of all coutries 

# Australia

In [7]:
aus = fetch_infobox_content("Australia")

In [8]:
nz = fetch_infobox_content("New Zealand")

In [9]:
png = fetch_infobox_content("Papua New Guinea")

In [10]:
nauru = fetch_infobox_content("Nauru")

In [11]:
palau = fetch_infobox_content("Palau")

In [12]:
solomon = fetch_infobox_content("Solomon Islands")

In [13]:
fiji = fetch_infobox_content("Fiji")

In [14]:
tuvalu = fetch_infobox_content("Tuvalu")

In [15]:
tonga = fetch_infobox_content("Tonga")

In [16]:
samoa = fetch_infobox_content("Samoa")

In [17]:
kiribati = fetch_infobox_content("Kiribati")

In [18]:
ml= fetch_infobox_content("Marshall Islands")

In [19]:
vanuatu = fetch_infobox_content("Vanuatu")

# Regular Expression 

**Using Regular Expression to extract all the key information needed from the wikipedia page.**

In [124]:
def fetch_area(country):
    area = re.findall("Total\s([0-9,]+)",country)
    water = re.findall("Water\s\(\%\)\s([0-9.]+)",country)
    return ''.join(area), ''.join(water)

In [125]:
def fetch_capital(country):
    if country.startswith("New Zealand"):
        cap = re.findall("Capital\s([a-zA-Z]+(?:\s[A-Za-z]+){0,3})",country)
        city = re.findall("Largest\scity\s([A-Za-z]+(?:\s[a-zA-Z]+){0,2})(?=\n)",country)
        return ''.join(cap) ,''.join(city)
    else:
        city = re.findall("Capital\sand\slargest\scity\s([a-zA-Z]+(?:\s[a-zA-Z]+){0,3})",country)
        return ''.join(city) ,''.join(city)

In [127]:
def fetch_lang(country):
    if country.startswith("New Zealand") :
        lang = re.findall("Official\slanguages\s([a-zA-z]+(?:\s[a-zA-Z]+){0,4})",country)
    elif country.startswith("Commonwealth of Australia"):
        lang = re.findall("Official\slanguage\sand\snational\slanguage\s([A-za-z]+)",country)
    else:
        lang = re.findall("Official\slanguages\s([a-zA-z]+(?:\s[a-zA-Z]+){0,4})(?=\n)",country)
    return ''.join(lang)

In [23]:
def fetch_cur(country):
    cur = re.findall("Currency\s([A-Za-z.ʻā]+(?:\s[A-Za-z.]+){0,3})",country)
    return ''.join(cur)

In [24]:
#accurate
def fetch_info(country):
    if country.startswith("Netherlands"):
        anthem = re.findall("Anthem\s\:\s([a-zA-z]+)",country)
    elif country.startswith("Tuvalu"):
        anthem = re.findall("Anthem\:\s([a-zA-z]+(?:\s[A-Za-z]+){0,3})",country)
    else:
        anthem = re.findall("Anthem:\s.*?\"([^\"]+)\"",country)
    flag = re.findall("Flag\s([A-Za-z]+(?:\s[A-Za-z]+){0,3})(?=\n)",country)
    if country.startswith("Solomon Islands") or  country.startswith("Republic of Kiribati") :
        demonym = re.findall("Demonym\(s\)\s([A-Za-z-]+(?:\s[A-Za-z]+){0,3})(?=\n)",country)
    elif country.startswith("New Zealand"):
        demonym = re.findall("Demonym\(s\)\s([A-Za-z-]+(?:\s[a-zA-Z]+){0,3})",country)
    else:
        demonym = re.findall("Demonym\(s\)\s([A-Za-z-]+)",country)
    
    return anthem,flag,demonym

**Function to fetch each and every detail for specific country**

In [25]:
def fetch_all_details(country):
    a = fetch_area(country)
    b = fetch_capital(country)
    c = fetch_lang(country)
    d = fetch_cur(country)
    e = fetch_info(country)
    land=''
    total_area= a[0]
    water = a[1]
    if len(a)==3:
        land = a[2]
    capital = b[0] or "No official capital"
    largest = b[1] 
    lang = c or "English"
    currency = d
    anthem = e[0]
    flag = e[1]
    demonym = e[2]   
    
    all_info = {
        key: value for key, value in {
            "Country": country.splitlines()[0],
            "Capital": capital,
            "Largest City": largest,
            "Official Language": lang,
            "Currency": currency,
            "National Anthem": ''.join(anthem),
            "National Flag": ''.join(flag),
            "Demonym": ''.join(demonym),
            "Total Area": total_area,
            "Water (%)": water,
            "Total Land Area":land
        }.items() if value
    }
    return all_info

# Australia

In [48]:
country_aus = fetch_all_details(aus)

In [49]:
country_nz = fetch_all_details(nz)

In [53]:
country_png = fetch_all_details(png)

In [50]:
country_nauru = fetch_all_details(nauru)

In [51]:
country_palau = fetch_all_details(palau)

In [52]:
country_solomon = fetch_all_details(solomon)

In [54]:
country_fiji = fetch_all_details(fiji)

In [55]:
country_tuvalu = fetch_all_details(tuvalu)

In [57]:
country_tonga = fetch_all_details(tonga)

In [56]:
country_samoa = fetch_all_details(samoa)

In [60]:
country_kiribati = fetch_all_details(kiribati)

In [59]:
country_ml  =fetch_all_details(ml)

In [58]:
country_vanuatu  =fetch_all_details(vanuatu)

In [39]:
australasia_info = {
    "Australia": country_aus,
    "New Zealand":country_nz,
    "Papua New Guinea":country_png,
    "Nauru":country_nauru,
    "Palau": country_palau,
    "Solomon Islands":country_solomon,
    "Vanuatu": country_vanuatu,
    "Fiji":country_fiji,
    "Tuvalu":country_tuvalu,
    "Tonga":country_tonga,
    "Samoa":country_samoa,
    "Kiribati":country_kiribati,
    "Marshall Islands":country_ml
}

In [128]:
australasia_info

{'Australia': {'Country': 'Commonwealth of Australia',
  'Capital': 'Canberra',
  'Largest City': 'Sydney',
  'Official Language': 'English',
  'Currency': 'Australian dollar',
  'National Anthem': ' Advance Australia Fair ',
  'National Flag': 'Coat of arms',
  'Demonym': 'Australian',
  'Total Area': '7,688,287',
  'Water (%)': '1.79'},
 'New Zealand': {'Country': 'New Zealand Aotearoa ( Māori )',
  'Capital': 'Wellington',
  'Largest City': 'Auckland',
  'Official Language': 'English',
  'Currency': 'New Zealand dollar',
  'National Flag': 'Coat of arms',
  'Demonym': 'New Zealander Kiwi',
  'Total Area': '268,021',
  'Water (%)': '1.6'},
 'Papua New Guinea': {'Country': 'Independent State of Papua New Guinea Independen Stet bilong Papua Niugini ( Tok Pisin ) Independen Stet bilong Papua Niu Gini ( Hiri Motu )',
  'Capital': 'Port Moresby',
  'Largest City': 'Port Moresby',
  'Official Language': 'English',
  'Currency': 'Kina',
  'National Anthem': ' O Arise, All You Sons ',
  'Nat

In [41]:
with open('australasia.json', 'w') as json_file:
    json.dump(australasia_info, json_file, indent=4)