## Scrape WFB country links from saved HTML and geolocate using coco

In [7]:
import pandas as pd
import country_converter as coco
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import time

# Set up a headless Chrome driver (no browser window is shown)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Saved HTML - ran into issues when trying to scrape directly from the CIA website
url = "C:\\Users\\samco\\OneDrive\\Desktop\\OSINT\\saved_web_pages\\CIA_wfb.html"

wfb_country_links = []  # Stores each link

# Always shut the browser down, even if loading or scraping raises, so no
# orphaned headless Chrome / chromedriver process is left running.
try:
    driver.get(url)
    time.sleep(2)  # Give time for the page to load

    content = driver.find_elements(By.TAG_NAME, "a")
    for a in content:
        href = a.get_attribute("href")
        # Anchors without an href attribute yield None; skip them instead of
        # crashing on None.replace(...)
        if href is None:
            continue
        # Replace "file:///C:/" with the CIA url to make the "href" url valid and clickable
        wfb_country_links.append(href.replace("file:///C:/", "https://www.cia.gov/"))
finally:
    driver.quit()

#Create DataFrame using pandas (single column, labelled 0, holding the full link)
wfb_df = pd.DataFrame(wfb_country_links)
## wfb_df.to_csv("C:\\Users\\samco\\OneDrive\\Desktop\\OSINT\\wfb_country_links.csv", index=False) ## Optional: export plain country links to CSV

#Split each link on "/" into one column per path segment
wfb_df_coded = wfb_df[0].str.split("/", expand=True)
#Segment 5 is the country slug taken from the split link
wfb_df_country = wfb_df_coded[5]

cc = coco.CountryConverter()
#Match each slug against coco and store the matched short name
#(coco's pandas_convert fills unmatched entries with "not found" - this is what is filtered on below)
country_names = cc.pandas_convert(wfb_df_country, to='short_name')
wfb_df_coded['country'] = country_names

#Get ISO3 code with coco based off of the matched country
iso3_codes = cc.pandas_convert(wfb_df_coded['country'], to='ISO3')
wfb_df_coded['ISO3_CODE'] = iso3_codes

#Assemble the output from exactly the columns that are needed.
#Picking columns by name (instead of merging and dropping a hard-coded list of
#segment positions) keeps stray path-segment columns out of the result when a
#link has more "/"-separated parts than expected.
wfb_coded = pd.DataFrame({
    "wfb_country_link": wfb_df[0],
    "wfb_country_name": wfb_df_country,
    "country": wfb_df_coded['country'],
    "ISO3_CODE": wfb_df_coded['ISO3_CODE'],
})

#Create two separate dfs...one with the links coco could not code and one with the coded links
is_not_found = wfb_coded['country'] == "not found"
wfb_code_not_found = wfb_coded[is_not_found].drop_duplicates().reset_index(drop=True)
wfb_coded = wfb_coded[~is_not_found].drop_duplicates().reset_index(drop=True)

# Dataset of locations not found thru coco (i.e., places that are not official countries)
wfb_code_not_found.to_csv("C:\\Users\\samco\\OneDrive\\Desktop\\OSINT\\Complete_datasets\\wfb_countries_no_code.csv")

# Dataset of geolocated World Fact Book links
wfb_coded.to_csv("C:\\Users\\samco\\OneDrive\\Desktop\\OSINT\\Complete_datasets\\wfb_countries_coded.csv")

# Ignore the "... not found in regex" warnings below: coco prints one for every name that does not match any of its country regexes, which is expected here

akrotiri-and-dhekelia not found in regex
ashmore-and-cartier-islands not found in regex
cabo-verde not found in regex
clipperton-island not found in regex
congo-republic-of-the not found in regex
coral-sea-islands not found in regex
european-union not found in regex
jan-mayen not found in regex
navassa-island not found in regex
paracel-islands not found in regex
spratly-islands not found in regex
virgin-islands not found in regex
wake-island not found in regex
world not found in regex
not found not found in regex
