In [10]:
    import requests
    from bs4 import BeautifulSoup
    import json
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from multiprocessing import Pool


    class WikiScraper:

        """allow to create an instance of a scraper that will :
        -  fetch world leaders information from an API, and notably their wikipedia URL
        -  scrap this url to retrieve the 1st paragraph of their biography
        -  load those information into a json file
        """

        def __init__(
            self,
            root_url: str,
            country_endpoint: str,
            leaders_endpoint: str,
            cookies_endpoint: str,
        ):
            self.root_url = root_url
            self.session = requests.Session()
            self.cookies_endpoint = cookies_endpoint
            self.cookie = self.get_cookie()
            self.country_endpoint = country_endpoint
            self.leaders_endpoint = leaders_endpoint
            self.country4prez_list = []
            self.name_list = []
            self.wiki_url_list = []
            self.first_paragraph_list = []
            self.soup_list = []
            
            
        def get_cookie(self):
            # allow to fetch a cookie to get access to the website & keep it active with session use
            try:
                cookie_req = self.session.get(f"{self.root_url}{self.cookies_endpoint}")
                cookie = cookie_req.cookies
                return cookie
            except Exception:
                print(f"get_cookie API issue,  code : {cookie_req.status_code}")
                return None

        def get_country_list(self):
            # allow to fetch the available countries list through the api
            try:
                country_req = self.session.get(
                    f"{self.root_url}{self.country_endpoint}", cookies=self.cookie
                )
                country_list = country_req.json()
            except Exception:
                print(f"api get_country_list issue,  code : {country_req.status_code}")
                return None
            return country_list

        def get_leaders_data(self):
            # allow to fetch all leaders info and gather them in a list of tuple
            leaders_main_data_list = []
            for country in self.get_country_list():  # looping through all countries
                params = {"country": f"{country}"}
                try:
                    leaders_req = self.session.get(  # fetching all leaders info
                        f"{self.root_url}{self.leaders_endpoint}",
                        cookies=self.cookie,
                        params=params,
                    )
                    leaders_data = leaders_req.json()
                    leaders_main_data_list.extend(  # store them in list of tuple
                        [
                            (
                                country,
                                item["first_name"] + " " + item["last_name"],
                                item["wikipedia_url"],
                            )
                            for item in leaders_data
                        ]
                    )
                except Exception:
                    print(f"api get_leaders_data issue, code : {leaders_req.status_code}")
            return leaders_main_data_list
        
        
        def scrap_url(self, url):
            url_req = self.session.get(url)
            content = url_req.text
            soup = BeautifulSoup(content, "html.parser")
            return soup


        def get_leaders_paragraph(self):
                # get 4 lists of country, name, wikipedia url and wikipedia page 1st paragraph for all leaders
                self.country4prez_list = []
                self.name_list = []
                self.wiki_url_list = []
                self.first_paragraph_list = []
                counter = 0

                for soup in self.soup_list:

                    for tag in soup.find_all(
                        "p"
                    ):  # subloop through all of the § to find the first one
                        if "<b>" in str(
                            tag
                        ):  # id of the first paragraph through the bold tag
                            first_paragraph = tag
                            counter += 1
                            self.country4prez_list.append(
                                tuple[0]
                            )  # create the country by president list
                            self.name_list.append(
                                tuple[1]
                            )  # create the name by president list
                            self.wiki_url_list.append(
                                tuple[2]
                            )  # create the wikipedia url by president list
                            self.first_paragraph_list.append(
                                first_paragraph.get_text()
                            )  # create the 1st paragraph by president list
                            print(counter)
                            print(f"Ladies and Gentlmen, Welcome to President \033[1m\033[4m{tuple[1]}\033[0m\033[0m")
                            print(f"Country : {tuple[0]}")
                            print(f"Bio extract : (Full bio on wiki_url : {tuple[2]})")
                            print(first_paragraph.get_text())
                            break
                    
                return (
                    self.country4prez_list,
                    self.name_list,
                    self.wiki_url_list,
                    self.first_paragraph_list,
                )

        def go_to_json(self):
            # store the 4 abobe list in a json file
            data_to_store = {}
            for country, name, wiki_url, first_paragraph in zip(
                self.country4prez_list,
                self.name_list,
                self.wiki_url_list,
                self.first_paragraph_list,
            ):
                if country not in data_to_store:
                    data_to_store[country] = []
                president_info = {
                    "name": name,
                    "wiki_url": wiki_url,
                    "first_paragraph": first_paragraph,
                }
                data_to_store[country].append(president_info)

            with open("leader_data.json", "w", encoding="utf-8") as json_file:
                json.dump(data_to_store, json_file, ensure_ascii=False, indent=2)


    if __name__ == "__main__":
        # to create the WikiScraper instance with API related parameters
        a = WikiScraper(
            "https://country-leaders.onrender.com", "/countries/", "/leaders/", "/cookie/"
        )

        # to get the cookie necessary to get into the website
        a.get_cookie()

        # to get the list of country from API
        a.get_country_list()
        
        # to get  all the main leaders data (country, name, wikipedia url)
        a.get_leaders_data()
        


        # scrap all url
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = list(executor.submit(a.scrap_url, tuple[2]) for tuple in a.get_leaders_data())
            
            for future in as_completed(futures):
                try:
                    a.soup_list.append(future.result())
                except Exception as e:
                    print(f"Error while processing a future: {e}")

        
        # get the leader basic bio (the 1st paragraph of wikipedia)
        a.get_leaders_paragraph()

        # save to json all those leaders info
        a.go_to_json()



1
Ladies and Gentlmen, Welcome to President [1m[4mtuple[1][0m[0m
Country : tuple[0]
Bio extract : (Full bio on wiki_url : tuple[2])
Richard Milhous Nixon (January 9, 1913 – April 22, 1994) was the 37th president of the United States, serving from 1969 to 1974. A member of the Republican Party, he previously served as a representative and senator from California and as the 36th vice president from 1953 to 1961 under President Dwight D. Eisenhower. His presidency saw the reduction of U.S. involvement in the Vietnam War, détente with the Soviet Union and China, the Apollo 11 Moon landing, and the establishment of the Environmental Protection Agency and Occupational Safety and Health Administration. Nixon's second term ended early when he became the only U.S. president to resign from office, as a result of the Watergate scandal.

2
Ladies and Gentlmen, Welcome to President [1m[4mtuple[1][0m[0m
Country : tuple[0]
Bio extract : (Full bio on wiki_url : tuple[2])
Andrew Johnson (Decemb

TypeError: keys must be str, int, float, bool or None, not GenericAlias

threads_leaders_data = [Thread(target=a.get_leaders_data) for i in range(len(a.country_list))]  
 
for thread in threads_leaders_data:
    thread.start()

for thread in threads_leaders_data:
    thread.join()
