In [39]:
import wikipedia
import os
import unicodedata
import re
#!pip install mediawikiapi
# from mediawikiapi import MediaWikiAPI
# wikipedia = MediaWikiAPI()

In [44]:
def slugify(value, allow_unicode=False):
    """
    Turn an arbitrary value into a slug usable in file names and URLs.

    Adapted from https://github.com/django/django/blob/master/django/utils/text.py

    Steps, in order:
      1. Coerce ``value`` to ``str``.
      2. Normalise it: NFKC when ``allow_unicode`` is true, otherwise NFKD
         followed by dropping every character that cannot be encoded as ASCII.
      3. Lowercase and remove everything that is not a word character,
         whitespace, or a hyphen.
      4. Collapse each run of whitespace/hyphens into a single hyphen.
      5. Strip leading and trailing hyphens and underscores.

    Args:
        value: object to slugify (converted with ``str``).
        allow_unicode (bool): keep non-ASCII word characters when True.
    Return:
        str: the slug (may be empty).
    """
    text = str(value)
    if not allow_unicode:
        # Decompose accents, then discard whatever is still non-ASCII.
        ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
        text = ascii_bytes.decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    lowered = text.lower()
    cleaned = re.sub(r'[^\w\s-]', '', lowered)
    dashed = re.sub(r'[-\s]+', '-', cleaned)
    return dashed.strip('-_')


class WikiSearch:
    """Crawl Wikipedia pages and save their HTML and references to disk.

    ``already_acquired`` holds every search string and page title for which
    ``get_content`` has succeeded, so ``acquire_information`` can avoid
    downloading the same page again during one crawl.
    """

    def __init__(self):
        # Search strings / titles that have already been fetched successfully.
        self.already_acquired = set()

    @staticmethod
    def _empty_payload(name, **flags):
        """Build a content payload with no data for ``name``; ``flags`` become extra keys."""
        payload = {
            "references": [],
            "links": [],
            "raw_html": "",
            "see_also": [],
            "name": name,
        }
        payload.update(flags)
        return payload

    @staticmethod
    def _fetch_page(search_criteria, name):
        """Load the page for ``search_criteria``, falling back to ``name``; None if both fail."""
        # A missing page is handled like an ambiguous one so that a single
        # bad title does not abort the whole crawl.
        lookup_errors = (
            wikipedia.exceptions.DisambiguationError,
            wikipedia.exceptions.PageError,
        )
        try:
            return wikipedia.page(search_criteria, auto_suggest=False)
        except lookup_errors:
            print("tried for {}, did not work so trying {}".format(search_criteria, name))
        try:
            return wikipedia.page(name, auto_suggest=False)
        except lookup_errors:
            print("did not work for {}".format(name))
            return None

    @staticmethod
    def _extract_see_also(page_text):
        """Return the non-empty lines between the first '== See' heading and the next heading."""
        lines = page_text.split("\n")
        heading_idx = next(
            (idx for idx, line in enumerate(lines) if "== See" in line),
            None,
        )
        # Compare against None explicitly: index 0 is a valid heading position.
        if heading_idx is None:
            return []

        entries = []
        for line in lines[heading_idx + 1:]:
            if not line:
                continue
            if "==" in line:
                # Next section heading: the "See also" list is over.
                break
            entries.append(line)
        return entries

    def get_content(self, search_criteria, name):
        """Get content based on specific search criteria.

        Args:
            search_criteria (str): search string from wikipedia suggestion
            name (str): search string based on config
        Return:
            content (dict | None): payload containing content from the API
                call, or None when neither ``search_criteria`` nor ``name``
                resolves to a single page:
                {
                    links: list,
                    references: list,
                    raw_html: str,
                    see_also: list,
                    name: str
                }
        """
        results = self._fetch_page(search_criteria, name)
        if results is None:
            return None

        output = {
            "see_also": self._extract_see_also(results.content),
            "name": name,
            "links": results.links,
            "raw_html": results.html(),
        }
        try:  # Weird issue with the wrapper
            output["references"] = results.references
        except KeyError:
            output["references"] = []

        self.already_acquired.add(search_criteria)
        self.already_acquired.add(name)

        return output

    def acquire_information(self, search_criteria):
        """Extract information from the API.
        Entry point into main acquisition.

        Args:
            search_criteria (str): name from config
        Return:
            content (dict): content from get_content(), or an empty payload
                carrying ``no_data`` / ``already_acquired`` flags when the
                page was skipped or could not be fetched.
        """
        print("Acquiring information for {}".format(search_criteria))
        if "(disambiguation)" in search_criteria:
            print("ignoring: {}".format(search_criteria))
            return self._empty_payload(search_criteria, no_data=True)

        found = wikipedia.search(search_criteria, results=1, suggestion=True)
        # The search result is either a (titles, suggestion) pair or a plain
        # list of titles; take the first title from whichever shape came back.
        first = found[0] if found else None
        if isinstance(first, list):
            first = first[0] if first else None
        # No hit at all: fall back to the raw search string instead of
        # raising IndexError.
        check = first if first else search_criteria

        # NOTE(review): a page is skipped only when BOTH the resolved title and
        # the search string were seen before (original behaviour, kept as-is).
        # Confirm whether skipping when EITHER was seen is what is intended.
        if check in self.already_acquired and search_criteria in self.already_acquired:
            print("Already Acquired {}".format(check))
            return self._empty_payload(
                search_criteria,
                no_data=False,
                already_acquired=True,
            )

        output = self.get_content(check, search_criteria)
        if not output:
            print("no data for {}".format(check))
            return self._empty_payload(search_criteria, no_data=True)
        return output

    def get_nested_content(self, output):
        """Get all of the data for dependent links.

        Args:
            output (dict): payload as returned by acquire_information()
        Return:
            related_content (dict): {"links": list, "see_also": list} where
                each list holds one acquire_information() payload per entry
                of the corresponding list in ``output``. Both lists are empty
                when ``output`` is flagged ``no_data`` or ``already_acquired``.
        """
        print("getting nested content for: {}".format(output.get("name")))

        related_content = {"links": [], "see_also": []}

        if output.get("no_data", False):
            print("no data for {}".format(output.get("name")))
            return related_content
        if output.get("already_acquired", False):
            print(" Already got data for {}".format(output.get("name")))
            return related_content

        for link in output.get("links") or []:
            related_content["links"].append(self.acquire_information(link))

        # Each "see also" entry is looked up by its own title (previously the
        # last value of ``link`` was reused here by mistake).
        for also in output.get("see_also") or []:
            related_content["see_also"].append(self.acquire_information(also))

        return related_content

    def write_out(
        self,
        path,
        content,
        file_type
    ):
        """Write ``content`` to ``path`` as UTF-8 text.

        Args:
            path (str): destination file
            content (str | list): HTML string for "html"; iterable of items
                (written one per line via ``str``) for "txt"
            file_type (str): "html" or "txt"; any other value writes nothing
        Return:
            bool: False when ``file_type`` is "txt" and ``content`` is empty
                (no file is created); True otherwise.
        """
        if file_type == "html":
            with open(path, "w", encoding="utf-8") as out:
                out.write(content or "")
        elif file_type == "txt":
            if not content:
                return False
            # Same encoding as the html branch so non-ASCII items do not
            # depend on the platform default.
            with open(path, "w", encoding="utf-8") as out:
                out.write("\n".join(str(item) for item in content))

        # Unknown file types fall through without writing; True is kept for
        # backward compatibility.
        return True

    def write_to_path(
        self,
        path,
        content
    ):
        """Save one payload under ``<cwd>/content/<path>/``.

        Writes ``<slug of name>.html`` with the raw HTML and ``sources.txt``
        with the references (the latter only when there are any).

        Args:
            path (str): directory relative to ``<cwd>/content``
            content (dict): payload with "name", "raw_html" and "references"
        Return:
            None
        """
        # os.path.join keeps this portable; hard-coded backslashes only
        # produce real directories on Windows.
        target_dir = os.path.join(os.getcwd(), "content", path)
        os.makedirs(target_dir, exist_ok=True)

        self.write_out(
            os.path.join(target_dir, "{}.html".format(slugify(content.get("name")))),
            content.get("raw_html"),
            file_type="html"
        )

        self.write_out(
            os.path.join(target_dir, "{}.txt".format("sources")),
            content.get("references"),
            file_type="txt"
        )

    def iterate_through_results(self, name_of_concept, depth=0):
        """Main function that iterates through the nested tree structure of data.

        The root page is written to ``content/<root>/``; every page linked
        from it (links and "see also") is written to
        ``content/<root>/<linked page>/``. With ``depth`` > 0 the pages linked
        from those are fetched as well, one extra level per unit of depth, and
        written under the same root directory.

        Args:
            name_of_concept (str): name from config
            depth (int): number of additional link levels to follow
        Return:
            None
        """
        root = self.acquire_information(name_of_concept)
        root_dir = slugify(root.get("name"))
        self.write_to_path(path=root_dir, content=root)

        nested = self.get_nested_content(root)
        current_level = nested["links"] + nested["see_also"]

        while depth >= 0:
            for item in current_level:
                self.write_to_path(
                    path=os.path.join(root_dir, slugify(item.get("name"))),
                    content=item
                )

            if depth == 0:
                break

            next_level = []
            for item in current_level:
                print("nested info: {}".format(item.get("name")))
                nested = self.get_nested_content(item)
                next_level += nested["links"]
                next_level += nested["see_also"]

            current_level = next_level
            print(len(current_level))

            depth -= 1

In [45]:
def get_info():
    """Crawl Wikipedia for every concept listed in ``search_data.txt``.

    The file is read from the current working directory, one concept per
    line. Blank lines are skipped so that an empty string is never sent to
    the search. Lines containing "Division" or "Composition" are prefixed
    with "Fallacy of " before searching.

    Return:
        None
    """
    search = WikiSearch()
    with open("search_data.txt", "r") as f:
        for line in f:
            search_criteria = line.strip()
            if not search_criteria:
                # Blank or whitespace-only line (e.g. trailing newline at EOF).
                continue
            if ("Division" in search_criteria) or ("Composition" in search_criteria):
                search_criteria = "Fallacy of " + search_criteria
            search.iterate_through_results(search_criteria)

In [48]:
#get_info()