In [1]:
import concurrent.futures as cf
import requests
import pickle
import os

from web_scrape import ScrapePageRegex as ScrapePage


# Prep

In [2]:
starting_url = "https://en.wikipedia.org/wiki/Wikipedia"
comparison_url = "https://en.wikipedia.org/wiki/Open_collaboration" #  
domains_to_crawl = ["en.wikipedia.org"] # Add None for all domains

# Using Regex

Must faster than BeautifulSoup

Sample

In [3]:
# Sample
s = ScrapePage()
txt = s.load_page(starting_url)
comparison_links = s.find_links(txt)
comparison_links[0:5]

['https://en.wikipedia.org/wiki/RationalWiki',
 'https://en.wikipedia.org/wiki/GAVI',
 'https://en.wikipedia.org/wiki/Slashdot',
 'https://en.wikipedia.org/wiki/Financial_Times',
 'https://en.wikipedia.org/wiki/Viola_angustifolia']

In [4]:
class WikiCrawler:
    
    def __init__(
        self, 
        starting_url:str="https://en.wikipedia.org/wiki/Wikipedia", 
        pickle_dir:str="wiki_pickle", 
        request_session = None
    ) -> None:
        
        if not os.path.exists(pickle_dir):
            os.mkdir(pickle_dir)
        
        # Pickle Settings 
        self.pickle_dir = pickle_dir
        self.pickle_limit = 100_000
        self.pickle_counter = 0
        
        # Web Scraping Object
        self.s = ScrapePage(request_session=request_session)
        
        # Storage
        self.id_counter = 0
        self.url_to_id = {}
        
        self.graph = {} # {id0: [id1, id2, id3]}
        self.nodes = {} # {id0: {"url": url, "data": data}}
        
        self.errors = {}
        
        # RAM Storage
        self.queue = []
        self.queue.append(starting_url)
        
        # Starting URL
        self.starting_url = starting_url

        
    def _crawl_page(self, url):
        """
        Crawl a single page and add to graph and nodes.
        
        Args:
            url (str): URL to crawl
            
        Returns:
            _id (int): ID of crawled page
        """
        if url in self.url_to_id.keys() and self.url_to_id[url] in self.graph.keys():
            return
        
        txt = self.s.load_page(url)
        links = self.s.find_links(txt)
        
        if len(links) == 0:
            return
        
        _id = self.id_counter
        self.id_counter += 1
        
        self.url_to_id[url] = _id
        

        self.nodes[_id] = {"url": url, "data": self.s.find_data_block(txt)}
        
        for link in links:
            if not link in self.url_to_id:
            
                self.url_to_id[link] = self.id_counter
                self.id_counter += 1
            
            self.graph[_id] = self.graph.get(_id, []) + [self.url_to_id[link]]
                
            self.queue.append(link)
            
        return _id
    
    
    def pickle_progress(self):
        """
        Pickle graph and nodes.
        """
        # Pickle graph and nodes
        pickle.dump(self.graph, open(f"{self.pickle_dir}/graph_{self.pickle_counter}.pkl", "wb"))
        pickle.dump(self.nodes, open(f"{self.pickle_dir}/nodes_{self.pickle_counter}.pkl", "wb"))
        pickle.dump(self.errors, open(f"{self.pickle_dir}/errors_{self.pickle_counter}.pkl", "wb"))
        self.pickle_counter += 1
         
    
    def log_results(self):
        """
        Log results to console.
        
        Prints progress every 100 pages.
        
        Pickles progress every self.pickle_limit pages.
        """
        if len(self.nodes) % 1000 == 0:
            print(f"Progress:\nPages in Queue (past and current) {self.id_counter}\nPages Scanned: {len(self.nodes)}\n===================================")
        if len(self.nodes) % self.pickle_limit == 0:
            self.pickle_progress()
        
            
            
    def crawl(self, max_pages=1000):
        """
        Crawl pages until max_pages is reached or queue has been emptied.
        
        Multiprocessing needs to be tested. Current equipment does not have a GPU.
        
        Args:
            max_pages (int): Maximum number of pages to crawl
            
        Returns:
            None
        """
        # Multiprocessing
        with cf.ProcessPoolExecutor() as executor:
            while len(self.nodes) < max_pages:
                # If queue is empty, wait for all processes to finish and check again
                if len(self.queue) == 0:
                    print("Waiting for processes to finish...")
                    cf.wait(executor, timeout=None, return_when=cf.ALL_COMPLETED)
                    # If queue is still empty, break
                    if len(self.queue) == 0:
                        break
                    
                # If queue is not empty, pop first url and crawl
                url = self.queue.pop(0)
                try:
                    self._crawl_page(url)
                except Exception as e:
                    self.errors["url"] = e
                    continue
                self.log_results()
                
        # Pickle final results
        self.pickle_progress()

        
    

In [9]:
r = requests.Session()
w = WikiCrawler(starting_url, request_session=r)
w.crawl(10_000);
r.close()

Progress: 17271
Progress: 40545
Progress: 57619
Progress: 79211
Progress: 94810
Progress: 108245
Progress: 119024
Progress: 132892
Progress: 141888
Progress: 150125
Progress: 150125
Progress: 163959
Progress: 176993
Progress: 189598
Progress: 207289
Progress: 219750
Progress: 229146
Progress: 239509
Progress: 251671
Progress: 260443
Progress: 271126
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 278647
Progress: 297814
Progress: 309669
Progress: 315275
Progress: 319791
Progress: 322470
Progress: 324846
Progress: 332537
Progress: 337048
Progress: 337048
Progress: 350472
Progress: 350472
Progress: 350472
Progress: 350472
Progress: 364442
Progress: 364442
Progress: 372876
Progress: 372876
Progress: 372876
Progress: 375550
Progress: 379127
Progress: 387671
Progress: 387671
Progress: 387671
Progress: 387671
Progress: 387671
Progress: 400970
Pr

IndexError: list index out of range

10 pages took 1.7s. Need to test with better spec computer

In [10]:
w.nodes

{0: {'url': 'https://en.wikipedia.org/wiki/Wikipedia',
  'data': '{"@context":"https:\\/\\/schema.org","@type":"Article","name":"Wikipedia","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/Wikipedia","sameAs":"http:\\/\\/www.wikidata.org\\/entity\\/Q52","mainEntity":"http:\\/\\/www.wikidata.org\\/entity\\/Q52","author":{"@type":"Organization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2001-11-06T01:16:13Z","dateModified":"2023-10-21T07:46:34Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/en\\/8\\/80\\/Wikipedia-logo-v2.svg","headline":"free multilingual online encyclopedia"}'},
 985: {'url': 'https://en.wikipedia.org/wiki/RationalWiki',
  'data': '{"@context":"https:\\/\\/schema.org","@type":"Article","name":"RationalWiki","url":"https:\\/\\/en.wikipedia.org\\/wiki\\/RationalWiki",

In [11]:
len(w.graph)

8650