# Data Scraping / Acquisition

In [132]:
# Import Dependencies
import pickle
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from wikipedia_page_cleaning import clean_wiki_page

In [35]:
# Directory where every pickle file in this notebook is written. The trailing
# slash matters: file paths are built by plain string concatenation
# (data_path + "<name>.pkl").
data_path = "/Users/seanosier/data/Metis/Wiki/"

In [4]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """
    Serialize an object to disk with pickle.

    In:
        data = the object you want to save
        filename = path of the pickle file to write (opened in "wb" mode, so an
                   existing file is overwritten)
        python_version = value handed to pickle as its ``protocol`` argument
                         (defaults to 3)

    Out:
        Nothing is returned; the pickled bytes of ``data`` end up in ``filename``
    """
    with open(filename, "wb") as out_file:
        # pickle.dump(obj, file, protocol) is shorthand for exactly this call
        pickle.Pickler(out_file, protocol=python_version).dump(data)

def load_pickle(filename):
    """
    Read back an object previously saved with pickle.

    In:
        filename = path of the pickle file to open (e.g "my_pickle.pkl")

    Out:
        The object stored in the pickle file
    """
    # NOTE: unpickling executes code embedded in the file; only load pickles
    # that you created yourself.
    with open(filename, "rb") as in_file:
        # pickle.load(file) is shorthand for exactly this call
        return pickle.Unpickler(in_file).load()

In [5]:
def get_HTML(url, timeout=None):
    """
    Download a web page and parse it with BeautifulSoup.

    In:
        url = address of the website whose contents you want to scrape
        timeout = optional number of seconds forwarded to requests.get as its
                  ``timeout``; the default None means no timeout is applied

    Out:
        html = BeautifulSoup object (built with the "lxml" parser) for the page

    Raises:
        requests.HTTPError if the response status code is outside 200-299
    """
    response = requests.get(url, timeout=timeout)
    # Explicit check rather than `assert`: assertions are removed when Python
    # runs with -O, which would let error pages flow into the parser unnoticed.
    if not 200 <= response.status_code < 300:
        raise requests.HTTPError(
            "Unexpected status code {} for {}".format(response.status_code, url),
            response=response,
        )
    return BeautifulSoup(response.text, "lxml")

### Scrape Simple Wiki Article Index (List) Pages

In [6]:
def scrape_simple_wiki_index_page(html):
    """
    Pull the article links and the "next page" link out of one wiki index page.

    In:
        html = parsed wiki index page (object supporting find / find_all)

    Out:
        Tuple of:
            links = List of href values for every <a> inside the element with
                    class "mw-allpages-chunk"
            next_page = href of the last <a> inside the element with class
                        "mw-allpages-nav" if its text starts with "Next",
                        otherwise ""
    """
    article_anchors = html.find(class_="mw-allpages-chunk").find_all("a")
    links = [anchor["href"] for anchor in article_anchors]

    # Only the last navigation anchor can be the "Next page" link
    final_nav_anchor = html.find(class_="mw-allpages-nav").find_all("a")[-1]
    next_page = final_nav_anchor["href"] if final_nav_anchor.text.startswith("Next") else ""

    return links, next_page

In [7]:
def get_all_simple_wiki_links(starting_url):
    """
    Walk the chain of wiki index pages and gather every article link.

    In:
       starting_url = URL of first wiki index page to scrape

    Out:
        List of links to all individual simple wiki pages

    Each time more than 10000 new links have accumulated since the last save,
    a (links so far, next URL) tuple is pickled to "simple_wiki_links.pkl"
    under data_path.
    """
    base_url = "https://simple.wikipedia.org"
    collected = []
    saved_count = 0
    current_url = starting_url

    # An index page without a "Next" link yields next_page == "", which makes
    # the next URL equal to the bare prefix and ends the loop.
    while current_url != base_url:
        page_links, next_path = scrape_simple_wiki_index_page(get_HTML(current_url))
        collected.extend(page_links)
        current_url = base_url + next_path

        if len(collected) - saved_count > 10000:
            pickle_it((collected, current_url), data_path + "simple_wiki_links.pkl")
            saved_count = len(collected)

    return collected

In [None]:
# Scrape all simple wiki links, starting from a Special:AllPages index URL
# (with hideredirects=1) and following the "Next" links from page to page
all_simple_wiki_links = get_all_simple_wiki_links(
    "https://simple.wikipedia.org/w/index.php?title=Special:AllPages&from=%21%21%21&hideredirects=1")

In [10]:
# Remove links to simple wiki pages that were removed since scraping.
# NOTE: list.remove raises ValueError when the value is absent, so this cell
# fails if it is run a second time on the same list.
all_simple_wiki_links.remove("/wiki/2009-2010_Mary_Fisk_School_screamer_incident")
all_simple_wiki_links.remove("/wiki/Eaas")
all_simple_wiki_links.remove("/wiki/Hazel_Levesque")
all_simple_wiki_links.remove("/wiki/Hoodoo_(geology)")
all_simple_wiki_links.remove("/wiki/Truancy")
len(all_simple_wiki_links)

115748

In [None]:
# Pickle the final list of simple wiki links. This is the same file that
# get_all_simple_wiki_links uses for its (links, next_url) checkpoints, so the
# checkpoint tuple is replaced by the plain list here.
pickle_it(all_simple_wiki_links, data_path + "simple_wiki_links.pkl")

In [12]:
# Extract simple wiki topics from simple wiki links.
# Keep everything after the "/wiki/" prefix instead of only the last
# "/"-separated piece, so a title that itself contains a slash
# (e.g. "/wiki/AC/DC") is not cut down to its final segment ("DC").
# Links without "/wiki/" fall back to the last "/"-separated piece.
simple_topics = [
    link.split("/wiki/", 1)[1] if "/wiki/" in link else link.split("/")[-1]
    for link in all_simple_wiki_links
]
len(simple_topics)

115748

In [None]:
# Pickle simple wiki topics (one topic string per simple wiki link, same order)
pickle_it(simple_topics, data_path + "simple_wiki_topics.pkl")

### Scrape All Individual Wiki Pages to Get English Wiki Page Links

In [14]:
def get_english_wiki_links(all_simple_wiki_links):
    """
    Visit every simple wiki page and collect its link to the english wiki page.

    In:
        all_simple_wiki_links = List of links to all individual simple wiki pages

    Out:
        all_links = List, in the same order as the input, holding the href of
                    the <a> inside the element with class "interwiki-en" on
                    each page, or "" when the page has no such link

    Progress is pickled to "english_wiki_links.pkl" under data_path, and the
    running count is printed, after every 100 pages.
    """
    all_links = []
    url_prefix = "https://simple.wikipedia.org"
    last_save = 0

    for link in all_simple_wiki_links:
        html = get_HTML(url_prefix + link)
        try:
            english_wiki_link_li = html.find(class_="interwiki-en")
            english_link = english_wiki_link_li.find("a")["href"]
        except (AttributeError, TypeError, KeyError):
            # Missing interwiki element (None.find), missing <a> (None[...]) or
            # missing href attribute. Anything else, including Ctrl-C, is no
            # longer swallowed.
            english_link = ""
        # Always append so the output stays aligned with the input list
        all_links.append(english_link)

        if len(all_links) >= 100 + last_save:
            pickle_it(all_links, data_path + "english_wiki_links.pkl")
            last_save = len(all_links)
            print(last_save)

    return all_links

In [16]:
# Scrape English Wiki Links from Simple Wiki Pages.
# The result has one entry per simple wiki link ("" where no english link was found).
english_wiki_links = get_english_wiki_links(all_simple_wiki_links)
len(english_wiki_links)

115748

In [None]:
# Pickle the complete list of english wiki links (same file that
# get_english_wiki_links uses for its periodic progress saves)
pickle_it(english_wiki_links, data_path + "english_wiki_links.pkl")

In [19]:
# Extract english wiki topics from english wiki links.
# Keep everything after "/wiki/" instead of only the last "/"-separated piece,
# so a title that itself contains a slash (e.g. ".../wiki/AC/DC") is not cut
# down to its final segment. Links without "/wiki/" (including the ""
# placeholder for pages with no english link) fall back to the last
# "/"-separated piece, so "" stays "".
english_topics = [
    link.split("/wiki/", 1)[1] if "/wiki/" in link else link.split("/")[-1]
    for link in english_wiki_links
]
len(english_topics)

115748

In [None]:
# Pickle english wiki topics (one entry per english wiki link, "" where there was no link)
pickle_it(english_topics, data_path + "english_wiki_topics.pkl")

In [21]:
# Build (english topic, simple topic) tuples by position and keep only the
# ones whose english topic is not blank
topic_pairs = [pair for pair in zip(english_topics, simple_topics) if pair[0] != ""]
len(topic_pairs)

113420

In [None]:
# Pickle the (english topic, simple topic) pairs
pickle_it(topic_pairs, data_path + "wiki_topic_pairs.pkl")

### Pull Raw English and Simple Wiki Articles Using API

In [31]:
def _fetch_raw_wikitext(url):
    """Return the UTF-8 decoded body served at `url`, or "" if anything fails."""
    try:
        # `with` closes the HTTP response even when read/decode raises
        with urlopen(url) as response:
            return response.read().decode('UTF-8')
    except Exception:
        # Deliberate best-effort: one bad article (HTTP error, bad URL, decode
        # failure) must not abort the whole run. Unlike a bare `except:`, this
        # still lets KeyboardInterrupt / SystemExit through.
        return ""


def _follow_redirect(raw_text, url_prefix):
    """Return `raw_text`, or the redirect target's text if it is a #REDIRECT stub."""
    from urllib.parse import quote

    # The redirect keyword is matched case-insensitively
    if raw_text[:9].upper() != "#REDIRECT":
        return raw_text

    # Non-greedy so only the FIRST [[...]] link is captured, even when further
    # links (templates, categories) follow on the same line.
    match = re.search(r"\[\[(.*?)\]\]", raw_text)
    if match is None:
        return ""

    # Drop any "|label" and "#section" parts; only the page title is wanted
    target = match.group(1).split("|", 1)[0].split("#", 1)[0].strip()
    if not target:
        return ""

    target = target.replace(" ", "_")
    # Titles taken from wikitext are raw text, so percent-encode them before
    # putting them in a URL (non-ASCII characters, "&", "+", ...).
    return _fetch_raw_wikitext(url_prefix + quote(target, safe="/:"))


def pull_raw_wiki_articles(topic_pairs):
    """
    Download the raw wikitext of every english / simple article pair.

    In:
        topic_pairs = List of grouped english topics and simple topics together
                      (element 0 = english topic, element 1 = simple topic)

    Out:
        raw_english_articles = Raw text for all english articles
        raw_simple_articles = Raw text for all simple articles

    Both lists are aligned with topic_pairs. An article that cannot be fetched
    (or a redirect that cannot be resolved) is stored as "". A "#REDIRECT"
    page is followed one hop to its target.

    After every 1000 pairs the lists collected so far are pickled to
    "raw_wiki_articles<N>.pkl" under data_path and the count is printed; N
    starts at 1 and goes up by one every 30000 pairs.
    """
    english_prefix = "https://en.wikipedia.org/w/index.php?action=raw&title="
    simple_prefix = "https://simple.wikipedia.org/w/index.php?action=raw&title="

    raw_english_articles = []
    raw_simple_articles = []
    pickle_number = 1

    for pair in topic_pairs:
        english_topic = pair[0]
        simple_topic = pair[1]

        # Get english article text, following a redirect if there is one
        raw_english_text = _follow_redirect(
            _fetch_raw_wikitext(english_prefix + english_topic), english_prefix)

        # Get simple article text, following a redirect if there is one
        raw_simple_text = _follow_redirect(
            _fetch_raw_wikitext(simple_prefix + simple_topic), simple_prefix)

        raw_english_articles.append(raw_english_text)
        raw_simple_articles.append(raw_simple_text)

        # Give status updates and save progress periodically
        if len(raw_english_articles) % 30000 == 0:
            pickle_number += 1

        if len(raw_english_articles) % 1000 == 0:
            pickle_it((raw_english_articles, raw_simple_articles), data_path + "raw_wiki_articles" \
                      + str(pickle_number) + ".pkl")
            print(len(raw_english_articles))

    return raw_english_articles, raw_simple_articles

In [68]:
# Pull raw text for all english and simple article pairs.
# Both returned lists have one entry per topic pair ("" where a download failed).
raw_english_articles, raw_simple_articles = pull_raw_wiki_articles(topic_pairs)
print(len(raw_english_articles), len(raw_simple_articles))

113420 113420


In [69]:
# Line up each english article, simple article and topic pair by position and
# keep only the trios where neither raw article is blank
raw_article_pairs = [trio for trio in zip(raw_english_articles, raw_simple_articles, topic_pairs)
                     if trio[0] != "" and trio[1] != ""]
len(raw_article_pairs)

113325

In [70]:
# Unzip the remaining english articles, simple articles, and topics.
# zip(*...) yields tuples, so all three names are bound to tuples (not lists) from here on.
raw_english_articles, raw_simple_articles, topic_pairs = zip(*raw_article_pairs)

In [72]:
# Split english and simple article lists into two and pickle (full list won't fit in a single pickle).
# Part 1 holds the first 50000 articles, part 2 holds everything after that.
pickle_it(raw_english_articles[:50000], data_path + "raw_english_articles1.pkl")
pickle_it(raw_english_articles[50000:], data_path + "raw_english_articles2.pkl")

pickle_it(raw_simple_articles[:50000], data_path + "raw_simple_articles1.pkl")
pickle_it(raw_simple_articles[50000:], data_path + "raw_simple_articles2.pkl")

# Re-save the topic pairs so the file matches the filtered article lists
pickle_it(topic_pairs, data_path + "wiki_topic_pairs.pkl")

### Clean Raw Wiki Article Text

In [73]:
# Run every raw simple article through clean_wiki_page, keeping the order
simple_articles = list(map(clean_wiki_page, raw_simple_articles))

In [74]:
# Sanity check: number of cleaned simple articles
len(simple_articles)

113325

In [76]:
# Pickle cleaned simple articles
pickle_it(simple_articles, data_path + "simple_articles.pkl")

In [99]:
# Remove last character in this english article because it causes issues during article cleaning.
# The tuple is converted to a list first so the single entry can be reassigned.
# NOTE(review): the index 83399 is hard-coded, so it presumably only points at the
# problematic article for this particular scrape. Running this cell again also
# strips one more character each time.
raw_english_articles = list(raw_english_articles)
raw_english_articles[83399] = raw_english_articles[83399][:-1]

In [None]:
# Run every raw english article through clean_wiki_page, keeping the order
english_articles = list(map(clean_wiki_page, raw_english_articles))

In [119]:
# Sanity check: number of cleaned english articles
len(english_articles)

113325

In [110]:
# Pickle cleaned english articles
pickle_it(english_articles, data_path + "english_articles.pkl")

In [144]:
# Line up each cleaned english article, cleaned simple article and topic pair by
# position and keep only the trios where neither cleaned article is blank
article_pairs_and_topics = [trio for trio in zip(english_articles, simple_articles, topic_pairs)
                            if trio[0] != "" and trio[1] != ""]
len(article_pairs_and_topics)

113298

In [145]:
# Unzip the remaining english articles, simple articles, and topics.
# zip(*...) yields tuples, so all three names are rebound to tuples of equal length.
english_articles, simple_articles, topic_pairs = zip(*article_pairs_and_topics)

In [146]:
# Sanity check: the three collections must have the same length
len(english_articles), len(simple_articles), len(topic_pairs)

(113298, 113298, 113298)

In [147]:
# Pickle final, cleaned english articles, simple articles, and topic pairs.
# These calls write to the same file names used by the earlier pickling cells,
# replacing those versions with the filtered, aligned data.
pickle_it(english_articles, data_path + "english_articles.pkl")
pickle_it(simple_articles, data_path + "simple_articles.pkl")
pickle_it(topic_pairs, data_path + "wiki_topic_pairs.pkl")