# Supporting Data Scraping (Glossaries)

In [1]:
# Import Dependencies
import pickle
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import string

In [2]:
# Base directory for local data files: the glossary .txt is read from here
# and the combined pickles are written here. Machine-specific absolute path.
data_path = "/Users/seanosier/data/Metis/Final/"

In [3]:
# Pickling functions
def pickle_it(data, filename, python_version=3):
    """
    Serialize an object to disk with pickle.

    In:
        data = the object to save
        filename = path of the pickle file to write (opened in "wb" mode,
                   so an existing file is overwritten)
        python_version = value handed to pickle as the protocol number
                         (default 3)

    Out:
        Nothing is returned; the pickled data is written to filename
    """
    with open(filename, "wb") as out_handle:
        writer = pickle.Pickler(out_handle, protocol=python_version)
        writer.dump(data)

def load_pickle(filename):
    """
    Read a pickle file and hand back the object stored in it.

    In:
        filename = path of the pickle file to read (e.g. "my_pickle.pkl")

    Out:
        The unpickled object

    Note: unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, "rb") as in_handle:
        restored = pickle.Unpickler(in_handle).load()
    return restored

In [4]:
def get_HTML(url):
    """
    Download a web page and parse it for scraping.

    In:
        url = address of the website whose contents you want to scrape

    Out:
        html = BeautifulSoup object (built with the "lxml" parser) for the page

    Raises:
        requests.exceptions.HTTPError if the response status code is not 2xx
    """
    response = requests.get(url)

    # Explicit check instead of `assert`: assertions are stripped when Python
    # runs with -O, which would let error pages be parsed as if they were data.
    if not 200 <= response.status_code < 300:
        raise requests.exceptions.HTTPError(
            "Unexpected status code {} for url {}".format(response.status_code, url),
            response=response)

    return BeautifulSoup(response.text, "lxml")

### Scrape Quizlet Vocab Page

In [24]:
def scrape_quizlet_vocab(url):
    """
    Pull the glossary out of a Quizlet vocab page.

    In:
        url = url of Quizlet vocab page

    Out:
        terms = List of term strings (text of each "h3" inside the element
                with class "terms", whitespace-stripped)
        definitions = List of definition strings (text of each element with
                      class "definition" inside the same container,
                      whitespace-stripped)
    """
    def stripped_text(tags):
        # Visible text of every tag, without surrounding whitespace
        return [tag.text.strip() for tag in tags]

    container = get_HTML(url).find(class_="terms")
    term_tags = container.find_all("h3")
    definition_tags = container.find_all(class_="definition")

    return stripped_text(term_tags), stripped_text(definition_tags)

In [77]:
# Scrape the Quizlet 8th-grade physical science flash-card set (alphabetical view)
terms1, definitions1 = scrape_quizlet_vocab(
    "https://quizlet.com/21753155/8th-grade-physical-science-vocabulary-flash-cards/alphabetical")

In [78]:
# Display how many terms and definitions were scraped from Quizlet
len(terms1), len(definitions1)

(473, 473)

### Scrape k12.sd.us Vocab Page

In [54]:
def scrape_k12_sd_us_vocab(url):
    """
    Scrape a k12.sd.us vocab page whose entries are "<li>" items of the form
    "term: definition".

    In:
        url = url of k12.sd.us vocab page

    Out:
        terms = Tuple of k12.sd.us glossary terms
        definitions = Tuple of the corresponding definitions for the k12.sd.us glossary terms

    Each item is split at its FIRST ": " only, so a definition that itself
    contains ": " stays intact. An item with no ": " is kept as a term with an
    empty definition. A page with no "<li>" items yields two empty tuples.
    """
    html = get_HTML(url)

    terms, definitions = [], []
    for item in html.find_all("li"):
        entry = item.text.strip()

        # Drop non-breaking spaces and line breaks left over from the HTML layout
        for junk in ("\xa0", "\r", "\n"):
            entry = entry.replace(junk, "")
        # Collapse the four-space indentation runs into a single space
        entry = entry.replace("    ", " ")

        # partition never fails and always yields exactly one (term, definition)
        # pair, which keeps the two output sequences aligned
        term, _, definition = entry.partition(": ")
        terms.append(term)
        definitions.append(definition)

    return tuple(terms), tuple(definitions)

In [55]:
# Scrape the k12.sd.us 8th-grade science vocabulary (answers) page
terms2, definitions2 = scrape_k12_sd_us_vocab(
    "https://sb058.k12.sd.us/vocabulary/8th_grade_science_vocabulary_ans.htm")

In [43]:
# Display how many terms and definitions were scraped from k12.sd.us
len(terms2), len(definitions2)

(308, 308)

### Scrape Vocabulary.com Page

In [47]:
def scrape_vocabulary_com_vocab(url):
    """
    Pull the word list out of a Vocabulary.com list page.

    In:
        url = url of Vocabulary.com vocab page

    Out:
        terms = List of term strings (text of each "a" tag inside the element
                with id "wordlist", whitespace-stripped)
        definitions = List of definition strings (text of each element with
                      class "definition" inside the same element,
                      whitespace-stripped)
    """
    page = get_HTML(url)
    wordlist = page.find(id="wordlist")

    link_tags = wordlist.find_all("a")
    definition_tags = wordlist.find_all(class_="definition")

    terms = []
    for link in link_tags:
        terms.append(link.text.strip())

    definitions = []
    for tag in definition_tags:
        definitions.append(tag.text.strip())

    return terms, definitions

In [48]:
# Scrape Vocabulary.com word list 24280 (notes view)
terms3, definitions3 = scrape_vocabulary_com_vocab(
    "http://www.vocabulary.com/lists/24280#view=notes")

In [49]:
# Display how many terms and definitions were scraped from Vocabulary.com
len(terms3), len(definitions3)

(119, 119)

### Read In 8th Grade Glossary .txt

In [115]:
def read_in_8th_grade_glossary_file(filename):
    """
    Parse a glossary text file with one "term - definition" entry per line.

    In:
        filename = Location of pre-downloaded 8th grade glossary .txt file

    Out:
        terms = Tuple of pre-downloaded 8th grade glossary terms
        definitions = Tuple of the corresponding definitions for the pre-downloaded 8th grade glossary terms

    Blank lines are skipped. Each remaining line is split at its FIRST "-":
    the text before it is the term, everything after it (including any later
    hyphens) is the definition, and both are whitespace-stripped. A line with
    no "-" becomes a term with an empty definition. Note that a term which
    itself contains a hyphen is therefore cut at that hyphen.

    A file with no non-blank lines returns two empty tuples.
    """
    terms, definitions = [], []

    with open(filename) as f:
        for line in f:
            entry = line.strip()
            if not entry:
                continue

            term, _, definition = entry.partition("-")
            terms.append(term.strip())
            definitions.append(definition.strip())

    # Tuples match what zip(*pairs) produced before, but also work for no entries
    return tuple(terms), tuple(definitions)

In [75]:
# Parse the local glossary text file stored under data_path
terms4, definitions4 = read_in_8th_grade_glossary_file(data_path + "8thgradeglossary.txt")

In [76]:
# Display how many terms and definitions were read from the glossary file
len(terms4), len(definitions4)

(143, 143)

### Scrape HRW Glossary Pages

In [5]:
def scrape_HRW_vocab(starting_url):
    """
    Scrape the HRW glossary, which is spread over one page per letter.

    In:
        starting_url = url of first HRW vocab page ("A"). The last five
                       characters (expected to be "a.htm") are replaced with
                       "<letter>.htm" to build the url of every other page.

    Out:
        terms = Tuple of HRW glossary terms
        definitions = Tuple of the corresponding definitions for the HRW glossary terms

    Every "<p>" element on every page is treated as one entry. The "<p>" and
    "<b>" tags are removed and line-break tags are turned into " = ", then the
    entry is split at the FIRST " = ": the text before it is the term and the
    rest is the definition. An entry without a line break becomes a term with
    an empty definition. If no "<p>" elements are found at all, two empty
    tuples are returned.
    """
    raw_entries = []

    # Scrape page for each letter
    for letter in string.ascii_lowercase:
        url = starting_url[:-5] + letter + ".htm"
        html = get_HTML(url)
        # Keep the raw markup: the <br> tags are needed to find the term/definition boundary
        raw_entries += [str(paragraph) for paragraph in html.find_all("p")]

    # Clean scraped vocab
    terms, definitions = [], []
    for entry in raw_entries:
        for tag in ("<p>", "</p>", "<b>", "</b>"):
            entry = entry.replace(tag, "")
        for line_break in ("<br>", "<br/>"):
            entry = entry.replace(line_break, " = ")

        term, _, definition = entry.strip().partition(" = ")
        terms.append(term.strip())
        definitions.append(definition.strip())

    return tuple(terms), tuple(definitions)

In [117]:
# Scrape all HRW glossary pages, starting from the "a" page
terms5, definitions5 = scrape_HRW_vocab("http://go.hrw.com/resources/go_sc/glossary/termsa.htm")

In [118]:
# Display how many terms and definitions were scraped from HRW
len(terms5), len(definitions5)

(3401, 3401)

### Combine All Vocab Lists

In [121]:
# Concatenate the five sources in the same order for both lists so that
# terms[i] still corresponds to definitions[i]. list() is applied because
# some of the sources are tuples.
terms = list(terms1) + list(terms2) + list(terms3) + list(terms4) + list(terms5)
definitions = list(definitions1) + list(definitions2) + list(definitions3) + list(definitions4) + list(definitions5)

In [122]:
# Display the combined number of terms and definitions
len(terms), len(definitions)

(4444, 4444)

In [123]:
# Save the combined lists as two separate pickle files under data_path
pickle_it(terms, data_path + "terms.pkl")
pickle_it(definitions, data_path + "definitions.pkl")