### Scrape URLs for CommonLit Readability Prize competition 

1. There are over 600 URLs to scrape. I only scraped ~570 of them, from 3-4 separate domains. 
2. Wikipedia was the most annoying to scrape cleanly. 
3. There may be some undetected artifacts in the text so use with caution.
4. You should perform your own exploratory data analysis to discover any remaining artifacts that occurred during scraping.
5. I created a [notebook](https://www.kaggle.com/teeyee314/readability-external-data-eda) with additional preparation for use with competition training.

You're welcome :)

In [None]:
!pip install -q bs4

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests
import re
import warnings
warnings.filterwarnings("ignore")

# Directory holding the competition files (relative to the working directory).
BASE_DIR = '../input/commonlitreadabilityprize'

# Show which files that directory provides.
competition_files = os.listdir(BASE_DIR)
print(competition_files)

In [None]:
# Load the competition training set from the input directory.
train_csv_path = os.path.join(BASE_DIR, 'train.csv')
train = pd.read_csv(train_csv_path)

In [None]:
# Keep only the rows that carry a source URL. The explicit copy makes
# `has_text` an independent DataFrame, so adding a column to it below is not
# an assignment onto a slice of `train` (pandas' SettingWithCopy hazard).
has_text = train[train['url_legal'].notna()].copy()

# The domain is the third '/'-separated piece of the URL:
# 'https:', '', '<domain>', ...
has_text['domain'] = has_text['url_legal'].apply(lambda url: url.split('/')[2])

In [None]:
# tally the reference urls per domain, most frequent first
url_domains = has_text['url_legal'].map(lambda url: url.split('/')[2])
url_domains.value_counts()

| Count | Url |
|--- | --- |
| 196 | simple.wikipedia.org |  
| 191 | kids.frontiersin.org |
| 176 | en.wikipedia.org |
| 8 | en.wikibooks.org |
| 571 | Total |
| 95 | Missing | 

In [None]:
def show_html(text):
    """Return the text of every ``<p>`` element found in an HTML document.

    Any ``<sup>`` element inside a paragraph is deleted before the
    paragraph's text is read.

    Args:
        text: HTML markup to parse.

    Returns:
        A list with one string per paragraph, in the order ``find_all``
        yields them.
    """
    document = BeautifulSoup(text, 'html.parser')

    paragraphs = []
    for node in document.find_all('p'):
        # find_all yields nothing when the paragraph has no <sup>, so no
        # separate existence check is needed.
        for superscript in node.find_all('sup'):
            superscript.decompose()
        paragraphs.append(node.get_text())

    return paragraphs

def clean_newline(soup=''):
    """Return ``soup`` with every newline character deleted."""
    return soup.replace('\n', '')

def clean_http(soup=''):
    """Drop empty entries and entries that contain the substring 'http'.

    Args:
        soup: iterable of strings (typically a list of paragraphs).

    Returns:
        A new list with the surviving entries in their original order.
    """
    return [entry for entry in soup if entry != '' and 'http' not in entry]

def clean_frontiersin(soup=''):
    """Clean the paragraph list scraped from a kids.frontiersin.org article.

    Paragraphs are processed in order:

    * a paragraph containing a newline is dropped;
    * the first remaining paragraph containing '↑' ends the article:
      it and everything after it are discarded;
    * every kept paragraph is passed through ``clean_brackets`` and then
      ``remove_http_url``.

    The skip/stop decisions are made directly on each paragraph rather than
    by rewriting paragraphs to the marker strings '0' and '1', so a genuine
    paragraph whose text is exactly "0" or "1" is no longer mistaken for a
    marker.

    Args:
        soup: iterable of paragraph strings.

    Returns:
        A new list of cleaned paragraph strings.
    """
    cleaned = []
    for paragraph in soup:
        if '\n' in paragraph:
            # Checked before the '↑' test: a paragraph containing both a
            # newline and '↑' is dropped and does not end the article.
            continue
        if '↑' in paragraph:
            break
        cleaned.append(remove_http_url(clean_brackets(paragraph)))
    return cleaned

def remove_copyright(soup=''):
    """Cut ``soup`` off at the conflict-of-interest statement, if present.

    Two wordings are looked for ("The authors declare ..." and "The author
    declares ..."). For each one found, ``soup`` is truncated to what comes
    before its first occurrence; the statement and everything after it are
    dropped.

    Args:
        soup: a list of paragraph strings (an entry must equal the statement
            exactly) or a plain string.

    Returns:
        The truncated ``soup``, or ``soup`` unchanged when neither wording
        occurs.
    """
    statements = (
        'The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.',
        'The author declares that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.',
    )
    for statement in statements:
        try:
            cut_at = soup.index(statement)
        except ValueError:
            # .index raises ValueError when the statement is absent; only
            # that case is expected, so other errors are not swallowed.
            continue
        soup = soup[:cut_at]
    return soup

# remove some artifacts not present in competition data
def clean_brackets(text):
    """Strip bracketed citations and parenthesised figure references.

    Removed, in this order:

    1. ``[abc123]``: square brackets holding only letters and digits;
    2. ``(Figure ...)`` / ``(Figures ...)``;
    3. ``(see Figure ...)`` / ``(see Figures ...)``;
    4. any remaining ``[...]`` group.

    Each match stops at the first closing bracket, so text lying between two
    separate groups on the same line is preserved.

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'\[[a-zA-Z0-9]+\]', '', text)
    # 's?' makes the plural optional, so both "Figure 1" and "Figures 1, 2"
    # are caught; '[^)\n]+' stops at the first ')' and stays on one line.
    cleaned = re.sub(r'\(Figures? [^)\n]+\)', '', cleaned)
    cleaned = re.sub(r'\(see Figures? [^)\n]+\)', '', cleaned)
    # Anything else in square brackets, up to the nearest ']'.
    cleaned = re.sub(r'\[[^\]]+\]', '', cleaned)
    return cleaned

def remove_http_url(soup):
    """Return ``soup`` with every URL-like substring deleted.

    The pattern is case-insensitive and matches text starting with
    ``http://``/``https://``, with ``www.`` (optionally ``www`` plus up to
    three digits), or with a ``domain.tld/`` prefix.

    Args:
        soup: string to clean.

    Returns:
        The string with all matches replaced by the empty string.
    """
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    return re.sub(url_pattern, '', soup)

# kids.frontiersin.org

In [None]:
# Rows sourced from kids.frontiersin.org, renumbered from 0.
is_frontier = has_text['domain'] == 'kids.frontiersin.org'
frontier = has_text[is_frontier].reset_index(drop=True)

In [None]:
%%time
# Download every article. requests has no default timeout, so one stalled
# connection would otherwise block this cell indefinitely.
frontier_text = frontier['url_legal'].map(lambda url: requests.get(url, timeout=30))

In [None]:
# response -> HTML -> paragraphs -> cleaned paragraphs -> one newline-joined string
frontier_soup = (
    frontier_text
    .apply(lambda response: response.text)
    .map(show_html)
    .map(clean_frontiersin)
    .map(remove_copyright)
    .map(lambda paragraphs: '\n'.join(paragraphs))
)

In [None]:
# Attach the scraped text as a new column.
frontier = frontier.assign(external_text=frontier_soup)

# en.wikibooks.org

In [None]:
# Rows sourced from en.wikibooks.org, renumbered from 0.
wikibooks = has_text.loc[has_text['domain'].eq('en.wikibooks.org')].reset_index(drop=True)

In [None]:
%%time
# Download every page. requests has no default timeout, so one stalled
# connection would otherwise block this cell indefinitely.
wikibooks_text = wikibooks['url_legal'].map(lambda url: requests.get(url, timeout=30))

In [None]:
# Turn each response into its list of paragraph strings.
wikibooks_soup = wikibooks_text.map(lambda response: show_html(response.text))

In [None]:
# Hand-tuned fixes for individual entries; the labels and slice positions
# below are hard-coded (presumably chosen by inspecting the fetched pages;
# re-verify if the pages change).

# entry 0: drop paragraphs containing 'http', drop the last five of what
# remains, then drop paragraphs that are just a newline
wikibooks_soup[0] = list(filter(lambda x: x != '\n', clean_http(wikibooks_soup[0])[:-5]))
# entry 1: same steps, but only the last paragraph is dropped
wikibooks_soup[1] = list(filter(lambda x: x != '\n',clean_http(wikibooks_soup[1])[:-1]))
# entry 2: only the newline-only paragraphs are dropped
wikibooks_soup[2] = list(filter(lambda x: x != '\n', wikibooks_soup[2]))
# entry 6: cut out the paragraphs at positions 7 and 8
wikibooks_soup[6] = wikibooks_soup[6][:7] + wikibooks_soup[6][9:]
# concatenate each entry's paragraphs into one string, with no separator
wikibooks_soup = wikibooks_soup.map(lambda x: ''.join(x))

In [None]:
# Attach the scraped text as a new column.
wikibooks = wikibooks.assign(external_text=wikibooks_soup)

# simple.wikipedia.org

In [None]:
def show_html_wiki(text):
    """Return the cleaned text of every ``<p>`` element in a wiki page.

    Before the paragraphs are read, all ``<table>``, ``<span>``, ``<ul>``
    and ``<ol>`` elements are deleted from the document (in that order).
    Inside each paragraph, ``<sup>`` elements are deleted as well, and the
    paragraph text is passed through ``remove_ufeff`` and ``remove_xa0``.

    Args:
        text: HTML markup to parse.

    Returns:
        A list with one cleaned string per paragraph.
    """
    document = BeautifulSoup(text, 'html.parser')

    # Strip non-prose containers one tag type at a time, so each lookup
    # runs against the tree left behind by the previous removals.
    for tag_name in ('table', 'span', 'ul', 'ol'):
        for element in document.find_all(tag_name):
            element.decompose()

    paragraphs = []
    for node in document.find_all('p'):
        # find_all yields nothing when the paragraph has no <sup>
        for superscript in node.find_all('sup'):
            superscript.decompose()
        paragraphs.append(remove_xa0(remove_ufeff(node.get_text())))

    return paragraphs

#  remove artifact from using requests library on wikipedia
def remove_ufeff(text):
    """Return ``text`` with every U+FEFF character deleted."""
    return text.replace('\ufeff', '')

# remove another artifact
def remove_xa0(text):
    """Replace every non-breaking space (U+00A0) in ``text`` with a space.

    A non-breaking space still separates two tokens (for example a number
    and its unit), so it is turned into an ordinary space rather than
    deleted; deleting it would glue the neighbouring words together.

    Args:
        text: string to clean.

    Returns:
        The string with each U+00A0 replaced by ' '.
    """
    return text.replace('\xa0', ' ')

def filter_newline(text):
    """Drop the empty lines from ``text``.

    The text is split on newlines, empty pieces are discarded and the rest
    is joined back with single newlines.
    """
    return '\n'.join(line for line in text.split('\n') if line != "")

In [None]:
# Rows sourced from simple.wikipedia.org, renumbered from 0.
is_simple_wiki = has_text['domain'] == 'simple.wikipedia.org'
simple_wiki = has_text[is_simple_wiki].reset_index(drop=True)

In [None]:
%%time
# Download every page. requests has no default timeout, so one stalled
# connection would otherwise block this cell indefinitely.
simple_wiki_text = simple_wiki['url_legal'].map(lambda url: requests.get(url, timeout=30))

In [None]:
# response -> HTML -> cleaned paragraphs -> one string without empty lines
simple_wiki_soup = (
    simple_wiki_text
    .apply(lambda response: response.text)
    .map(show_html_wiki)
    .map(lambda paragraphs: ''.join(paragraphs))
    .map(filter_newline)
)

In [None]:
# Attach the scraped text as a new column.
simple_wiki = simple_wiki.assign(external_text=simple_wiki_soup)

# en.wikipedia.org

In [None]:
# Rows sourced from en.wikipedia.org, renumbered from 0.
wiki = has_text.loc[has_text['domain'].eq('en.wikipedia.org')].reset_index(drop=True)

In [None]:
%%time
# Download every page. requests has no default timeout, so one stalled
# connection would otherwise block this cell indefinitely.
wiki_text = wiki['url_legal'].map(lambda url: requests.get(url, timeout=30))

In [None]:
# Per page: parse the HTML into cleaned paragraphs, concatenate them and
# drop the empty lines.
wiki_soup = wiki_text.map(
    lambda response: filter_newline(''.join(show_html_wiki(response.text)))
)

In [None]:
# Attach the scraped text as a new column.
wiki = wiki.assign(external_text=wiki_soup)

In [None]:
# Stack the four per-domain frames into one table. ignore_index=True gives
# the result a fresh 0..n-1 index instead of repeating each frame's own
# 0-based labels, so label lookups on `external` are unambiguous. The CSV is
# unaffected because the index is not written.
external = pd.concat([wiki, simple_wiki, wikibooks, frontier], ignore_index=True)
external.to_csv('external.csv', index=False)
external