Notebook to accompany the Programming Historian lesson:
    
    "Web scraping of news articles from the UK Web Archive using Boilerpipe"

In [None]:
!pip install pandas
import pandas as pd

In [None]:
# Load the exported CSV into a dataframe. skiprows=2 makes pandas ignore the
# first two lines of the file (presumably non-tabular lines that precede the
# column headers — TODO confirm against the export).
shinedata = pd.read_csv('./data/export.csv', skiprows=2)

# In Jupyter notebooks, putting the name of a variable (or any expression) on
# the last line of a cell displays its value below the cell. This helps to
# inspect the values and structures of the variables your code is working on.
shinedata.head()

### Renaming the columns

In [None]:
# Look at the column labels as pandas read them from the CSV, before renaming
shinedata.columns

In [None]:
# Tidy every column label: trim surrounding whitespace and use underscores
# in place of spaces. The cleaned list is displayed for inspection.
cols = [label.strip().replace(' ', '_') for label in shinedata.columns.to_list()]
cols

In [None]:
# Two labels are replaced by position: index 7 (the eighth item in the list)
# becomes 'Archive_URL' and index -1 (the last item) becomes 'Original_URL'.
cols[7], cols[-1] = 'Archive_URL', 'Original_URL'

# Assign the finished list of column names to the dataframe.
shinedata.columns = cols

In [None]:
# Display the first rows to check the dataframe's column names
shinedata.head()

In [None]:
# Narrow the dataframe to the crawl date and the two URL columns;
# every other column is dropped.
shinedata = shinedata[['Crawl_Date', 'Archive_URL', 'Original_URL']]

In [None]:
# Display the first rows to check which columns the dataframe now holds
shinedata.head()

In [None]:
# Keep one row per Original_URL: when the same original URL occurs in several
# rows, only the first occurrence is retained.
shinedata = shinedata.drop_duplicates(subset='Original_URL', keep='first')

In [None]:
# Print a summary of the dataframe (row count, columns, non-null counts, dtypes)
shinedata.info()

### Deeper data cleaning

In [None]:
# Build the list of archive URLs to scrape, leaving out any URL that ends in
# 'rss' (presumably RSS feeds rather than article pages — TODO confirm).
# dropna() skips rows without an archive URL: pandas stores missing values as
# floats, which have no .endswith() and would otherwise raise AttributeError.
urls = [url for url in shinedata.Archive_URL.dropna() if not url.endswith('rss')]

# Number of URLs left after filtering
len(urls)

In [None]:
# Write the list to a text file, one URL per line - this is useful if you want
# to use wget, trafilatura or another command line scraping tool.
# The encoding is given explicitly, as in the other cells that write files, so
# the result does not depend on the platform's default encoding.
with open('./data/unique-urls.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(urls))

## Web Scraping using Boilerpipe

In [None]:
!pip install boilerpy3
from boilerpy3 import extractors

In [None]:
# Try the extractor on a single page before scraping the whole list.
extractor = extractors.ArticleExtractor()
# urls[55] picks the entry at index 55 (presumably an arbitrary sample — any
# valid index would do); the list must hold at least 56 URLs for this to work.
content = extractor.get_content_from_url(urls[55])

content  # View what Boilerpipe returned

### Choosing filenames

In [None]:
def filenameFromUrl(url):
    """Turn a URL into a string that can be used as a file name.

    Every "http://" and "https://" is removed, then every "." and "/" is
    replaced by an underscore. The substitutions are applied one after the
    other, in that order. Returns the resulting string.
    """
    substitutions = (
        ("http://", ""),
        ("https://", ""),
        (".", "_"),
        ("/", "_"),
    )
    name = url
    for old, new in substitutions:
        name = name.replace(old, new)
    return name

In [None]:
# Test it out on the first 5 URLs in our list.
# NOTE(review): url[27:] drops the first 27 characters of each URL before it
# is converted, whereas the scraping loops further down pass the full URL to
# filenameFromUrl. The names printed here are therefore shorter than the file
# names that are actually written — confirm which form is intended.
for url in urls[:5]:
    print(filenameFromUrl(url[27:]))

### Where to save our corpus

In [None]:
import os

# Directory in which the scraped pages are saved as text files.
corpusdir = './data/corpus/'

# makedirs with exist_ok=True also creates missing parent directories and does
# nothing when the directory already exists, so this cell is safe to re-run.
os.makedirs(corpusdir, exist_ok=True)

**How long is scraping 361 URLs going to take?**

In [None]:
%%time

# Scrape one page (the first URL in the list) so that the %%time magic at the
# top of this cell reports how long a single request takes.
extractor = extractors.ArticleExtractor()
content = extractor.get_content_from_url(urls[0])

In [None]:
# Rough estimate of the total scraping time in minutes: the number of URLs
# times 2.8, divided by 60. The 2.8 is presumably the number of seconds the
# single timed request took — TODO replace it with your own measurement.
len(urls) * 2.8 / 60

### Scraping all pages

In [None]:
# Scrape every URL that does not yet have a text file in the corpus directory.
# Pages that were saved by an earlier run are skipped, so the cell can be
# interrupted and re-run without downloading everything again.

# One extractor is enough; it is reused for every page.
extractor = extractors.ArticleExtractor()

for url in urls:
    filename = os.path.join(corpusdir, filenameFromUrl(url)) + '.txt'
    if os.path.exists(filename):
        print('Already scraped... {}'.format(url))
        continue

    print('Scraping... {}'.format(url))
    # Download and extract BEFORE opening the output file: opening with 'w'
    # creates the file immediately, so a failed download would otherwise leave
    # an empty file behind that the next run would report as already scraped.
    content = extractor.get_content_from_url(url)
    with open(filename, 'w', encoding='utf8') as f:
        f.write(content)

Below is the same scraping algorithm with a small amount of error handling added.

In [None]:
# Same scraping loop, but a page that fails is reported and recorded in a log
# file instead of stopping the whole run.
errorlog = 'error.log'

# One extractor is enough; it is reused for every page.
extractor = extractors.ArticleExtractor()

for url in urls:
    filename = os.path.join(corpusdir, filenameFromUrl(url)) + '.txt'
    if os.path.exists(filename):
        print('Already scraped... {}'.format(url))
        continue

    try:
        print('Scraping... {}'.format(url))
        # Download and extract BEFORE opening the output file: opening with
        # 'w' creates the file immediately, so a failed download would
        # otherwise leave an empty file behind and the URL would be reported
        # as already scraped (and never retried) on the next run.
        content = extractor.get_content_from_url(url)
        with open(filename, 'w', encoding='utf8') as f:
            f.write(content)

    except Exception as ex:
        # Deliberately broad: any failure on one page is logged and the loop
        # carries on with the next URL.
        errormsg = 'Exception of type {} on... {}\n'.format(type(ex).__name__, url)
        print(errormsg)
        with open(errorlog, 'a', encoding='utf8') as errlog:
            # Note that this file is opened in 'a' (append) mode, which adds
            # to the end of an existing file. The much more common 'w' mode
            # would overwrite the file and lose the earlier error messages.
            errlog.write(errormsg)

## Are the articles still relevant?

In [None]:
import re

# Let's first create a new directory called 'filtered_corpus', which will hold
# the files from the 'corpus' directory that pass the filter.
corpusdir = './data/corpus'
filteredcorpusdir = './data/filtered_corpus'

# exist_ok=True makes the cell safe to re-run when the directory already exists.
os.makedirs(filteredcorpusdir, exist_ok=True)


# Search terms. Each one is used as a case-sensitive pattern, which is why
# both capitalisations are listed.
items = ['legacy', 'Legacy']


# For every text file in 'corpus', report for each search term whether it was
# found and how many times. A file in which at least one term occurs is copied
# to 'filtered_corpus' under the same file name.
for filename in os.listdir(corpusdir):
    if not filename.endswith(".txt"):
        continue

    # Read the whole article, then close the input file before writing anything.
    with open(os.path.join(corpusdir, filename), 'r', encoding='utf-8') as myfile:
        content = myfile.read()

    matched = False
    for i in items:
        count = len(re.findall(i, content))
        if count == 0:
            print(filename, 'Not found')
        elif count == 1:
            print(filename, 'Found once')
        elif count == 2:
            print(filename, 'Found twice')
        else:
            print(filename, 'Found', count, 'times')
        matched = matched or count > 0

    if matched:
        # filename already ends in '.txt', so it is used unchanged; the copy
        # is written once per file, however many terms matched.
        with open(os.path.join(filteredcorpusdir, filename), 'w', encoding='utf-8') as file1:
            file1.write(content)