# Webpage data scraping with Mercury, Requests and BeautifulSoup

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import json
import config
import pathlib
import re
from nltk.tokenize import sent_tokenize, word_tokenize
import csv

  if 'order' in inspect.getargspec(np.copy)[0]:


### Configuration

In [2]:
# Name of the model being scraped; all of its files live under models/<model>.
model = 'mobile'
folder = 'models/{}'.format(model)

# Create a sources.txt file inside the model folder, listing one URL per line.

In [3]:
class WebParser:
    """Fetch the extracted content of a web page through the Mercury parser API.

    The API key is read from the ``mercury_api_key`` entry of the project
    configuration.
    """

    # Seconds to wait for the parser API; without a timeout a stalled
    # connection would block the caller indefinitely.
    request_timeout = 30

    def __init__(self):
        self.env_variables = config.Config().get()
        self.parse_api_key = str(self.env_variables['mercury_api_key'])
        self.parse_api_url = 'https://mercury.postlight.com/parser?url='

    def prepare_payload(self, url):
        """Return the HTTP headers sent with every parser request.

        Args:
            url: Accepted for interface compatibility; not used.

        Returns:
            A dict with the content type and the API key header.
        """
        return {'Content-Type': 'application/json',
                'x-api-key': self.parse_api_key}

    def parse(self, url):
        """Request the parsed version of ``url`` and return its content.

        Args:
            url: Address of the page to parse; appended to the API URL.

        Returns:
            The ``content`` field of the JSON reply. If the reply body is not
            valid JSON, the reply body as text. If the request fails or the
            reply has no ``content`` field, the exception message as a string.
        """
        headers = self.prepare_payload(url)
        request_url = self.parse_api_url + url

        # The request is handled separately from decoding so that a failed
        # request never reaches code that needs a response object.
        try:
            response = requests.get(request_url, headers=headers,
                                    timeout=self.request_timeout)
        except Exception as e:
            return str(e)

        try:
            return response.json()['content']
        except ValueError:
            # Body is not JSON: return it as text (not bytes) so the result
            # stays JSON-serializable like the regular return value.
            return response.text
        except Exception as e:
            return str(e)

In [4]:
class FileStorage:
    """Write scraped data to files located under ``<folder>/data/<path>``.

    ``path`` is joined by plain string concatenation, so it should either be
    empty or end with a ``/``.
    """

    def __init__(self, folder, path):
        self.env_variables = config.Config().get()
        self.folder = folder
        self.path = path

    def _target_prefix(self):
        """Create the output directory if needed and return its path prefix."""
        prefix = self.folder + '/data/' + self.path
        pathlib.Path(prefix).mkdir(parents=True, exist_ok=True)
        return prefix

    def store(self, data):
        """Serialize ``data`` as JSON into ``<uid>.json``.

        Args:
            data: JSON-serializable dict; its ``uid`` entry names the file.
        """
        # Serialize first so a failure does not leave a truncated file behind.
        serialized = json.dumps(data)
        target = self._target_prefix() + data['uid'] + '.json'
        # The context manager guarantees the handle is flushed and closed.
        with open(target, 'w', encoding='utf-8') as fp:
            fp.write(serialized)

    def storeInCSV(self, uid, data):
        """Write every item of ``data`` as a one-column row of ``<uid>.csv``.

        Args:
            uid: File name without extension.
            data: Iterable of values, one per CSV row.
        """
        target = self._target_prefix() + uid + '.csv'
        # newline='' lets the csv module control line endings itself, which
        # avoids blank rows on platforms that translate '\n'.
        with open(target, 'w', encoding='utf-8', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"')
            writer.writerows([row] for row in data)

In [5]:
class TextExtractor:
    """Turn HTML into sentences and sentences into word tokens."""

    def __init__(self):
        self.env_variables = config.Config().get()

    def HTMLtoLines(self, html):
        """Strip the markup from ``html`` and return its text split into sentences."""
        plain_text = BeautifulSoup(html, 'html.parser').get_text()
        return sent_tokenize(plain_text)

    def SentenceToWords(self, sentence):
        """Return the word tokens of ``sentence``."""
        return word_tokenize(sentence)

In [6]:
class Download:
    """Fetch every URL listed in ``<folder>/sources.txt`` and store the results.

    For each valid URL the parsed content is saved as JSON under ``html/`` and
    its sentences as CSV under ``csv/``.
    """

    def __init__(self, folder):
        self.env_variables = config.Config().get()
        self.folder = folder
        self.url_regex = re.compile(
            r'^(?:http|ftp)s?://' # http:// or https://
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
            r'localhost|' #localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
            r'(?::\d+)?' # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    @staticmethod
    def _uid_from_url(url):
        """Return the last non-empty path segment of ``url`` as a file name.

        Trailing slashes are dropped first; otherwise every URL ending in
        ``/`` would get an empty uid and overwrite the same output files.
        """
        return url.rstrip('/').split('/')[-1]

    def start(self):
        """Process the source list line by line.

        Blank lines are skipped silently; lines that do not look like a URL
        are reported on stdout and skipped.
        """
        with open(self.folder + '/sources.txt') as sources:
            for line in sources:
                url = line.strip()
                if not url:
                    continue
                if self.url_regex.match(url) is None:
                    print('faulty url: ' + url)
                    continue
                data = {
                    'content': WebParser().parse(url),
                    'uid': self._uid_from_url(url),
                }
                FileStorage(self.folder, 'html/').store(data)
                sentences = TextExtractor().HTMLtoLines(data['content'])
                FileStorage(self.folder, 'csv/').storeInCSV(data['uid'], sentences)

In [7]:
# Scrape every URL listed in the configured folder's sources.txt.
scraper = Download(folder)
scraper.start()

### Compiled CSV of sentences

In [8]:
# Merge the sentences of every per-page CSV into one '<model>-compiled.csv'.
allRows = []
directory = os.getcwd() + '/' + folder + '/data/csv/'
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# result, and read only .csv files so that stray entries (for example
# checkpoint folders) do not break the run.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.csv'):
        continue
    with open(os.path.join(directory, filename), 'r', encoding='utf-8', newline='') as csvfile:
        # csv.reader yields an empty list for blank lines; skip those rows
        # instead of failing on row[0].
        allRows.extend(row[0] for row in csv.reader(csvfile) if row)
FileStorage(folder, '').storeInCSV(model + '-compiled', allRows)