# 2. Web Scraping

Download all posts from a section of your choice on http://forum.lvivport.com and extract each post as a separate text. The task requires web scraping.

Hints: scrapy, readability, lazyweb, lazynlp

In [20]:
# For debug purposes
from pprint import pprint as pp

import requests
from bs4 import BeautifulSoup as bs
import re
import json

# Forum root; get_content() prepends it to every thread href before requesting it
base_url = "http://forum.lvivport.com/"

# Index page of the board being scraped ("Divochi Posydenky")
board_url = "http://forum.lvivport.com/forums/divochi-posidenki.194/"


def generate_urls(board_url, num_pages):
    """Build the list of index-page URLs for a paginated board.

    The first page is ``board_url`` itself; every following page N (from 2 up
    to and including ``num_pages``) is ``board_url`` with ``page-N`` appended.
    For ``num_pages`` of 1 or less only ``board_url`` is returned.
    """

    # range(2, n + 1) is empty when n <= 1, so no explicit guard is needed
    following_pages = [
        board_url + 'page-{0}'.format(page_number)
        for page_number in range(2, num_pages + 1)
    ]

    return [board_url] + following_pages


def get_threads(board_url, num_pages):
    """Collect thread links from the index pages of a board.

    Args:
        board_url: URL of the first index page of the board.
        num_pages: number of index pages to walk (see ``generate_urls``).

    Returns:
        A list with the ``href`` value of every thread-title link found, in
        page order. The values are returned exactly as they appear in the
        markup; anchors that carry no ``href`` are skipped.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the server does not answer within the timeout.
    """

    threads = []

    for url in generate_urls(board_url, num_pages):
        # Without a timeout a stalled server would hang the scrape forever
        html = requests.get(url, timeout=30).text
        soup = bs(html, 'html.parser')

        # Thread titles are <a> tags nested inside the discussion list items
        links = soup.select('ol.discussionListItems > li > div > div > h3 > a')
        for link in links:
            href = link.get('href')
            # Tag.get() returns None for a missing attribute; a None entry
            # would break the string concatenation that builds the thread URL
            if not href:
                continue
            threads.append(href)

    return threads


# Cleanup rules for post text, applied in the order listed. Order matters:
# longer newline runs must be collapsed before shorter ones.
_TEXT_POSTPROCESSORS = (
    # Inline ad bootstrap script that leaks into the extracted text
    ('(adsbygoogle = window.adsbygoogle || []).push({});', ''),
    # "Click to expand..." label (presumably from the quote widget)
    ('Натисніть, щоб розгорнути...', ''),
    ('\n\n\n\n', '\n'),
    ('\n\n\n', '\n'),
    ('\n\n', '\n'),
    ('↑\n', ''),
    ('\t', ''),
    ('  ', ' '),
)

# Seconds to wait for the server before giving up on a request
_REQUEST_TIMEOUT = 30


def _clean_post_text(text):
    """Apply every cleanup rule from ``_TEXT_POSTPROCESSORS`` to ``text``."""
    for needle, replacement in _TEXT_POSTPROCESSORS:
        text = text.replace(needle, replacement)
    return text


def _parse_post(post):
    """Turn one ``<li>`` tag into ``(post_id, post_content)``.

    Returns ``None`` when the tag is not a post, i.e. it has no ``id``
    attribute or the ``id`` does not contain ``post-<digits>``.
    """
    element_id = post.get('id')
    # Some <li> are non-content sections (ads, list items inside a message...)
    match = re.search(r'post-(\d+)', element_id) if element_id else None
    if match is None:
        return None

    post_content = {}

    # Author: taken from the href of the first link in the avatar holder
    avatar_links = post.select('div.avatarHolder a')
    if avatar_links:
        href = avatar_links[0].get('href')
        user_match = re.search(r'\/(\S+)\/', href) if href else None
        if user_match is not None:
            post_content['user_id'] = user_match.group(1)

    # Message body: text of the first <article> inside the post
    articles = post.select('article')
    if articles:
        post_content['post_text'] = _clean_post_text(articles[0].get_text())

    return match.group(1), post_content


def get_content(base_url, threads):
    """
    The actual content scraping routine

    Requests ``base_url + thread_url`` for every entry of ``threads`` and
    parses the posts found on that page. Only that single page is fetched
    per thread; further pages of a long thread are not followed.

    Returns content in a nested dict as follows:

    forum_content = {int thread_id: {
        str post_id1 : {
            'user_id': str,
            'post_text': str,
        }
    }}

    ``thread_id`` is parsed from the URL (first ``.<digits>/`` occurrence);
    when it cannot be parsed, "Invalid Thread ID" is printed and the thread
    is stored under ``-1``. ``user_id`` and ``post_text`` are each omitted
    when the corresponding element is missing from the post.

    Raises requests.exceptions.RequestException if a thread page cannot be
    fetched, including when the server does not answer within the timeout.
    """

    # Init forum container
    forum_content = {}

    # Walk each thread url
    for thread_url in threads:
        url = base_url + thread_url

        # Request & soupify page; the timeout keeps a stalled server from
        # hanging the whole scrape
        html = requests.get(url, timeout=_REQUEST_TIMEOUT).text
        soup = bs(html, 'html.parser')

        # Parse thread id
        try:
            thread_id = int(re.findall(r'\.(\d+)\/', url)[0])
        except IndexError:
            print("Invalid Thread ID")
            thread_id = -1

        # Init thread container
        thread_content = {}

        # Messages live in <ol class="messageList">, posts are its <li> items
        for message_list in soup.select('ol.messageList'):
            for post in message_list.select('li'):
                parsed = _parse_post(post)
                if parsed is None:
                    continue

                # Save post into thread container
                post_id, post_content = parsed
                thread_content[post_id] = post_content

        # Save thread into board container
        forum_content[thread_id] = thread_content

    return forum_content
    
    

# Scrape the first 5 index pages of the board, then every thread listed there
threads = get_threads(board_url, 5)
forum_content = get_content(base_url, threads)
json_string = json.dumps(forum_content)

# The context manager closes the file even if the write raises
with open('posydenky.json', 'w+') as jfile:
    jfile.write(json_string)

print("Parsed %s threads." % len(threads))

Parsed 64 threads.


In [96]:
# Debug cell: show the <title> tag(s) of a parsed page.
# NOTE(review): `soup` is not assigned at top level in the cell above (only
# inside the functions), so it presumably comes from earlier notebook state.
soup.find_all('title')

[<title>Подарунок хлопцю | Львівський Форум</title>]