Merge pull request #108 from MrLokans/master
Implemented importpython.com basic parser.
axsapronov committed May 14, 2016
2 parents 5f7ff6c + e5b4abc commit 6e36310
Showing 4 changed files with 240 additions and 0 deletions.
169 changes: 169 additions & 0 deletions digest/management/commands/import_importpython.py
@@ -0,0 +1,169 @@
# -*- coding: utf-8 -*-
"""
This module contains a command to obtain news from importpython.com
and save them to the database.
To use it, run something like
python manage.py import_importpython --number 67
If no arguments are specified, the latest news page is parsed.
"""
from __future__ import unicode_literals

from urllib.error import URLError
from urllib.request import urlopen

from typing import Dict, Union, Tuple, List

from django.core.management.base import BaseCommand
from bs4 import BeautifulSoup

from digest.management.commands import (
apply_parsing_rules,
apply_video_rules,
save_item
)

from digest.models import (
ITEM_STATUS_CHOICES,
ParsingRules,
Section,
Resource
)

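# Type aliases: ResourceDict describes a parsed news item ready to be saved,
# ItemTuple pairs a news block's "subtitle" element with its body element.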
ResourceDict = Dict[str, Union[str, int, Resource]]
ItemTuple = Tuple[BeautifulSoup, BeautifulSoup]


class ImportPythonParser(object):
BASE_URL = "http://importpython.com"
RESOURCE_NAME = "importpython"

def __init__(self):
pass

@staticmethod
def _get_url_content(url: str) -> str:
"""Gets text from URL's response"""
try:
result = urlopen(url, timeout=10).read()
except URLError:
return ''
else:
return result

@classmethod
def get_latest_issue_url(cls) -> str:
"""Returns latest issue URL"""
archive_url = "/".join([cls.BASE_URL, "newsletter", "archive"])
content = cls._get_url_content(archive_url)
soup = BeautifulSoup(content, "lxml")
el = soup.find_all("div", "info")[0]
href = el.find("h2").find("a")["href"]
link = cls.BASE_URL + href
return link

@classmethod
def get_issue_url(cls, number: int) -> str:
"""Returns issue URL corresponding to the issue number"""
number = int(number)
if number >= 16:
return "/".join([cls.BASE_URL, "newsletter", "no", str(number)])
elif 12 <= number <= 15:
return "/".join([cls.BASE_URL, "newsletter", "draft", str(number)])
        elif 2 <= number <= 11:
return "/".join([cls.BASE_URL, "static", "files", "issue{}.html".format(str(number))])
else:
raise ValueError("Incorre page number: {}".format(number))

    def _get_all_news_blocks(self,
                             soup: BeautifulSoup) -> List[ItemTuple]:
        """Returns a sequence of (subtitle, body) blocks, each representing a single news item"""
        # TODO: add tags parsing
        # Each news entry is a "subtitle" div followed by a sibling div that holds the body text.
        subtitle_els = soup.find_all("div", "subtitle")
body_texts = [el.find_next_sibling("div") for el in subtitle_els]
return list(zip(subtitle_els, body_texts))

def _get_block_dict(self,
el: Tuple[BeautifulSoup,
BeautifulSoup]) -> ResourceDict:
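        # Every imported item is attached to a single shared Resource entry for importpython.com.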
resource, created = Resource.objects.get_or_create(
title='ImportPython',
link='http://importpython.com'
)

subtitle, body = el
title = subtitle.find("a").text
url = subtitle.find("a")['href']
text = body.text
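        # The parsed block is mapped straight onto the item fields; http_code is fixed at 200
        # since the issue page has just been fetched.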
return {
'title': title,
'link': url,
'raw_content': text,
'http_code': 200,
'content': text,
'description': text,
'resource': resource,
'language': 'en',
}

def get_blocks(self, url: str) -> List[ResourceDict]:
"""Get news dictionaries from the specified URL"""
content = self._get_url_content(url)
soup = BeautifulSoup(content, "lxml")
blocks = self._get_all_news_blocks(soup)
items = map(self._get_block_dict, blocks)
return list(items)


def _apply_rules_wrap(**kwargs):
# TODO: move this function into separate module
# as it is used in several parsing modules
rules = kwargs
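    # The returned closure captures the query_* keyword arguments and applies the
    # parsing rules (when provided) plus the video rules to every item dict.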

def _apply_rules(item: dict) -> dict:
item.update(
apply_parsing_rules(item, **rules)
if kwargs.get('query_rules') else {})
item.update(apply_video_rules(item))
return item

return _apply_rules


def main(url: str="", number: int="") -> None:
data = {
'query_rules': ParsingRules.objects.filter(is_activated=True).all(),
'query_sections': Section.objects.all(),
'query_statuses': [x[0] for x in ITEM_STATUS_CHOICES],
}
_apply_rules = _apply_rules_wrap(**data)

parser = ImportPythonParser()
if number and not url:
url = parser.get_issue_url(number)
if not number and not url:
url = parser.get_latest_issue_url()
blocks = parser.get_blocks(url)
with_rules_applied = map(_apply_rules, blocks)
for block in with_rules_applied:
save_item(block)


class Command(BaseCommand):
help = """This command parses importpython.com site\
and saves posts from it to the database.
You may either specify url by using --url argument, or
implicitly specify issue number by using --number argument."""

def add_arguments(self, parser):
parser.add_argument('--url', type=str, help='Url to parse data from')
parser.add_argument('--number',
type=int,
help='Number of "issue" to parse')

def handle(self, *args, **options):
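        # An explicit --url takes precedence; otherwise fall back to --number, and finally to the latest issue.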
        if options.get('url') is not None:
            main(url=options['url'])
        elif options.get('number') is not None:
            main(number=options['number'])
else:
main()
