# 1. Download and persist #
Please complete the code for the `load()`, `download()` and `persist()` methods of the `Document` class. What they do:
- the `download()` method downloads binary data from the document's URL and stores it in `self.content`. It returns `True` on success, otherwise `False`.
- `persist()` method saves `self.content` somewhere. We do it to avoid multiple downloads.
- `load()` method loads data from hard drive. Returns `True` for success.

The tests check that your code works.

In [0]:
import requests
import os
from urllib.parse import quote, urlsplit

class Document:
    """A remote resource identified by its URL, cached as a file on disk.

    The raw bytes of the resource are kept in ``self.content`` (``None`` until
    the document has been loaded or downloaded). Cached copies are stored in
    the current working directory under a name derived from the URL.
    """

    # Seconds to wait for the server before a download is treated as failed.
    DOWNLOAD_TIMEOUT = 30

    def __init__(self, url):
        self.url = url
        # Initialised up front so that persist() can be called safely even
        # when nothing has been loaded or downloaded yet.
        self.content = None

    def get(self):
        """Make ``self.content`` available, preferring the on-disk copy.

        Tries ``load()`` first; if that fails, downloads the document and
        persists it so later calls can skip the network.

        Raises:
            FileNotFoundError: the document is not cached and the download
                failed. The exception argument is the URL.
        """
        if self.load():
            return
        if not self.download():
            raise FileNotFoundError(self.url)
        self.persist()

    def __get_filename(self):
        """Return the cache file name for ``self.url``.

        A SHA-256 digest is used instead of the built-in ``hash()``: string
        hashes are randomised per interpreter process, so ``hash()`` would
        produce a different name on every run and the cache would never be
        found again.
        """
        import hashlib  # local import keeps this class self-contained

        # "surrogatepass" lets odd URLs scraped from pages be hashed instead
        # of raising UnicodeEncodeError.
        raw_url = self.url.encode("utf-8", errors="surrogatepass")
        return hashlib.sha256(raw_url).hexdigest()

    def download(self):
        """Fetch ``self.url`` over HTTP and store the body in ``self.content``.

        Returns:
            True if the server answered with a 2xx or 3xx status, else False
            (including when the request itself failed).
        """
        try:
            response = requests.get(self.url, timeout=self.DOWNLOAD_TIMEOUT)
            if response.status_code // 100 not in (2, 3):  # either 2.. or 3..
                return False
            self.content = response.content
            return True
        except Exception:
            # Deliberately broad: callers rely on a boolean result for any
            # kind of failure (bad URL, DNS error, timeout, ...).
            return False

    def persist(self):
        """Write ``self.content`` to the cache file.

        Returns:
            False if there is nothing to save, True once the file is written.
        """
        if self.content is None:  # nothing to save
            return False

        with open(self.__get_filename(), "wb") as cache_file:
            cache_file.write(self.content)
        return True

    def load(self):
        """Read the cached copy from disk into ``self.content``.

        Returns:
            True if the cache file was read, False if it is missing or
            cannot be opened.
        """
        try:
            with open(self.__get_filename(), "rb") as cache_file:
                self.content = cache_file.read()
        except OSError:
            return False
        return True

## 1.1. Tests ##

In [0]:
# Smoke test for Document: fetch a small text file and check its contents.
doc = Document('http://sprotasov.ru/data/iu.txt')

# get() uses the cached copy if load() succeeds; otherwise it downloads and persists.
doc.get()
assert doc.content, "Document download failed"
# str() is applied to doc.content before the substring check.
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document content error"

# After the first get() the document should be on disk, so load() must succeed.
doc.get()
assert doc.load(), "Load should return true for saved document"
assert "Code snippets, demos and labs for the course" in str(doc.content), "Document load from disk error"

# 2. Parse HTML #
The `BeautifulSoup` library is a de facto standard for parsing XML and HTML documents in Python. Use it to complete the `parse()` method that extracts the document contents. You should initialize:
- `self.anchors` list of tuples `('text', 'url')` met in a document. Be aware, there exist relative links. Use `urllib.parse.urljoin()` to fix this issue.
- `self.images` list of images met in a document. Again links can be relative.
- `self.text` should keep plain text of the document without scripts, tags, comments and so on. You can refer to [this stackoverflow answer](https://stackoverflow.com/a/1983219) for details.

In [0]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.parse
import re


class HtmlDocument(Document):
    """A ``Document`` whose content is HTML.

    ``parse()`` fills three attributes:
      * ``anchors`` - list of ``(link text, absolute URL)`` tuples,
      * ``images``  - list of absolute image URLs,
      * ``text``    - the text of the page that is visible to a reader.
    """

    # hrefs that start with one of these schemes are not web pages and are
    # skipped. The pattern is anchored at the start so that ordinary links
    # which merely contain "tel" or "sms" (e.g. "/hotel/") are kept.
    _SKIPPED_SCHEMES = re.compile(r"^\s*(?:tel|sms|mailto):", re.IGNORECASE)

    # Text nodes directly inside these elements are never shown to the user.
    _INVISIBLE_PARENTS = frozenset(
        ['style', 'script', 'head', 'title', 'meta', '[document]']
    )

    def __tag_visible(self, element):
        """Return True if the text node ``element`` is visible page text."""
        if isinstance(element, Comment):
            return False
        return element.parent.name not in self._INVISIBLE_PARENTS

    def parse(self):
        """Extract links, images and visible text from ``self.content``.

        Relative links and image sources are resolved against ``self.url``.
        """
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed and emits a warning.
        soup = BeautifulSoup(self.content, "html.parser")

        self.anchors = [
            (link.text, urllib.parse.urljoin(self.url, link['href']))
            for link in soup.find_all('a', href=True)
            if not self._SKIPPED_SCHEMES.match(link['href'])
        ]

        # <img> tags without a usable src are skipped; joining a missing src
        # with the base URL would otherwise report the page itself as an image.
        self.images = []
        for img in soup.find_all('img'):
            src = img.get('src')
            if src:
                self.images.append(urllib.parse.urljoin(self.url, src))

        # Visible text nodes are concatenated as-is, without separators.
        self.text = "".join(
            node for node in soup.find_all(string=True)
            if self.__tag_visible(node)
        )
        

## 2.1. Tests ##

In [0]:
# Smoke test for HtmlDocument: fetch (or load from cache) the page, then parse it.
doc = HtmlDocument("http://sprotasov.ru")
doc.get()
doc.parse()

# Each assertion checks one of the three attributes filled in by parse().
assert "тестирующий сервер codetest" in doc.text, "Error parsing text"
assert "http://sprotasov.ru/images/phone.png" in doc.images, "Error parsing images"
# anchors holds (text, url) tuples, so p[1] is the resolved link target.
assert any(p[1] == "http://university.innopolis.ru/" for p in doc.anchors), "Error parsing links"

# 3. Document analysis #
Complete the code for `HtmlDocumentTextData` class. Implement word (and sentence) splitting. Your `get_word_stats()` method should return `Counter` object. Don't forget to lowercase your words.

In [0]:
from collections import Counter
import re


class HtmlDocumentTextData:
    """Word-level statistics for the visible text of an HTML page.

    On construction the page at ``url`` is fetched (or loaded from cache) and
    parsed; the parsed ``HtmlDocument`` is available as ``self.doc``.
    """

    # Punctuation marks and ASCII digits that are treated as word separators.
    # Written as a raw string so the backslash escape is not flagged as an
    # invalid string escape by newer Python versions.
    _SEPARATOR_CHARS = re.compile(r'[!@#$.\-+*—,():0-9]')

    def __init__(self, url):
        self.doc = HtmlDocument(url)
        self.doc.get()
        self.doc.parse()

    def get_sentences(self):
        """Return the lowercased words of the document text, in order.

        NOTE: despite its name this method returns a flat list of words, not
        sentences: punctuation and digits are replaced with spaces and the
        text is then split on whitespace.
        """
        cleaned = self._SEPARATOR_CHARS.sub(' ', self.doc.text)
        # split() with no arguments also collapses runs of whitespace
        return [word.lower() for word in cleaned.split()]

    def get_word_stats(self):
        """Return a ``Counter`` mapping each lowercased word to its frequency."""
        return Counter(self.get_sentences())

## 3.1. Tests ##

In [49]:
# Smoke test: the constructor fetches and parses the page, so word stats are ready to query.
doc = HtmlDocumentTextData("https://university.innopolis.ru")

print(doc.get_word_stats().most_common(10))
# The filtered list is non-empty (truthy) only if 'иннополис' is among the ten most common words.
assert [x for x in doc.get_word_stats().most_common(10) if x[0] == 'иннополис'], 'иннополис sould be among most common'

[('и', 62), ('в', 39), ('по', 34), ('иннополис', 31), ('на', 26), ('ул', 25), ('января', 20), ('университет', 16), ('ост', 16), ('со', 14)]


# 4. Crawling #

The `crawl_generator()` method is given a starting URL (`source`) and the maximum search depth. It should return a **generator** of `HtmlDocumentTextData` objects (yield each document as soon as it is downloaded and parsed). You can benefit from Python's `yield obj_name` construct. Use the `anchors` field of the parsed document (`HtmlDocumentTextData.doc.anchors`) to go deeper.

In [0]:
from queue import Queue


class Crawler:
    """Breadth-first web crawler that yields parsed pages one at a time."""

    def crawl_generator(self, source, depth=1):
        """Yield an ``HtmlDocumentTextData`` for every reachable page.

        Crawling starts at ``source`` (level 1) and follows the anchors of
        each page while the level of the linked page does not exceed
        ``depth``. URL fragments (``#...``) are dropped and every URL is
        processed at most once. Pages that cannot be obtained are skipped.
        """
        pending = Queue()  # (url, level) pairs still to be processed
        pending.put((source, 1))
        seen = set()  # URLs that have already been taken from the queue

        while not pending.empty():
            url, level = pending.get()

            # Everything from the first '#' onwards is only a fragment
            url = url.partition("#")[0]
            if url in seen:
                continue
            seen.add(url)

            try:
                page = HtmlDocumentTextData(url)
                yield page

                next_level = level + 1
                if next_level <= depth:
                    # Schedule every link found on this page, one level deeper
                    for _link_text, link_url in page.doc.anchors:
                        pending.put((link_url, next_level))
            except FileNotFoundError:
                # The page could neither be loaded nor downloaded
                continue

## 4.1. Tests ##

In [51]:
# Crawl up to depth 2 from the English home page and accumulate word counts
# over the pages yielded by the generator.
crawler = Crawler()
counter = Counter()
print("starting")
for c in crawler.crawl_generator("https://university.innopolis.ru/en/", 2):
    print(c.doc.url)
    # Leave documents whose URL ends in one of these extensions out of the word
    # statistics (they have already been fetched and parsed by the crawler).
    if c.doc.url[-4:] in ('.pdf', '.mp3', '.avi', '.mp4', '.txt'):
        print("Skipping", c.doc.url)
        continue
    counter.update(c.get_word_stats())
    print(len(counter), "distinct word(s) so far")
    
print("Done")

print(counter.most_common(20))
# The filtered list is non-empty (truthy) only if 'innopolis' is among the twenty most common words.
assert [x for x in counter.most_common(20) if x[0] == 'innopolis'], 'innopolis sould be among most common'

starting
https://university.innopolis.ru/en/
408 distinct word(s) so far
https://university.innopolis.ru/
927 distinct word(s) so far
https://university.innopolis.ru/en/?special=Y
938 distinct word(s) so far
https://university.innopolis.ru/en/about/
1083 distinct word(s) so far
https://university.innopolis.ru/en/about/city
1148 distinct word(s) so far
https://university.innopolis.ru/en/about/board
1210 distinct word(s) so far
https://university.innopolis.ru/en/about/job
1475 distinct word(s) so far
https://university.innopolis.ru/en/about/structure
1613 distinct word(s) so far
https://university.innopolis.ru/en/about/teaching-composition/
1718 distinct word(s) so far
https://university.innopolis.ru/upload/iblock/026/IU_AR2018_eng.pdf
Skipping https://university.innopolis.ru/upload/iblock/026/IU_AR2018_eng.pdf
https://university.innopolis.ru/en/education/
1750 distinct word(s) so far
https://university.innopolis.ru/en/education/bachelor/
1781 distinct word(s) so far
https://university.i