In [1]:
import urllib.request
import re
import os
import shutil
from bs4 import BeautifulSoup


def download_page(add_url):
    """Download one page of the evening-kazan.ru site and parse it.

    Args:
        add_url: path (optionally with a query string) appended to the site
            root 'http://www.evening-kazan.ru'; '' requests the root page.

    Returns:
        The response body parsed by BeautifulSoup with the 'html.parser'
        backend.

    Raises:
        urllib.error.URLError: propagated from urlopen when the request fails.
    """
    link = 'http://www.evening-kazan.ru' + add_url
    # The context manager closes the HTTP response even when read() fails;
    # previously the response object was never closed.
    with urllib.request.urlopen(link) as response:
        raw_html = response.read()
    return BeautifulSoup(raw_html, 'html.parser')


def get_href(soup):
    """Collect article links from a listing page.

    Looks at every <div class="views-field-title"> and takes the href of the
    anchor nested as ``div.span.a``.

    Args:
        soup: parsed listing page exposing ``find_all`` (BeautifulSoup-like).

    Returns:
        list: href values in the order the divs are returned by ``find_all``.
        Divs without a span, without an anchor, or whose anchor has no
        (or an empty) href are skipped.
    """
    hrefs = []
    for div in soup.find_all('div', **{'class': 'views-field-title'}):
        # A title block without the expected span/anchor used to raise
        # AttributeError and abort the whole crawl; skip it instead.
        span = div.span
        anchor = span.a if span is not None else None
        href = anchor.get('href') if anchor is not None else None
        # A missing href would later break the URL concatenation in the caller.
        if href:
            hrefs.append(href)
    return hrefs

In [2]:
def get_filename(file_dict, publ_year, month):
    """Return the next sequential article number for a year/month bucket.

    The counter stored in ``file_dict`` under the key ``publ_year + month``
    is incremented (a missing key starts at 1) and the new value is returned
    as a string. ``file_dict`` is modified in place.
    """
    bucket = publ_year + month
    file_dict[bucket] = file_dict.get(bucket, 0) + 1
    return str(file_dict[bucket])


def get_info(soup, href):
    """Collect all metadata for one article page.

    Args:
        soup: parsed article page exposing ``find`` (BeautifulSoup-like).
        href: site-relative link of the article; appended to
            'http://www.evening-kazan.ru' to build the source URL.

    Returns:
        An 18-element tuple, in this order:
        (path, author, header, created, sphere, topic, style, audience_age,
        audience_level, audience_size, source, publication, publ_year,
        medium, country, region, language, month).
        ``created`` is the date with a four-digit year, ``month`` has no
        leading zero, and ``path`` is 'plain\\<publ_year>\\<month>'.
        ``author`` and ``topic`` are '' when the page has no such element.

    Raises:
        AttributeError: if the heading block, the <h1> or the date element
            is missing from the page.
    """
    heading = soup.find('div', **{'class': 'heading--meta-wrap'})

    # Guarded the same way as the topic below: a page without an author
    # block previously raised AttributeError and stopped the crawl.
    author_tag = heading.find('div', **{'class': 'author heading--meta'})
    author = author_tag.get_text() if author_tag is not None else ''
    header = soup.find('h1').get_text()

    created = heading.find('div', **{'class': 'submitted heading--meta'})
    created = created.get_text()
    # Dates are stored as dd.mm.yy, so characters 6-7 hold the two-digit
    # year; '20' is prepended to make it four digits.
    publ_year = '20' + created[6:8]
    created = created[:6] + publ_year

    sphere = 'публицистика'

    topic_tag = heading.find('a', rel='tag')
    topic = topic_tag.get_text() if topic_tag is not None else ''

    style = 'нейтральный'
    audience_age = 'н-возраст'
    audience_level = 'н-уровень'
    audience_size = 'городская'
    source = 'http://www.evening-kazan.ru' + href
    publication = 'Вечерняя Казань'
    medium = 'газета'
    country = 'Россия'
    region = 'республика Татарстан'
    language = 'ru'

    # Month without a leading zero is used for the directory name.
    month = created[3:5]
    if month.startswith('0'):
        month = month[1]

    path = 'plain\\%s\\%s' % (publ_year, month)
    return path, author, header, created, sphere, topic, style, audience_age, \
        audience_level, audience_size, source, publication, publ_year, \
        medium, country, region, language, month

In [3]:
def get_text(soup):
    """Return the plain text of the article body.

    The text of every <p> inside the <div class="content"> located by
    ``soup.find`` is concatenated in order, with nothing inserted between
    paragraphs.
    """
    content_div = soup.find('div', **{'class': 'content'})
    return ''.join(
        paragraph.get_text() for paragraph in content_div.find_all('p')
    )

In [6]:
def add_metadate(metadate):
    """Append one tab-separated metadata row to the corpus metadata file.

    Args:
        metadate: sequence of strings; the items are joined with tabs and
            written as one line to 'Вечерняя Казань\\metadata.csv', which is
            opened in append mode with UTF-8 encoding.

    Raises:
        TypeError: if an item of ``metadate`` is not a string.
    """
    # The file used to be opened on every call and never closed; the context
    # manager flushes and closes it as soon as the row is written.
    with open('Вечерняя Казань\\metadata.csv', 'a', encoding='utf-8') as f:
        f.write('\t'.join(metadate) + '\n')
    

def text_proc(outp_xml, outp_plain, filename):
    """Create the morphological analysis of the text stored in input.txt.

    mystem is invoked twice through the shell, both times on 'input.txt':
    once with ``--format xml`` producing ``filename + '.xml'`` and once
    without it producing ``filename + '.txt'``. Each result file is created
    in the current directory and then moved: the .xml file to ``outp_xml``
    and the .txt file to ``outp_plain``.
    """
    runs = (
        ('.xml', r'\\mystem.exe -cgild --eng-gr --format xml input.txt ',
         outp_xml),
        ('.txt', r'\\mystem.exe -cgild --eng-gr input.txt ', outp_plain),
    )
    for extension, command_prefix, destination in runs:
        result_name = filename + extension
        # The output file is created (and truncated) before mystem runs and
        # the handle stays open until the command has returned.
        with open(result_name, 'tw', encoding='utf-8'):
            os.system(command_prefix + result_name)
        shutil.move(result_name, destination)

In [7]:
def make_catalog(soup, path, filename, author, header, created, topic, source):
    """Store one article on disk and run the morphological analysis on it.

    Args:
        soup: parsed article page, passed to ``get_text``.
        path: directory of the plain text relative to the corpus root
            'Вечерняя Казань'; it contains the component 'plain'.
        filename: file name without extension.
        author, header, created, topic, source: values written into the
            '@au', '@ti', '@da', '@topic' and '@url' header lines of the
            plain-text file.

    Side effects:
        Writes the article text to 'input.txt' in the current directory,
        writes the annotated article to ``<path>\\<filename>.txt``, creates
        the parallel 'mystem-xml' and 'mystem-plain' directories and calls
        ``text_proc`` to fill them.
    """
    text = get_text(soup)
    path = 'Вечерняя Казань\\' + path
    os.makedirs(path, exist_ok=True)

    # input.txt must be flushed and closed before text_proc hands it to
    # mystem. The original had `f1.close` without parentheses, which never
    # closed the file, so mystem could read an empty or truncated input.
    with open('input.txt', 'w', encoding='utf-8') as mystem_input:
        mystem_input.write(text)

    with open(path + '\\' + filename + '.txt', 'w',
              encoding='utf-8') as article_file:
        article_file.write('@au %s\n@ti %s\n@da %s\n@topic %s\n@url %s\n%s' %
                           (author, header, created, topic, source, text))

    # The analysed versions live in directory trees parallel to 'plain'.
    path_xml = re.sub('plain', 'mystem-xml', path)
    os.makedirs(path_xml, exist_ok=True)

    path_plain = re.sub('plain', 'mystem-plain', path)
    os.makedirs(path_plain, exist_ok=True)

    text_proc(path_xml, path_plain, filename)

In [8]:
def main(max_pages=990):
    """Crawl the front-page listing and build the corpus on disk.

    Args:
        max_pages: number of listing pages ('/frontpage?page=0' up to
            '/frontpage?page=<max_pages - 1>') to walk through.

    For every article link not seen before, the article page is downloaded,
    its metadata row is appended to the metadata file, and the text plus its
    mystem analyses are stored through ``make_catalog``.

    Raises:
        FileExistsError: if the directory 'Вечерняя Казань' already exists.
    """
    os.mkdir('Вечерняя Казань')

    file_dict = {}
    href_set = set()  # guarantees that every article is visited only once
    for page_index in range(max_pages):  # "turn over" the listing pages
        # The original also downloaded the root page when page_index was 0,
        # but that result was overwritten by this request before being used,
        # so the redundant download is dropped.
        soup = download_page('/frontpage?page=%d' % page_index)

        for href in get_href(soup):
            if href in href_set:
                continue
            href_set.add(href)
            href_soup = download_page(href)
            meta = get_info(href_soup, href)
            add_metadate(meta)

            # Positions inside the tuple returned by get_info.
            path, author, header, created = meta[:4]
            topic, source, publ_year, month = \
                meta[5], meta[10], meta[12], meta[17]

            filename = get_filename(file_dict, publ_year, month)
            make_catalog(href_soup, path, filename, author,
                         header, created, topic, source)

    # input.txt exists only if at least one article was processed.
    if os.path.exists('input.txt'):
        os.remove('input.txt')
    # The original ended with `f.close()`, but no `f` is defined in this
    # function, so it raised NameError after the crawl had finished.

In [None]:
# Start the crawl only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
