In [3]:
from os import path, walk
from collections import defaultdict
from re import compile, sub

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

from io import StringIO

from pickle import dump, load

In [3]:
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer.

    Args:
        path: Filesystem path of the PDF to read. (Inside this function the
            parameter shadows the ``path`` module imported from ``os``.)

    Returns:
        str: Everything the ``TextConverter`` wrote while the pages were
        processed.

    Raises:
        Whatever ``open`` or pdfminer raise; nothing is caught here. The file
        handle, the converter and the text buffer are released even when an
        error interrupts the extraction.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    with open(path, 'rb') as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Same arguments as before: empty page-number set, maxpages=0,
            # empty password, caching on.
            # NOTE(review): presumably "all pages, no limit" -- confirm against pdfminer docs.
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password='',
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
            # Read the buffer before the `with` block closes it.
            return retstr.getvalue()
        finally:
            device.close()

Парсим AINL-2017

In [37]:
# Texts extracted from every PDF found under ../data/AINL/2017.
# NOTE(review): the variable name does not match the directory being read; it
# is kept because the pickling cell that follows refers to it.
dialogue_2004 = []

# os.walk's second positional argument is `topdown`, not a file mode, so it is omitted.
for root, dirs, files in walk(path.join('..', 'data', 'AINL', '2017')):
    for file in files:
        if file == '.DS_Store':
            continue
        pdf_path = path.join(root, file)
        try:
            dialogue_2004.append(convert_pdf_to_txt(pdf_path))
        except Exception as error:
            # Best effort: an unreadable PDF is skipped, but reported rather
            # than silently dropped. Catching Exception (not a bare except)
            # keeps Ctrl-C / kernel interrupts working.
            print('Skipping {}: {!r}'.format(pdf_path, error))

In [38]:
# Persist the list of extracted texts to data.pickle.
with open('data.pickle', 'wb') as pickle_file:
    dump(dialogue_2004, pickle_file)

In [39]:
# Read the pickled list back from data.pickle into `texts`.
with open('data.pickle', 'rb') as pickle_file:
    texts = load(pickle_file)

In [40]:
# Write every text to its own file, named after its position in `texts`.
for index, document in enumerate(texts):
    target = '../prepared-data/AINL/2017/{}.txt'.format(index)
    with open(target, 'w') as out_file:
        out_file.write(document)

Парсим остальное:

In [4]:
# Compiled pattern with two capture groups: a run of ASCII letters immediately
# followed by a run of digits.
# NOTE(review): not referenced anywhere else in the visible part of the notebook.
split_to_nums = compile("([a-zA-Z]+)([0-9]+)")

In [5]:
def remove_bad(string):
    """Flatten ``string`` onto one line and drop tab / no-break-space characters.

    Newlines become single spaces, every tab and U+00A0 is deleted, and the
    result is stripped of leading and trailing whitespace.
    """
    single_line = string.replace('\n', ' ')
    without_junk = sub('\t|\xa0', '', single_line)
    return without_junk.strip()

In [6]:
def encounter_utility_file(filename):
    """Tell whether ``filename`` should be skipped as a non-paper utility file.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        bool: True when the name contains 'DS_Store' or 'zip' as a substring,
        False otherwise (previously the negative case fell through and
        returned None).
    """
    return any(marker in filename for marker in ('DS_Store', 'zip'))

Парсим Диалог после 2010:

In [7]:
# String constants shared by the parsing cells. Most of them are used as keys
# of the dictionaries built by the parse_* functions in this notebook.
TITLE = 'title'
AUTHOR = 'author'
NAME = 'name'
AFFILIATION = 'affiliation'
EMAIL = 'email'
TEXT = 'text'
REFERENCES = 'references'
# Marker strings (Russian for "LITERATURE" and "Keywords").
# NOTE(review): REFERENCES, REF_MARKER and KEYWORDS_MARKER are not referenced
# in the visible part of the notebook.
REF_MARKER = '\xa0 ЛИТЕРАТУРА \xa0'
KEYWORDS_MARKER = 'Ключевые слова'
KEYWORDS = 'keywords'
ABSTRACT = 'abstract'
CONFERENCE = 'conference'
YEAR = 'year'

ПАРСИМ ДИАЛОГ

In [15]:
# Parsed Dialogue papers; the parse_* functions append one dict per paper here.
result = []

# NOTE: add_paper_data is defined in a later cell of this notebook; that cell
# has to be executed before this one.
dialogue_root = path.join('..', 'prepared-data', 'Dialogue')

# os.walk's second positional argument is `topdown`, not a file mode, so it is omitted.
for root, dirs, files in walk(dialogue_root):
    # The year is the first directory level below the conference directory.
    year = path.relpath(root, dialogue_root).split(path.sep)[0]
    if not year.isdecimal():
        # Files that are not inside a numeric year directory cannot be
        # dispatched by year (add_paper_data calls int(year)), so skip them
        # instead of crashing the whole run.
        continue
    conference = 'Dialogue'
    for file in files:
        # Check the name first so utility files are never opened.
        if encounter_utility_file(file):
            continue
        with open(path.join(root, file), 'r') as input_stream:
            try:
                add_paper_data(input_stream.read(), conference, year)
            except (UnicodeDecodeError, IndexError):
                # Undecodable files and papers that do not match the expected
                # layout are skipped.
                continue

In [12]:
def add_paper_data(content, conference, year):
    """Dispatch one Dialogue paper to the parser matching its year.

    Years strictly greater than 2007 go to ``parse_dialogue_2007_plus``;
    2007 and earlier go to ``parse_dialogue_until_2007``. ``year`` must be
    convertible with ``int``.
    """
    parser = parse_dialogue_2007_plus if int(year) > 2007 else parse_dialogue_until_2007
    parser(content, conference, year)

In [14]:
def parse_dialogue_until_2007(content, conference, year):
    """Parse one prepared Dialogue paper (2007 or earlier) and append it to ``result``.

    ``content`` is split on triple newlines: section 0 is the title, section 1
    the author list, section 2 the abstract, and the remaining sections the
    body. Authors are separated by blank lines.

    Args:
        content: Full text of the prepared paper file.
        conference: Conference label stored with the record.
        year: Year label stored with the record.

    Raises:
        IndexError: when the text has fewer than three sections or an author
            entry has fewer than three lines (callers in this notebook catch
            this to skip the paper).
    """
    sections = content.split('\n\n\n')

    authors = []
    for author in sections[1].split('\n\n'):
        # Each author chunk was produced by splitting on '\n\n', so splitting
        # it on '\n\n' again (as the previous code did) can never yield the
        # three fields and always raised IndexError. Split on single newlines.
        # NOTE(review): assumes the line order name / affiliation / email, as
        # in parse_aist and parse_ainl -- confirm against the prepared files.
        fields = author.split('\n')
        authors.append({
            NAME: fields[0],
            EMAIL: fields[2],
            AFFILIATION: fields[1],
        })

    data = {
        AUTHOR: authors,
        TEXT: {
            TITLE: sections[0],
            ABSTRACT: sections[2],
            TEXT: sections[3:],
        },
        CONFERENCE: conference,
        YEAR: year,
    }
    result.append(data)

In [10]:
def parse_dialogue_2007_plus(content, conference, year):
    """Parse one prepared Dialogue paper (after 2007) and append it to ``result``.

    Sections are separated by triple newlines: title, authors, abstract, then
    the body. Each author entry is read as ``name(email)affiliation``: the
    name is the text before the first '(', the e-mail the text between the
    first '(' and the first ')', the affiliation the text after the first ')'.
    An IndexError propagates when a section or the closing ')' is missing.
    """
    sections = content.split('\n\n\n')

    def describe(author):
        # Build the per-author dict in the key order name, email, affiliation.
        open_paren = author.find('(')
        close_paren = author.find(')')
        return {
            NAME: author.partition('(')[0],
            EMAIL: author[open_paren + 1:close_paren],
            AFFILIATION: author.split(')')[1],
        }

    people = [describe(entry) for entry in sections[1].split('\n\n')]
    paper_text = {
        TITLE: sections[0],
        ABSTRACT: sections[2],
        TEXT: sections[3:],
    }
    result.append({
        AUTHOR: people,
        TEXT: paper_text,
        CONFERENCE: conference,
        YEAR: year,
    })

In [18]:
# Persist the parsed records collected in `result` to dialogue.pickle.
with open('dialogue.pickle', 'wb') as pickle_file:
    dump(result, pickle_file)

ПАРСИМ АИСТ

In [86]:
def parse_aist(content, conference, year):
    """Parse one AIST paper and append the resulting record to ``result``.

    Sections are separated by triple newlines, in the order title, authors,
    abstract, keywords, body. Authors are separated by blank lines and each
    author entry has its name on line 0, affiliation on line 1 and e-mail on
    line 2. An IndexError propagates when a section or an author line is
    missing.
    """
    sections = content.split('\n\n\n')

    authors = []
    for entry in sections[1].split('\n\n'):
        lines = entry.split('\n')
        authors.append({NAME: lines[0], EMAIL: lines[2], AFFILIATION: lines[1]})

    body = {
        TITLE: sections[0],
        ABSTRACT: sections[2],
        KEYWORDS: sections[3],
        TEXT: sections[4:],
    }
    result.append({AUTHOR: authors, TEXT: body, CONFERENCE: conference, YEAR: year})
    
# Parsed AIST papers; parse_aist appends one dict per paper here.
result = []

# os.walk's second positional argument is `topdown`, not a file mode, so it is omitted.
for root, dirs, files in walk(path.join('..', 'prepared-data', 'AIST')):
    for file in files:
        # Check the name first so utility files are never opened.
        if encounter_utility_file(file):
            continue
        conference = 'AIST'
        # The year is the file name up to its first dot.
        year = file.split('.')[0]
        with open(path.join(root, file), 'r') as input_stream:
            try:
                # Papers inside one file are separated by '=='.
                papers = input_stream.read().split('==')
            except UnicodeDecodeError:
                continue
        for paper in papers:
            # Guard each paper separately: one chunk that does not match the
            # expected layout must not discard the remaining papers of the file.
            try:
                parse_aist(paper, conference, year)
            except IndexError:
                continue

# Persist the parsed AIST records to aist.pickle.
with open('aist.pickle', 'wb') as pickle_file:
    dump(result, pickle_file)

ПАРСИМ AINL

In [79]:
def parse_ainl(content, conference, year):
    """Parse one AINL paper and append the resulting record to ``result``.

    Sections are separated by triple newlines: title, authors, abstract, and
    everything after that is stored as the body (no separate keywords field
    is extracted). Author entries are separated by blank lines; within an
    entry line 0 is the name, line 1 the affiliation and line 2 the e-mail.
    An IndexError propagates when a section or an author line is missing.
    """
    chunks = content.split('\n\n\n')

    def to_author(entry):
        # Key order is name, email, affiliation.
        rows = entry.split('\n')
        return {NAME: rows[0], EMAIL: rows[2], AFFILIATION: rows[1]}

    record = {}
    record[AUTHOR] = [to_author(entry) for entry in chunks[1].split('\n\n')]
    record[TEXT] = {TITLE: chunks[0], ABSTRACT: chunks[2], TEXT: chunks[3:]}
    record[CONFERENCE] = conference
    record[YEAR] = year
    result.append(record)
    
# Parsed AINL papers; parse_ainl appends one dict per paper here.
result = []

ainl_root = path.join('..', 'prepared-data', 'AINL')

# os.walk's second positional argument is `topdown`, not a file mode, so it is omitted.
for root, dirs, files in walk(ainl_root):
    relative_dir = path.relpath(root, ainl_root)
    if relative_dir == path.curdir:
        # Files lying directly in the conference directory have no year
        # directory; the previous path-splitting lookup raised an uncaught
        # IndexError for them.
        continue
    conference = 'AINL'
    # The year is the first directory level below the conference directory.
    year = relative_dir.split(path.sep)[0]
    for file in files:
        # Check the name first so utility files are never opened.
        if encounter_utility_file(file):
            continue
        with open(path.join(root, file), 'r') as input_stream:
            try:
                # Papers inside one file are separated by '=='.
                papers = input_stream.read().split('==')
            except UnicodeDecodeError:
                continue
        for paper in papers:
            # Guard each paper separately: one chunk that does not match the
            # expected layout must not discard the remaining papers of the file.
            try:
                parse_ainl(paper, conference, year)
            except IndexError:
                continue

# Persist the parsed AINL records to ainl.pickle.
with open('ainl.pickle', 'wb') as pickle_file:
    dump(result, pickle_file)

СТАРЫЙ КОД ПАРСИНГА (НЕВАЛИДЕН)

In [9]:
def search_files(root):
    """Legacy crawler: pass every non-utility file under ``root`` to add_paper_data.

    Each file is opened in binary mode, decoded as UTF-8 and split into lines;
    the conference and year labels are taken from the path components that
    follow the first 'data/'.

    NOTE(review): the notebook labels this cell as old, invalid code. As
    written it hands add_paper_data a list of lines, whereas the parse_*
    functions in this notebook call ``content.split`` on their input, and the
    two statements after ``continue`` in the except branch can never run.
    """
    # The second positional argument of os.walk is `topdown`; 'r' only acts as a truthy value.
    for root, dirs, files in walk(root, 'r'):
        for file in files:
            with open(path.join(root, file), 'rb') as input_stream:
                if encounter_utility_file(file):
                    continue
                # First path component after the first 'data/'.
                conference = str(root + file).split('data/')[1].split('/')[0]
                # Second path component after the first 'data/'.
                year = str(root).split('data/')[1].split('/')[1]
                try:
                    add_paper_data(input_stream.read().decode('utf-8').split('\n'), conference, year)
                except UnicodeDecodeError:
                    continue # remove
                    # Unreachable; PdfFileReader and `data` are not defined in
                    # the visible part of the notebook.
                    reader = PdfFileReader(input_stream)
                    data[conference][year].append(reader.getPage(0).extractText())

In [159]:
from pickle import dump

# Save the first ten records of `result` as a small sample file.
# NOTE(review): `result` is reassigned by several cells above, so the contents
# of this file depend on which of them ran last -- confirm before relying on it.
with open('dialogue_2010-2017_chunk.pickle', 'wb') as chunk_file:
    dump(result[:10], chunk_file)