In [1]:
from os import path, walk, makedirs
from collections import defaultdict
from re import compile, sub

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

from io import StringIO

from pickle import dump, load

### Конвертация .pdf в сырой текст

In [2]:
def convert_pdf_to_txt(path):
    """Extract the text of a PDF file with pdfminer and return it as one string.

    Args:
        path: Filesystem path of the PDF file. Inside this function the name
            shadows the ``path`` module imported from ``os``.

    Returns:
        The text of all pages, as collected by pdfminer's ``TextConverter``.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Whatever pdfminer raises while parsing is propagated; the
            file, the converter and the buffer are closed in that case too.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # `with` guarantees the PDF handle is released even when a page fails.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = PDFPage.get_pages(
                fp,
                set(),        # empty page-number set: no page filtering
                maxpages=0,   # 0: no page limit
                password='',
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the `finally` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()

### Конвертация данных конференций из .pdf в текстовые файлы:

In [21]:
# Conference folder names; the parsing cells pick one by index into this list.
conferences = ['Dialogue', 'AIST', 'AINL', 'RuSSIR']
# File name that convert() skips when walking the PDF folders.
ds_store = '.DS_Store'
# Folder (placed under '..') that receives the converted text files.
saving_dir = 'prepared-data'

In [18]:
def convert():
    """Convert every conference PDF into a numbered plain-text file.

    For each conference in ``conferences`` and each year from 2000 to 2017,
    walks ``../data/<conference>/<year>``, converts every file except the one
    named ``ds_store`` with ``convert_pdf_to_txt`` and writes the results to
    ``../<saving_dir>/<conference>/<year>/<n>.txt``, where ``n`` numbers the
    successfully converted files of that year starting from 0.

    A file that fails to convert is reported and skipped, so one broken PDF
    does not abort the whole run. The output folder is created only when at
    least one paper of that year was converted.
    """
    for conference in conferences:
        for year in range(2000, 2018):
            source_dir = path.join('..', 'data', conference, str(year))
            papers = []
            for root, _dirs, files in walk(source_dir):
                for file in files:
                    if file == ds_store:
                        continue
                    pdf_path = path.join(root, file)
                    try:
                        papers.append(convert_pdf_to_txt(pdf_path))
                    except Exception as error:
                        # Best effort: keep going, but say which file was lost.
                        # `Exception` (not a bare except) lets Ctrl-C through.
                        print('Skipping {}: {!r}'.format(pdf_path, error))
            if not papers:
                continue
            directory = path.join('..', saving_dir, conference, str(year))
            makedirs(directory, exist_ok=True)
            for paper_id, paper in enumerate(papers):
                # Written with the platform default encoding, the same default
                # the parsing cells use when reading these files back.
                with open(path.join(directory, str(paper_id) + '.txt'), 'w') as f:
                    f.write(paper)

In [19]:
# convert()

### Парсинг полученных текстовых файлов

Подготовка.

In [23]:
def remove_bad(string):
    """Flatten a text fragment onto one line and drop unwanted whitespace.

    Newlines become single spaces, tabs and non-breaking spaces are deleted,
    and leading/trailing whitespace is stripped from the result.
    """
    single_line = ' '.join(string.split('\n'))
    cleaned = sub('\t|\xa0', '', single_line)
    return cleaned.strip()

In [24]:
def encounter_utility_file(filename):
    """Tell whether ``filename`` is a utility file that should not be parsed.

    Args:
        filename: File name (or path) to check.

    Returns:
        True if the name contains ``'DS_Store'`` or ``'zip'`` as a substring,
        False otherwise (always a bool, never an implicit ``None``).
    """
    return any(name in filename for name in ('DS_Store', 'zip'))

In [25]:
# Keys of the per-paper dictionaries built by the parse_* functions.
TITLE = 'title'
AUTHOR = 'author'
NAME = 'name'
AFFILIATION = 'affiliation'
EMAIL = 'email'
TEXT = 'text'
# REFERENCES and the two *_MARKER strings are not used by any cell visible in
# this notebook; presumably meant for cutting out the reference list and the
# keywords -- TODO confirm or remove.
REFERENCES = 'references'
REF_MARKER = '\xa0 ЛИТЕРАТУРА \xa0'
KEYWORDS_MARKER = 'Ключевые слова'
KEYWORDS = 'keywords'
ABSTRACT = 'abstract'
CONFERENCE = 'conference'
YEAR = 'year'

### Парсим Диалог

Из-за разных форматов статей в разные годы конференции я сделал две отдельные функции парсинга для двух диапазонов лет (2000–2007 и 2008–2017).

In [52]:
def add_paper_data(content, conference, year):
    """Route one Dialogue paper to the parser that matches its year.

    Years after 2007 are handled by ``parse_dialogue_2007_plus``; 2007 and
    earlier by ``parse_dialogue_until_2007``. ``year`` must be convertible
    with ``int()``.
    """
    parser = parse_dialogue_2007_plus if int(year) > 2007 else parse_dialogue_until_2007
    parser(content, conference, year)

In [53]:
def parse_dialogue_until_2007(content, conference, year):
    """Parse one Dialogue paper (2007 or earlier) and append it to ``result``.

    The text is cut into chunks on triple newlines: chunk 0 is stored as the
    title, chunk 1 is the author section, chunk 2 is stored as the abstract
    and the remaining chunks as the body. The author section is cut on blank
    lines into one block per author; the lines of a block are read as name,
    affiliation and e-mail, in that order.

    Args:
        content: Full text of the paper.
        conference: Conference name stored in the record.
        year: Year stored in the record.

    Raises:
        IndexError: If the text has fewer than three chunks or an author block
            has fewer than three lines. Nothing is appended in that case.
    """
    splits = content.split('\n\n\n')
    authors = []
    for author in splits[1].split('\n\n'):
        # A block produced by split('\n\n') cannot contain '\n\n' itself, so
        # splitting it on '\n\n' again yields a single piece and index 2 can
        # never exist. Split on single newlines, as the other parsers do.
        fields = author.split('\n')
        authors.append({
            NAME: fields[0],
            EMAIL: fields[2],
            AFFILIATION: fields[1],
        })
    data = {
        AUTHOR: authors,
        TEXT: {
            TITLE: splits[0],
            ABSTRACT: splits[2],
            TEXT: splits[3:],
        },
        CONFERENCE: conference,
        YEAR: year,
    }
    result.append(data)

In [54]:
def parse_dialogue_2007_plus(content, conference, year):
    """Parse one Dialogue paper (after 2007) and append it to ``result``.

    The text is cut into chunks on triple newlines: chunk 0 is stored as the
    title, chunk 1 is the author section, chunk 2 is stored as the abstract
    and the remaining chunks as the body. Each blank-line-separated block of
    the author section is read as ``name (email) affiliation``.

    Raises:
        IndexError: If the text has fewer than three chunks or an author block
            contains no ``')'``. Nothing is appended in that case.
    """
    chunks = content.split('\n\n\n')
    people = []
    for block in chunks[1].split('\n\n'):
        # Text before the first '(' / between the first '(' and the first ')'
        # / following the first ')' (up to the next ')', if any).
        before_paren = block.split('(')[0]
        inside_parens = block[block.find('(') + 1:block.find(')')]
        after_paren = block.split(')')[1]
        people.append({
            NAME: before_paren,
            EMAIL: inside_parens,
            AFFILIATION: after_paren,
        })
    record = {
        AUTHOR: people,
        TEXT: {
            TITLE: chunks[0],
            ABSTRACT: chunks[2],
            TEXT: chunks[3:],
        },
        CONFERENCE: conference,
        YEAR: year,
    }
    result.append(record)

In [59]:
# Parse every converted Dialogue paper; the parsers append their records here.
result = []

# walk() takes only the top folder here: its second positional parameter is
# `topdown`, not a file mode.
for root, _dirs, files in walk(path.join('..', saving_dir, conferences[0])):
    # The last component of the folder path is used as the year.
    year = path.basename(root)
    for file in files:
        # Filter first, so utility files are never opened.
        if encounter_utility_file(file):
            continue
        with open(path.join(root, file), 'r') as input_stream:
            try:
                add_paper_data(input_stream.read(), conferences[0], year)
            except (UnicodeDecodeError, IndexError):
                # Undecodable file or unexpected layout: skip this paper.
                continue

In [18]:
# At this point `result` holds the Dialogue records parsed by the loop above,
# so the file is named after conferences[0] ('Dialogue'), not conferences[2].
with open('{}.pickle'.format(conferences[0]), 'wb') as f:
    dump(result, f)

### Парсим AIST

In [63]:
def parse_aist(content, conference, year):
    """Parse one AIST paper and append its record to ``result``.

    The text is cut into chunks on triple newlines: chunk 0 is stored as the
    title, chunk 1 is the author section, chunk 2 the abstract, chunk 3 the
    keywords and the remaining chunks the body. Each blank-line-separated
    block of the author section is read line by line as name, affiliation
    and e-mail.

    Raises:
        IndexError: If the text has fewer than four chunks or an author block
            has fewer than three lines. Nothing is appended in that case.
    """
    chunks = content.split('\n\n\n')
    people = []
    for block in chunks[1].split('\n\n'):
        lines = block.split('\n')
        people.append({
            NAME: lines[0],
            EMAIL: lines[2],
            AFFILIATION: lines[1],
        })
    record = {
        AUTHOR: people,
        TEXT: {
            TITLE: chunks[0],
            ABSTRACT: chunks[2],
            KEYWORDS: chunks[3],
            TEXT: chunks[4:],
        },
        CONFERENCE: conference,
        YEAR: year,
    }
    result.append(record)
    
# Parse every converted AIST file; parse_aist appends its records here.
result = []

# walk() takes only the top folder here: its second positional parameter is
# `topdown`, not a file mode.
for root, _dirs, files in walk(path.join('..', saving_dir, conferences[1])):
    for file in files:
        # Filter first, so utility files are never opened.
        if encounter_utility_file(file):
            continue
        # The part of the file name before the first dot is used as the year.
        year = file.split('.')[0]
        with open(path.join(root, file), 'r') as input_stream:
            try:
                # One file holds several papers separated by '=='.
                papers = input_stream.read().split('==')
            except UnicodeDecodeError:
                continue
        for paper in papers:
            try:
                parse_aist(paper, conferences[1], year)
            except IndexError:
                # Unexpected layout: skip this paper only, not the papers
                # that follow it in the same file.
                continue

with open('{}.pickle'.format(conferences[1]), 'wb') as f:
    dump(result, f)

### Парсим AINL

In [50]:
def parse_ainl(content, conference, year):
    """Parse one AINL paper and append its record to ``result``.

    The text is cut into chunks on triple newlines: chunk 0 is stored as the
    title, chunk 1 is the author section, chunk 2 the abstract and all
    remaining chunks the body (keywords are not stored separately). Each
    blank-line-separated block of the author section is read line by line as
    name, affiliation and e-mail.

    Raises:
        IndexError: If the text has fewer than three chunks or an author block
            has fewer than three lines. Nothing is appended in that case.
    """
    chunks = content.split('\n\n\n')
    people = []
    for block in chunks[1].split('\n\n'):
        lines = block.split('\n')
        people.append({
            NAME: lines[0],
            EMAIL: lines[2],
            AFFILIATION: lines[1],
        })
    record = {
        AUTHOR: people,
        TEXT: {
            TITLE: chunks[0],
            ABSTRACT: chunks[2],
            TEXT: chunks[3:],
        },
        CONFERENCE: conference,
        YEAR: year,
    }
    result.append(record)
    
# Parse every converted AINL file; parse_ainl appends its records here.
result = []

# conferences[2] is 'AINL'. Index 0 would read the Dialogue folder, label the
# records 'Dialogue' and write them to 'Dialogue.pickle'.
# walk() takes only the top folder: its second positional parameter is
# `topdown`, not a file mode.
for root, _dirs, files in walk(path.join('..', saving_dir, conferences[2])):
    # The last component of the folder path is used as the year.
    year = path.basename(root)
    for file in files:
        # Filter first, so utility files are never opened.
        if encounter_utility_file(file):
            continue
        with open(path.join(root, file), 'r') as input_stream:
            try:
                # A file may hold several papers separated by '=='.
                papers = input_stream.read().split('==')
            except UnicodeDecodeError:
                continue
        for paper in papers:
            try:
                parse_ainl(paper, conferences[2], year)
            except IndexError:
                # Unexpected layout: skip this paper only, not the papers
                # that follow it in the same file.
                continue

with open('{}.pickle'.format(conferences[2]), 'wb') as f:
    dump(result, f)

### Парсим RuSSIR

In [48]:
def parse_russir(content, conference, year):
    """Parse one RuSSIR paper and append its record to ``result``.

    The text is cut into chunks on triple newlines: chunk 0 is stored as the
    title, chunk 1 is the author section, chunk 2 the abstract and all
    remaining chunks the body (keywords are not stored separately). Each
    blank-line-separated block of the author section is read line by line as
    name, affiliation and e-mail.

    Raises:
        IndexError: If the text has fewer than three chunks or an author block
            has fewer than three lines. Nothing is appended in that case.
    """
    sections = content.split('\n\n\n')
    author_records = []
    for author_block in sections[1].split('\n\n'):
        rows = author_block.split('\n')
        author_records.append({
            NAME: rows[0],
            EMAIL: rows[2],
            AFFILIATION: rows[1],
        })
    paper = {
        AUTHOR: author_records,
        TEXT: {
            TITLE: sections[0],
            ABSTRACT: sections[2],
            TEXT: sections[3:],
        },
        CONFERENCE: conference,
        YEAR: year,
    }
    result.append(paper)
    
# Parse every converted RuSSIR file; parse_russir appends its records here.
result = []

# walk() takes only the top folder here: its second positional parameter is
# `topdown`, not a file mode.
for root, _dirs, files in walk(path.join('..', saving_dir, conferences[3])):
    # The last component of the folder path is used as the year.
    year = path.basename(root)
    for file in files:
        # Filter first, so utility files are never opened.
        if encounter_utility_file(file):
            continue
        with open(path.join(root, file), 'r') as input_stream:
            try:
                # A file may hold several papers separated by '=='.
                papers = input_stream.read().split('==')
            except UnicodeDecodeError:
                continue
        for paper in papers:
            try:
                parse_russir(paper, conferences[3], year)
            except IndexError:
                # Unexpected layout: skip this paper only, not the papers
                # that follow it in the same file.
                continue

with open('{}.pickle'.format(conferences[3]), 'wb') as f:
    dump(result, f)

### Вот так вот!