In [40]:
import re
from math import sqrt

class AuthorCandidate:
    def __init__(self,
                 candidate_string,
                 stop_word_confidence = 0,
                 author_list_confidence = 0,
                 known_name_confidence = 0,
                 known_title_confidence = 0,
                 name_pattern_confidence = 0,
                 name_word_count_confidence = 0,
                 digit_word_count_confidence = 0):
        self.candidate_string = candidate_string
        
        self.stop_word_confidence = stop_word_confidence
        self.author_list_confidence = author_list_confidence
        self.known_name_confidence = known_name_confidence
        self.known_title_confidence = known_title_confidence
        self.name_pattern_confidence = name_pattern_confidence
        self.name_word_count_confidence = name_word_count_confidence
        self.digit_word_count_confidence = digit_word_count_confidence
        
    def score(self):
        return (self.stop_word_confidence * 10) \
            + (self.author_list_confidence * 0) \
            + (self.known_name_confidence * 4) \
            + (self.known_title_confidence * 4) \
            + (self.name_pattern_confidence * 2) \
            + (self.name_word_count_confidence * .5) \
            + (self.digit_word_count_confidence * 1)
    
    def confidence(self):
        return 1 - (1 / sqrt(max(self.score(), 1)))
    
    def calculate_confidences(self, stop_words, first_names):
        self.calculate_stop_word_confidence(stop_words)
        self.calculate_author_list_confidence()
        self.calculate_author_pattern_confidence()
        self.calculate_known_name_confidence(first_names)
        self.calculate_known_title_confidence()
        self.calculate_name_word_count_confidence()
        self.calculate_digit_word_count_confidence()
    
    def calculate_stop_word_confidence(self, stop_words):
        'if one stop word is included, the specific confidence becomes 0, otherwise 1'
        self.stop_word_confidence = 0
        for stop_word in stop_words:
            if stop_word in self.candidate_string.casefold():
                self.stop_word_confidence -= 1
        
    def calculate_author_list_confidence(self):
        'calculates a confidence matching author lists by counting commas and "and"s'
        self.author_list_confidence = sum([self.candidate_string.count(delimiter) for delimiter in [',', 'and']])
        
    def calculate_author_pattern_confidence(self):
        'based on whether or not the words e.g. begin with an uppercase letter'
        self.name_pattern_confidence = 0
        if len(self.candidate_string) > 0 and all(map(lambda x: x[0].isupper(), self.candidate_string.replace('and', '').split())):
            self.name_pattern_confidence = 1
        if sum(1 for c in self.candidate_string if c.isupper()) == 0:
            self.name_pattern_confidence = -1
    
    def calculate_known_name_confidence(self, first_names):
        'based on whether or not the candidate includes an entry from a list of known names; name set has to be lowercase'
        # remove digits and split at commas and 'and's
        self.known_name_confidence = 0
        for word in re.split(r' |,|and', ''.join([i for i in self.candidate_string if not i.isdigit()])):
            if word and word[0].isupper():
                if len(word) == 2 and word[1] == '.':
                    self.known_name_confidence += 1
                elif word.strip().casefold() in first_names:
                    self.known_name_confidence += 1
                    
    def calculate_known_title_confidence(self):
        'based on whether or not the candidate includes an entry from a list of known titles, like Dr., or Prof.'
        # remove digits and split at commas and 'and's
        self.known_title_confidence = 0
        known_titles = {'dr.', 'prof.', 'b.sc.', 'phd', 'ph.d.', 'dphil'}
        for word in re.split(r' |,|and', ''.join([i for i in self.candidate_string if not i.isdigit()])):
            if word.casefold() in known_titles:
                self.known_title_confidence += 1
                    
    def calculate_name_word_count_confidence(self):
        '''based on the number of words the candidate consists of, a confidence is calculated.
        Normaly a name consists of two words (first and last name) but some more words are also common (multiple first names)'''
        word_count = sum(1 for word in self.candidate_string.split() if len(word) > 0 and word[0].isalpha())
        if word_count < 2:
            self.name_word_count_confidence = -4
        elif word_count == 2:
            self.name_word_count_confidence = 4
        elif word_count == 3:
            self.name_word_count_confidence = 3
        elif word_count == 4:
            self.name_word_count_confidence = 1
        elif word_count == 5:
            self.name_word_count_confidence = 0
        else:
            self.name_word_count_confidence = -2
            
    def calculate_digit_word_count_confidence(self):
        '''based on the number of digits in the words of the candidate a confidence is calculated.
        Normaly a name does not includ digits, but through the PDF to Text procedure footnote referenced are part of some candidates'''
        for word in self.candidate_string.split():
            digit_count = sum(1 for c in word if c.isdigit())
            if digit_count < 2:
                self.digit_word_count_confidence += 0
            elif digit_count == 2:
                self.digit_word_count_confidence += -1
            elif digit_count == 3:
                self.digit_word_count_confidence += -2
            else:
                self.digit_word_count_confidence += -4
            

    def __repr__(self):
        return 'AuthorCandidate(' + str(round(self.confidence(), 3)) + ' \'' + self.candidate_string + '\' [' + \
                 str(self.stop_word_confidence) + ' ' + \
                 str(self.author_list_confidence) + ' ' + \
                 str(self.known_name_confidence) + ' ' + \
                 str(self.known_title_confidence) + ' ' + \
                 str(self.name_pattern_confidence) + ' ' + \
                 str(self.name_word_count_confidence) + ' ' + \
                 str(self.digit_word_count_confidence) + ']' + ')'

In [41]:
def preprocess_input(text):
    '''
    returns a list of AuthorCandidates, preprocessed by ignoring
    blank lines and merging multi line entries
    '''
    author_candidates = []
    current_entry = ''
    for line in text.split('\n')[:5]:
        for chunk in re.split(r',| and ', line):
            author_candidates.append(AuthorCandidate(chunk.strip()))
        
    return author_candidates

In [43]:
import os
if __name__ == '__main__':
    stop_words = 'bureau univ school department institut ltd science labor'.split()
    # first names borrowed from http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/
    with open('first_names.csv') as file:
        first_names = {name.strip().casefold() for name in file.read().split('\n')[1:]}
    texts = []
    text_dir = './PDF-Archiv/'
    for file in sorted(os.listdir(text_dir))[0:50]:
        with open(text_dir + file, encoding='UTF-8') as file:
            texts.append(file.read())
    for text in texts:
        for candidate in preprocess_input(text):
            candidate.calculate_confidences(stop_words, first_names)
            print(candidate)
        print()

AuthorCandidate(0.0 'MoReBikeS - Model reuse with bike rental' [0 0 0 0 0 -2 0])
AuthorCandidate(0.0 'station data' [0 0 0 0 -1 4 0])
AuthorCandidate(0.5 'Meelis Kull1' [0 0 0 0 1 4 0])
AuthorCandidate(0.646 'Nicolas Lachiche2' [0 0 1 0 1 4 0])
AuthorCandidate(0.0 '' [0 0 0 0 -1 -4 0])
AuthorCandidate(0.646 'Adolfo Martı́nez-Usó3' [0 0 1 0 1 4 0])
AuthorCandidate(0.0 '1' [0 0 0 0 -1 -4 0])
AuthorCandidate(0.0 '' [0 0 0 0 -1 -4 0])

AuthorCandidate(0.0 'Model Reuse with Subgroup Discovery' [0 0 0 0 0 0 0])
AuthorCandidate(0.646 'Hao Song' [0 0 1 0 1 4 0])
AuthorCandidate(0.646 'Peter Flach' [0 0 1 0 1 4 0])
AuthorCandidate(0.0 'Intelligent Systems Laboratory' [-1 0 0 0 1 3 0])
AuthorCandidate(0.0 'University of Bristol' [-1 0 0 0 0 3 0])
AuthorCandidate(0.5 'United Kingdom' [0 0 0 0 1 4 0])
AuthorCandidate(0.0 '{Hao.Song' [0 0 0 0 0 -4 0])
AuthorCandidate(0.0 'Peter.Flach}@bristol.ac.uk' [0 0 0 0 1 -4 0])
AuthorCandidate(0.0 '' [0 0 0 0 -1 -4 0])

AuthorCandidate(0.0 'SVR-based Modelli