In [151]:
import re

class AuthorCandidate:
    """A piece of text scored by how much it looks like a list of authors.

    Three confidences are tracked:

    * ``stop_word_confidence`` -- 0 if the text contains a stop word
      (which disqualifies the candidate), otherwise 1.
    * ``author_list_confidence`` -- number of commas plus standalone "and"s.
    * ``known_name_confidence`` -- number of words found in a set of known
      first names.
    """

    # The word 'and' on its own. Word boundaries keep names such as
    # 'Sandra' or 'Alexander' from being treated as list delimiters.
    _AND_WORD = re.compile(r'\band\b')
    # Delimiters between words for the known-name lookup.
    _WORD_DELIMITERS = re.compile(r' |,|\band\b')

    def __init__(self, candidate_string, stop_word_confidence = 1, author_list_confidence = 0, known_name_confidence = 0):
        self.candidate_string = candidate_string

        self.stop_word_confidence = stop_word_confidence
        self.author_list_confidence = author_list_confidence
        self.known_name_confidence = known_name_confidence

    def score(self):
        """Return the combined score; it is 0 whenever stop_word_confidence is 0."""
        # the +1 keeps a single zero confidence from cancelling the other one
        return self.stop_word_confidence * ((self.author_list_confidence + 1) * (self.known_name_confidence + 1))

    def calculate_confidences(self, stop_words, first_names):
        """Recalculate all confidences.

        stop_words: iterable of substrings that disqualify the candidate;
            they are compared against the casefolded candidate string.
        first_names: container of known first names, already lowercase.
        """
        self.calculate_stop_word_confidence(stop_words)
        if self.stop_word_confidence:
            # further confidences will only be calculated if the candidate is not already disqualified
            self.calculate_author_list_confidence()
            self.calculate_author_pattern_confidence()
            self.calculate_known_name_confidence(first_names)

    def calculate_stop_word_confidence(self, stop_words):
        """Set stop_word_confidence to 0 if any stop word is included, otherwise 1."""
        # fold the candidate once instead of once per stop word
        folded = self.candidate_string.casefold()
        contains_stop_word = any(stop_word in folded for stop_word in stop_words)
        self.stop_word_confidence = 0 if contains_stop_word else 1

    def calculate_author_list_confidence(self):
        """Set author_list_confidence to the number of commas and standalone 'and's."""
        comma_count = self.candidate_string.count(',')
        and_count = len(self._AND_WORD.findall(self.candidate_string))
        self.author_list_confidence = comma_count + and_count

    def calculate_author_pattern_confidence(self):
        """Placeholder: based on whether or not the words e.g. begin with an uppercase letter."""
        pass

    def calculate_known_name_confidence(self, first_names):
        """Set known_name_confidence to the number of words found in first_names.

        The name set has to be lowercase; words are casefolded before the lookup.
        """
        # remove digits (e.g. affiliation markers such as 'Smith1,2')
        without_digits = ''.join([char for char in self.candidate_string if not char.isdigit()])
        self.known_name_confidence = 0
        # split at spaces, commas and standalone 'and's
        for word in self._WORD_DELIMITERS.split(without_digits):
            word = word.strip().casefold()
            # adjacent delimiters produce empty words; those must never count,
            # even if the name set happens to contain an empty string
            if word and word in first_names:
                self.known_name_confidence += 1

    def __repr__(self):
        return "AuthorCandidate('{}', {})".format(self.candidate_string, self.score())

In [152]:
def preprocess_input(text, max_lines=5):
    '''
    returns a list of AuthorCandidates, preprocessed by ignoring
    blank lines and merging multi line entries

    text: the raw document text; it is split at newline characters.
    max_lines: only this many leading lines are considered (default 5);
        pass None to consider every line.

    A line is merged with the following one when it ends with a comma or
    with the standalone word 'and'. Trailing whitespace (including a
    carriage return) is ignored for that check, and words that merely end
    in 'and' (e.g. 'Finland') do not count as a continuation.
    '''
    author_candidates = []
    current_entry = ''
    for line in text.split('\n')[:max_lines]:
        current_entry += ' ' + line
        # a trailing ',' or a trailing standalone 'and' means the entry goes on
        is_continued = re.search(r'(?:,|\band)\Z', line.rstrip()) is not None
        if current_entry.strip() and not is_continued:
            author_candidates.append(AuthorCandidate(current_entry.strip()))
            current_entry = ''
    # an entry still waiting for its continuation at the end is kept as it is
    if current_entry.strip():
        author_candidates.append(AuthorCandidate(current_entry.strip()))
    return author_candidates

In [153]:
import os
if __name__ == '__main__':
    # substrings that disqualify a candidate (matched against the casefolded text)
    stop_words = 'bureau univ school department'.split()
    with open('first_names.csv') as names_file: # first names stolen from http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/
        # the first line is skipped (presumably a header row -- verify against the CSV);
        # names are casefolded because the lookup expects a lowercase name set
        first_names = {name.strip().casefold() for name in names_file.read().split('\n')[1:]}
    texts = []
    text_dir = './texts/'
    # only the first 20 files (in sorted order) are read
    for file_name in sorted(os.listdir(text_dir))[0:20]:
        # separate names for the file name and the handle, so that the loop
        # variable is not rebound inside its own loop
        with open(os.path.join(text_dir, file_name)) as text_file:
            texts.append(text_file.read())
    # print every scored candidate, with a blank line between documents
    for text in texts:
        for candidate in preprocess_input(text):
            candidate.calculate_confidences(stop_words, first_names)
            print(candidate)
        print()

AuthorCandidate('Decision Making through Polarized', 1)
AuthorCandidate('Summarization of User Reviews', 1)
AuthorCandidate('Paolo Cremonesi <paolo.cremonesi@polimi.it>, Franca Garzotto <franco.garzotto@polimi.it>, Matteo Guarnerio <matteo.guarnerio@mail.polimi.it>,', 4)

AuthorCandidate('Linked Edit Rules: A Web Friendly Way of', 1)
AuthorCandidate('Checking Quality of RDF Data Cubes', 1)
AuthorCandidate('Albert Meroño-Peñuela1,2 , Christophe Guéret2 , and Stefan Schlobach1', 15)
AuthorCandidate('1', 1)

AuthorCandidate('Modeling the Statistical Process with Linked Metadata', 1)
AuthorCandidate('Franck Cotton and Daniel W. Gillman', 4)
AuthorCandidate('INSEE, Paris, France', 9)
AuthorCandidate('franck.cotton@insee.fr', 1)
AuthorCandidate('US Bureau of Labor Statistics, Washington, USA', 0)

AuthorCandidate('What is Special about Bethlehem, Pennsylvania?', 2)
AuthorCandidate('Identifying Unexpected Facts about DBpedia Entities', 1)
AuthorCandidate('Benjamin Schäfer, Petar Ristoski, an