In [2]:
import nltk

In [3]:
# Feature extractor for the name-gender classifier trained below.
def gender_features(word):
    """Return the feature dict for ``word``: its final character only.

    The result has the single key ``'last_letter'``. The gender itself is
    not decided here; this dict is what gets handed to the classifier.

    Raises:
        IndexError: if ``word`` is empty.
    """
    final_char = word[len(word) - 1]
    return dict(last_letter=final_char)
    
from nltk.corpus import names
import random

# Fixed seed so the shuffle below -- and with it the train/test split and the
# trained classifier -- is reproducible across kernel restarts.
SHUFFLE_SEED = 42

# Every name from the NLTK names corpus, labelled after the file it came from.
labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                 [(name, 'female') for name in names.words('female.txt')])

# A private Random instance is used so the global random state stays untouched.
random.Random(SHUFFLE_SEED).shuffle(labeled_names)

featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
# The first 500 shuffled examples are held out as test_set; the remainder
# trains the Naive Bayes classifier used by extractnouns().
train_set, test_set = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [4]:
def preparetext(text):
    """Tokenise ``text`` on whitespace, POS-tag it and index every token.

    Args:
        text: the raw text as a single string.

    Returns:
        A list of ``(word, tag, sentence_index, word_index)`` tuples in text
        order. ``tag`` comes from ``nltk.pos_tag``. ``sentence_index`` starts
        at 0 and increases after every token ending in ``.``, ``!`` or ``?``.
        ``word_index`` is the 0-based position of the token in the text.
        Punctuation is not split off: it stays attached to its token.
        An empty or whitespace-only text yields ``[]``.
    """
    # split() without an argument collapses whitespace runs and never yields
    # empty tokens; split(" ") produced '' for double spaces, which crashed
    # the last-character check below.
    tokens = text.split()
    if not tokens:
        return []

    preparedtext = []
    sentindex = 0
    for index, (word, tag) in enumerate(nltk.pos_tag(tokens)):
        preparedtext.append((word, tag, sentindex, index))
        # The sentence counter moves on *after* the closing token, so that
        # token still belongs to the sentence it ends.
        if word.endswith(('.', '!', '?')):
            sentindex += 1

    return preparedtext
    
    
# Three-sentence sample text; the demo calls in the following cells reuse it.
test = "Kathi went to the theater. She plays guitar. It is made of wood."
print(preparetext(test))

[('Kathi', 'NNP', 0, 0), ('went', 'VBD', 0, 1), ('to', 'TO', 0, 2), ('the', 'DT', 0, 3), ('theater.', 'NN', 0, 4), ('She', 'PRP', 1, 5), ('plays', 'VBZ', 1, 6), ('guitar.', 'NN', 1, 7), ('It', 'PRP', 2, 8), ('is', 'VBZ', 2, 9), ('made', 'VBN', 2, 10), ('of', 'IN', 2, 11), ('wood.', 'NN', 2, 12)]


In [5]:
# extract all nouns of a text
def extractnouns(text):
    """Collect every noun of ``text`` together with a gender and a numerus.

    Args:
        text: the raw text as a single string.

    Returns:
        A list of ``(word, sentence_index, word_index, gender, numerus)``
        tuples, one per token whose POS tag starts with ``'NN'``.

        * ``gender`` is the classifier's verdict for capitalised nouns and
          ``'neutral'`` for all others.
        * ``numerus`` is ``'plural'`` when the tag ends in ``'S'``, otherwise
          ``'singular'``.

        ``word`` is returned exactly as tokenised, including any attached
        punctuation.
    """
    # Tokens keep their punctuation ("Kathi."), but the gender classifier
    # looks only at the last letter, so punctuation is stripped before
    # classifying.
    trailing_punctuation = '.,;:!?'

    nouns = []
    for word, tag, sentindex, index in preparetext(text):
        if not tag.startswith('NN'):
            continue

        # Capitalised nouns are treated as names and sent to the classifier.
        if word[:1].isupper():
            bare_word = word.rstrip(trailing_punctuation) or word
            gender = classifier.classify(gender_features(bare_word))
        else:
            gender = 'neutral'

        numerus = 'plural' if tag.endswith('S') else 'singular'
        nouns.append((word, sentindex, index, gender, numerus))

    return nouns

# Show the nouns found in the sample text.
print(extractnouns(test))

[('Kathi', 0, 0, 'female', 'singular'), ('theater.', 0, 4, 'neutral', 'singular'), ('guitar.', 1, 7, 'neutral', 'singular'), ('wood.', 2, 12, 'neutral', 'singular')]


In [6]:
# Third-person pronouns that can be resolved, as (pronoun, gender, numerus).
# Each form appears exactly once: extractpronouns() emits one result per
# matching entry, so a duplicated form would be reported twice.
# Gender 'no' marks the plural forms, which carry no gender.

pronounslist = [
    # subject forms
    ('he', 'male', 'singular'), ('she', 'female', 'singular'),
    ('it', 'neutral', 'singular'), ('they', 'no', 'plural'),
    # object forms ('her' also serves as the possessive form)
    ('him', 'male', 'singular'), ('her', 'female', 'singular'),
    ('them', 'no', 'plural'),
    # possessive forms
    ('his', 'male', 'singular'), ('their', 'no', 'plural'),
    ('hers', 'female', 'singular'),
    ('its', 'neutral', 'singular'), ('theirs', 'no', 'plural'),
]

In [7]:
# extract all pronouns of a text:
def extractpronouns(text):
    """Collect the known pronouns of ``text`` with their gender and numerus.

    Args:
        text: the raw text as a single string.

    Returns:
        A list of ``(word, sentence_index, word_index, gender, numerus)``
        tuples. A token is considered when its POS tag starts with ``'PRP'``;
        one tuple is produced for every ``pronounslist`` entry whose form
        equals the lower-cased token. Tokens with no entry are dropped.
    """
    tagged_pronouns = [tok for tok in preparetext(text) if tok[1].startswith('PRP')]

    # Gender and numerus are looked up in the pronoun table.
    return [
        (tok[0], tok[2], tok[3], gender, numerus)
        for tok in tagged_pronouns
        for (form, gender, numerus) in pronounslist
        if tok[0].lower() == form
    ]

# Show the pronouns found in the sample text.
print(extractpronouns(test))

[('She', 1, 5, 'female', 'singular'), ('It', 2, 8, 'neutral', 'singular')]


In [8]:
# How much do a pronoun and a noun match? 
def match(a, b):
    """Score the agreement between two ``(word, sent, pos, gender, numerus)`` tuples.

    One point is given for equal gender (element 3) and one for equal
    numerus (element 4).

    Returns:
        0, 1 or 2.
    """
    same_gender = a[3] == b[3]
    same_numerus = a[4] == b[4]
    return int(same_gender) + int(same_numerus)

# Agreement score between the second pronoun and the fourth noun of the sample text.
print(match(extractpronouns(test)[1], extractnouns(test)[3]))

2


In [22]:
def protonouns(text):
    """Print, for every pronoun in ``text``, the noun it most likely refers to.

    For each pronoun:

    1. Nouns from the pronoun's own sentence and from the two sentences
       before it are considered. Nouns later in the same sentence are
       included too.
    2. Nouns agreeing in both gender and numerus (score 2) are preferred.
       If there are none, nouns agreeing in one of the two (score 1) are used.
    3. Among those, the noun closest to the pronoun by word position wins;
       on a tie the one appearing first in the text is kept.

    The noun and pronoun lists are printed first, then one line per pronoun.
    A pronoun with no usable candidate gets a "could not be resolved" line
    instead of raising an IndexError.

    Args:
        text: the raw text as a single string.

    Returns:
        None. All results are printed.
    """
    nouns = extractnouns(text)
    pronouns = extractpronouns(text)

    print(nouns)
    print(pronouns)

    for p in pronouns:
        # Keep nouns at most two sentences back, with the match score appended.
        possiblenouns = [
            tuple(n[:5]) + (match(p, n),)
            for n in nouns
            if 0 <= p[1] - n[1] <= 2
        ]

        # Best agreement first, then fall back to partial agreement.
        candidates = [n for n in possiblenouns if n[5] == 2]
        if not candidates:
            candidates = [n for n in possiblenouns if n[5] == 1]

        if not candidates:
            # No noun in range agrees with the pronoun in any respect.
            print(p[0] + " (word nr. " + str(p[2] + 1) + ") could not be resolved: no matching noun nearby.")
            continue

        # min() keeps the first of several equally near candidates.
        bestchoice = min(candidates, key=lambda n: abs(p[2] - n[2]))

        message = (p[0] + " (word nr. " + str(p[2] + 1) + ") referrs to " +
                   bestchoice[0] + " (word nr. " + str(bestchoice[2] + 1) + ")")
        if bestchoice[5] < 2:
            message = message + ". But this may be wrong."
        print(message)
    
        
                
# Resolve the pronouns of the three-sentence sample text.
protonouns(test)

[('Kathi', 0, 0, 'female', 'singular'), ('theater.', 0, 4, 'neutral', 'singular'), ('guitar.', 1, 7, 'neutral', 'singular'), ('wood.', 2, 12, 'neutral', 'singular')]
[('She', 1, 5, 'female', 'singular'), ('It', 2, 8, 'neutral', 'singular')]
She (word nr. 6) referrs to Kathi (word nr. 1)
It (word nr. 9) referrs to guitar. (word nr. 8)


In [27]:
# Second example: two uses of "it" plus a "she" that follows a proper name.
test2 = "I really like your new bag. It looks cute. But Jessy does not, she thinks it is ugly."
protonouns(test2)

[('bag.', 0, 5, 'neutral', 'singular'), ('cute.', 1, 8, 'neutral', 'plural'), ('Jessy', 2, 10, 'female', 'singular')]
[('It', 1, 6, 'neutral', 'singular'), ('she', 2, 13, 'female', 'singular'), ('it', 2, 15, 'neutral', 'singular')]
It (word nr. 7) referrs to bag. (word nr. 6)
she (word nr. 14) referrs to Jessy (word nr. 11)
it (word nr. 16) referrs to bag. (word nr. 6)


In [30]:
# Third example: two uses of "it" in a text without any proper name.
test3 = "I think this project is finished. It was confusing, but i like programming, because it is a challenge."
protonouns(test3)

[('project', 0, 3, 'neutral', 'singular'), ('programming,', 1, 12, 'neutral', 'singular'), ('challenge.', 1, 17, 'neutral', 'singular')]
[('It', 1, 6, 'neutral', 'singular'), ('it', 1, 14, 'neutral', 'singular')]
It (word nr. 7) referrs to project (word nr. 4)
it (word nr. 15) referrs to programming, (word nr. 13)
