In [1]:
import nltk.data
import operator
import string
import math
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Pre-trained Punkt model used by tokenize() to split raw text into sentences.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# English stop words followed by every punctuation character and every digit,
# each as its own one-character entry.
stop = list(stopwords.words('english'))
stop += string.punctuation
stop += string.digits

from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance used by tokenize().
wordnet_lemmatizer = WordNetLemmatizer()


## Asociación de palabras

In [2]:
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's leading character only:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV.

    Returns:
        The WordNet constant for the tag, or None when the tag starts with
        none of the four prefixes above.
    """
    # (prefix, attribute name on `wordnet`) pairs, checked in this order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if treebank_tag.startswith(prefix):
            return getattr(wordnet, constant_name)
    return None

def tokenize(text):
    """Turn raw text into a flat list of lemmas.

    The text is lower-cased, split into sentences with ``sent_detector``,
    word-tokenized and POS-tagged with NLTK. Only words whose tag maps to a
    WordNet part of speech (see ``get_wordnet_pos``) are kept; each kept word
    is lemmatized with ``wordnet_lemmatizer`` using that part of speech.

    Args:
        text: the string to process.

    Returns:
        A list of lemmas, in order of appearance.
    """
    lemmas = []
    for sentence in sent_detector.tokenize(text.lower()):
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
        for word, treebank_tag in tagged_words:
            wn_pos = get_wordnet_pos(treebank_tag)
            if wn_pos is None:
                # Tag has no WordNet counterpart: the word is dropped.
                continue
            lemmas.append(wordnet_lemmatizer.lemmatize(word, wn_pos))
    return lemmas


### Levantar el corpus AP, separando cada noticia como un elemento distinto en un diccionario (``<DOCNO>``: ``<TEXT>``)

In [3]:
# Maps each document number to the list of lemmas produced by tokenize().
apCorpus = {}
with open('ap/ap.txt', 'r') as ap:
    # Every record is consumed as six consecutive lines. Presumably these are
    # <DOC>, <DOCNO> id </DOCNO>, <TEXT>, body, </TEXT>, </DOC> -- confirm
    # against ap.txt. An empty read on the first line of a record means EOF.
    while ap.readline():
        # Second space-separated field of the second line is the document id.
        docNumber = ap.readline().split(' ')[1]
        ap.readline()                  # third line: skipped
        text = ap.readline().strip()   # fourth line: the document body
        ap.readline()                  # fifth line: skipped
        ap.readline()                  # sixth line: skipped
        apCorpus[docNumber] = tokenize(text)


### Calcular el tamaño del vocabulario

In [4]:
# apWords:            set of distinct lemmas seen anywhere in the corpus.
# apWordsFrequencies: lemma -> total number of occurrences across all documents.
# apCorpusSize:       total number of lemmas (with repetition) in the corpus.
apWords = set()
apWordsFrequencies = {}
apCorpusSize = 0
for text in apCorpus.itervalues():
    apCorpusSize += len(text)
    # Count each token as it is seen: one pass over the document instead of a
    # full list.count() scan for every distinct word in it.
    for word in text:
        apWordsFrequencies[word] = apWordsFrequencies.get(word, 0) + 1

# The vocabulary is exactly the set of words that received a count.
apWords.update(apWordsFrequencies)


In [5]:
print 'Tamaño del vocabulario:', len(apWords)
print 'Tamaño del corpus:', apCorpusSize

Tamaño del vocabulario: 32231
Tamaño del corpus: 571642


### Para las 500 palabras con más apariciones, calcular el par más asociado según la medida presentada

In [6]:
apTop500Freq = []
for i, (word, count) in enumerate(sorted(apWordsFrequencies.iteritems(), key=operator.itemgetter(1), reverse = True)):
    if i >= 500:
        break
    apTop500Freq.append(word)
print apTop500Freq

[u'be', u'say', u'have', 'not', 'year', u'do', 'i', 'state', 'new', 'percent', 'more', 'people', 'also', 'other', "n't", 'up', 'government', 'president', 'u.s.', 'make', 'official', 'last', u'report', 'go', u'take', "'s", '_', u'time', 'soviet', 'bush', u'day', 'first', 'police', u'include', u'get', 'american', u'tell', 'company', 'week', 'united', 'use', 'today', u'work', u'call', 'country', 'month', 'only', u'give', 'group', 'force', u'come', 'most', u'plan', 'national', 'city', 'house', 'party', 'out', 'thursday', 'court', u'price', 'down', 'member', 'market', u'high', 'tuesday', 'federal', u'issue', 'monday', u'leader', 'friday', u'know', 'wednesday', 'home', 'want', 'think', 'department', 'many', 'union', u'begin', 'now', 'news', 'find', u'nation', u'charge', 'end', u'service', 'york', u'show', u'leave', 'good', u'ask', 'just', u'right', u'hold', 'program', 'world', 'case', 'military', 'stock', u'kill', 'office', 'help', 'south', 'trade', u'support', 'former', 'increase', 'late', 

In [7]:
# Width of the co-occurrence window passed to the collocation finder.
window = 5
bigram_measures = nltk.collocations.BigramAssocMeasures()

# The filter below runs once per candidate bigram in every document, so test
# membership against a set (constant time) rather than scanning the
# 500-element list twice per call.
top_words = frozenset(apTop500Freq)

# Maps each document number to its own collocation finder, restricted to
# bigrams whose two words are both among the most frequent words.
finders = {}
for doc, text in apCorpus.iteritems():
    finder = nltk.collocations.BigramCollocationFinder.from_words(text, window_size=window)
    # apply_ngram_filter removes every bigram for which the predicate is True.
    finder.apply_ngram_filter(lambda w1, w2: w1 not in top_words or w2 not in top_words)
    finders[doc] = finder


[('association', u'long')]
[(u'agree', 'most')]
[('as', 'long')]
[('april', 'north')]
[('april', 'city')]
[(u'begin', 'ago')]
[(u'accord', 'news')]
[('administration', 'authority')]
[('away', 'wednesday')]
[('here', u'be')]
[(u'act', 'also')]
[("'m", "'re")]
[('about', u'state')]
[(u'announce', 'monday')]
[("'re", u'go')]
[('ask', 'monday')]
[('administration', 'expect')]
[('ago', 'sign')]
[('cost', u'cent')]
[('about', 'today')]
[("'re", 'state')]
[(u'accord', u'move')]
[('association', 'march')]
[(u'accuse', u'take')]
[(u'approve', 'committee')]
[('as', 'well')]
[('bill', u'raise')]
[('communist', u'control')]
[('area', u'go')]
[('about', 'chief')]
[(u'announce', 'national')]
[('do', 'public')]
[(u'country', u'include')]
[('chairman', 'chief')]
[('cause', 'john')]
[("'ve", 'more')]
[(u'accord', 'last')]
[(u'accord', u'report')]
[("'m", 'not')]
[(u'accord', 'leader')]
[('already', 'industry')]
[("'s", 'chief')]
[('also', u'begin')]
[("'m", u'ask')]
[('allow', 'country')]
[('america', 

[('attempt', 'do')]
[('aid', 'more')]
[('away', u'move')]
[('attempt', 'others')]
[(u'agree', 'last')]
[(u'begin', 'other')]
[("'s", 'try')]
[("'s", u'take')]
[(u'accord', 'school')]
[(u'accord', u'release')]
[('death', u'give')]
[('attack', 'long')]
[(u'concern', 'proposal')]
[('also', u'name')]
[("'s", 'past')]
[(u'accord', u'large')]
[('_', 'national')]
[("'s", 'far')]
[('also', u'die')]
[("'re", u'concern')]
[("'s", 'many')]
[("'s", u'appeal')]
[('administration', 'bush')]
[(u'appear', 'break')]
[(u'announce', u'early')]
[("'s", 'very')]
[(u'come', 'national')]
[("'ve", u'see')]
[('_', 'co.')]
[('possible', 'so')]
[('allow', 'increase')]
[(u'accuse', u'water')]
[(u'change', 'make')]
[('back', u'line')]
[(u'accord', u'figure')]
[('about', 'there')]
[(u'allow', 'most')]
[('about', 'capital')]
[("'s", 'case')]
[('allow', 'not')]
[('april', u'term')]
[(u'act', 'reagan')]
[(u'analyst', u'expect')]
[('already', 'meet')]
[("'s", u'come')]
[("'m", 'march')]
[("'s", u'believe')]
[('american

[('air', 'own')]
[('black', u'community')]
[(u'accuse', 'human')]
[('agency', 'later')]
[("'m", "'re")]
[('_', 'again')]
[(u'allow', 'private')]
[(u'allow', 'drop')]
[(u'accord', 'chairman')]
[("'s", 'as')]
[(u'believe', 'go')]
[('american', 'nation')]
[('american', 'exchange')]
[('earlier', 'large')]
[('_', 'far')]
[("'s", u'agree')]
[('ago', 'board')]
[(u'america', u'continue')]
[('country', 'get')]
[(u'airline', u'policy')]
[("'m", 'world')]
[("'re", 'find')]
[("'s", 'american')]
[(u'analyst', u'find')]
[('about', u'mile')]
[('as', 'high')]
[('_', 'fight')]
[(u'analyst', 'economy')]
[("'s", 'not')]
[('_', "n't")]
[('_', u'make')]
[('about', u'add')]
[(u'agree', 'april')]
[("'s", 'right')]
[('_', u'labor')]
[('agency', 'late')]
[('also', u'do')]
[('back', u'home')]
[('about', 'new')]
[('also', u'include')]
[('also', 'get')]
[("'ve", 'never')]
[("'s", u'problem')]
[('air', 'plane')]
[('several', u'day')]
[('association', u'rule')]
[("'s", u'arm')]
[('_', 'only')]
[(u'area', 'require')

In [23]:
# First pass: for every unordered word pair, accumulate the running PMI total
# and the number of scored bigrams that contributed to it, across all finders.
sum_scores = {}
for finder in finders.itervalues():
    for ngram, pmi in finder.score_ngrams(bigram_measures.pmi):
        # Sorting makes (a, b) and (b, a) share one entry.
        pair = tuple(sorted(ngram))
        if pair in sum_scores:
            total, hits = sum_scores[pair]
            sum_scores[pair] = (total + pmi, hits + 1)
        else:
            sum_scores[pair] = (pmi, 1)

# Second pass: replace each (total, hits) entry with its mean, in place.
for pair in list(sum_scores):
    total, hits = sum_scores[pair]
    sum_scores[pair] = total / float(hits)

# Pair with the highest mean PMI. Under Python 2 the parenthesised print
# below emits a tuple, matching the recorded output.
maximum = max(sum_scores, key=sum_scores.get)
print(maximum, sum_scores[maximum])

(('germany', u'plan'), 7.596189756144411)


In [25]:
# def f2(x, y):
#     res = 0
#     for _, text in apCorpus.iteritems():
#         for ind in [i for i, w in enumerate(text) if w == x]:
#             for j in range(1, window + 1):
#                 if ind + j < len(text) and text[ind+j] == y:
#                     res += 1
#     return float(res)

# def f1(x):
#     return float(apWordsFrequencies[x])
    
# def I(x, y):
#     p_x_y = f2(x,y) / (apCorpusSize * (window - 1))
#     p_x = f1(x) / apCorpusSize
#     p_y = f1(y) / apCorpusSize
#     factor = p_x_y / (p_x * p_y)
#     if factor > 0:
#         return math.log(factor, 2.0)
#     return 0.0

# max_pair_value = 0
# max_pair = None
# for i in range(500):
#     print i
#     for j in range(i + 1, 500):
#         new_I = I(apTop500Freq[i], apTop500Freq[j])
#         if max_pair_value < new_I:
#             max_pair_value = new_I
#             max_pair = (apTop500Freq[i], apTop500Freq[j])
# print max_pair, max_pair_value