In [1]:
import nltk
import spacy
from textblob import TextBlob
nltk.download('brown')
nltk.download('punkt')
import time
import re

[nltk_data] Downloading package brown to /home/saulo/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/saulo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
pronouns=['I','you','he','she','it','we','they','me','him','her','us','them','what','who','whom','mine','yours','his','hers','ours','theirs','this','that','these','those','who','whom','which','what','whose','whoever','whatever','whichever','whomever','myself','yourself','himself','herself','itself','ourselves','themselves','each other','one another','anything','everybody','another','each','few','many','none','some','all','any','anybody','anyone','everyone','everything','no one','nobody','nothing','none','other','others','several','somebody','someone','something','most','enough','little','more','both','either','neither','one','much','such']

ignored_words = ['part','and','a','the end','end','parts','use','s']

right_noun_pattern = '^(NOUN|PROPN)*(VERB)+(ADJ|ADP|DT)+$|^(((NOUN)+(ADJ)+|(ADJ)+(NOUN)+)(ADJ|NOUN)*(ADJ|ADP|DET)+)$'
left_noun_pattern = '^VERB\w*NOUN$|^(VERB)+ADP$'

In [3]:
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.max_length = 1500000

In [4]:
#simple all-pairs data generator

start = time.time()

#filename = '/home/saulo/projects/all-pairs/data/AA/wiki_00'
filename = '/home/saulo/projects/all-pairs/samples/queen.txt'

cp1 = time.time()

with open(filename, 'r') as myfile:
    data=myfile.read().replace('\n', ' ')
    
cp2 = time.time()
print('load time: '+str(cp2-cp1))
        
spacy_obj = nlp(data)
textblob_obj = TextBlob(data)

cp3 = time.time()
print('parser: '+str(cp3-cp2))

del data

nouns = [chunk.text.lower() for chunk in spacy_obj.noun_chunks]

cp4 = time.time()
print('get nouns: '+str(cp4-cp3))

clean_nouns = [n for n in nouns if n not in pronouns]  #remove pronouns

cp5 = time.time()
print('clean nouns: '+str(cp5-cp4))

del spacy_obj

everygrams = nltk.everygrams(textblob_obj.words,
                             min_len=3,
                             max_len=5)

cp6 = time.time()
print('build grams: '+str(cp6-cp5))

del textblob_obj

pairs = []

set_nouns = set(clean_nouns)

for gram in everygrams:
#    set_grams = set([g.lower() for g in grams])
     
    
    explode_gram = nltk.everygrams(gram,
                                  min_len=1,
                                  max_len=len(gram)-2)  #avoid context patterns with a single word
    
    ctx_candidates = [' '.join(g).lower() for g in explode_gram]
    
    intersection = list(set(ctx_candidates).intersection(set_nouns))
    
    joined = ' '.join(gram).lower()
    for noun in intersection:
        if joined.startswith(noun+" ") or joined.endswith(" "+noun):
            pairs.append((noun, joined.replace(noun,'_',1)))
            
cp7 = time.time()
print('build pairs: '+str(cp7-cp6))

end = time.time()
print(end - start)

load time: 0.00025200843811035156
parser: 0.1034095287322998
get nouns: 0.003258228302001953
clean nouns: 0.0006132125854492188
build grams: 0.021496295928955078
build pairs: 0.0241391658782959
0.15362143516540527


In [None]:
pairs

In [None]:
load time: 0.0005166530609130859
parser: 0.11905574798583984
get nouns: 0.0037605762481689453
clean nouns: 0.0006110668182373047
build grams: 0.021936416625976562
build pairs: 0.2348949909210205
0.3810858726501465

#    for word in grams[:1] + grams[-1:]:
#        if word in inter:
#            pairs.append((word, ' '.join(grams).replace(word,'_',1)))

#for grams in everygrams:
#    joined = ' '.join(grams).lower()
#    for noun in clean_nouns:
#        if ((len(joined.split()) >= len(noun.split()) + 2) and  #avoid context patterns with a single word 
#            (joined.startswith(noun+" ") or 
#             joined.endswith(" "+noun))):
#            pairs.append((noun, joined.replace(noun,'_',1)))
#            break

In [None]:
numbers = ['one','two','three','four','five','six','seven']

['one','two','three']

['two','three','four']

['four', 'five', 'six']

In [33]:
def build_pairs(filename, nlp_remote):
    
    start_time = time.time()
    
    #load data
    with open(filename, 'r') as myfile:
        data=myfile.read().replace('\n', ' ')
    
    #get noun phrases
    spacy_obj = nlp_remote(data)
    
    #remove puctuation
    tokens = [t for t in spacy_obj.doc if t.pos_ != 'PUNCT']
    
    nouns = [chunk.text.lower() for chunk in spacy_obj.noun_chunks]
    set_nouns = set([n for n in nouns if n not in pronouns + ignored_words])  #remove pronouns
    
    del nouns
    del spacy_obj

    #get candidate contexts
    everygrams = nltk.everygrams(tokens,
                                 min_len=3,
                                 max_len=5)
    
    #build pairs
    pairs = []
    
    for gram in everygrams:
        first_word = gram[0]
        last_word = gram[-1]
        
        explode_gram = list(nltk.everygrams(gram, min_len=1))
        
        for eg in explode_gram:
            if(len(eg) <= len(gram)-2):
                
                eg_string = ' '.join([g.text for g in eg]).lower()
                
                if(eg_string in set_nouns):
                    if(eg[0] == first_word):
                        context = [gr for gr in explode_gram if len(gr) == len(gram)-len(eg) and gr[-1] == last_word]
                        ctx_pos_pattern = ''.join([c.pos_ for c in context[0]])
                            
                        if(re.search(left_noun_pattern, ctx_pos_pattern)):
                            ctx_string = ' '.join([c.text for c in context[0]])
                            pairs.append((eg_string, "_ "+ctx_string))
                
                    if(eg[-1] == last_word):
                        context = [gr for gr in explode_gram if len(gr) == len(gram)-len(eg) and gr[0] == first_word]
                        ctx_pos_pattern = ''.join([c.pos_ for c in context[0]])
                        
                        if(re.search(right_noun_pattern, ctx_pos_pattern)):
                            ctx_string = ' '.join([c.text for c in context[0]])
                            pairs.append((eg_string, ctx_string+" _"))
    
    del set_nouns
    
    #save to database
    res = insert_pairs(pairs)
    
    end_time = time.time()
    
    #print results
    print('{}: {} {} {}s'.format(datetime.datetime.now(),
                                 filename,
                                 len(res.inserted_ids),
                                 end_time - start_time))

In [5]:
#spacy settings
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.max_length = 1500000

In [34]:
filename = '/home/saulo/projects/all-pairs/samples/queen.txt'
build_pairs(filename, nlp)

{'queen ii', 'rock history', 'freddie mercury', 'number', 'the rock and roll hall', "the world's best-selling music artists", 'nine weeks', 'queen', 'the band’s 1977 album news', 'more conventional and radio-friendly works', 'international success', 'their classic line-up', 'further styles', 'various publications', 'the 1985 live aid concert', 'the uk', 'vocalists', 'estimates', '"queen', 'brian may', 'the british phonographic industry', 'a fan', 'mercury', 'the champions', 'arena rock', 'vocals', '"bohemian rhapsody', 'the band smile', 'their 1981 compilation album', 'piano', 'knebworth', 'greatest hits', 'the biggest stadium rock bands', 'the opera', 'the dust', 'his last performance', 'the outstanding contribution', 'a complication', 'their second album', 'the band', 'the music video', 'the us', 'bass guitar', 'august', 'the songwriters hall', 'the queen name', 'may', 'sheer heart attack', 'events', 'smile', 'drums', 'taylor', 'heavy metal', 'more elaborate stage', 'pop rock', 'the 

In [37]:
pattern = ['b','c','d']
mylist = ['a','b','c','d','e','b']
x = list(filter(lambda x: x not in pattern, mylist))

x

['a', 'e']

In [None]:
for gram in everygrams:
#    set_grams = set([g.lower() for g in grams])
     
    
    explode_gram = nltk.everygrams(gram,
                                  min_len=1,
                                  max_len=len(gram)-2)  #avoid context patterns with a single word
    
    ctx_candidates = [' '.join(g).lower() for g in explode_gram]
    
    intersection = list(set(ctx_candidates).intersection(set_nouns))
    
    joined = ' '.join(gram).lower()
    for noun in intersection:
        if joined.startswith(noun+" ") or joined.endswith(" "+noun):
            pairs.append((noun, joined.replace(noun,'_',1)))

In [5]:
y = [1,3,4,5,6]

for yi in y:
    print(y.index(yi))

0
1
2
3
4


In [23]:
x = nlp("Alice broke the home run record")

for tk in x:
    print(tk.pos_)

PROPN
VERB
DET
NOUN
NOUN
NOUN


In [74]:
print(xi.pos_ for xi in x)

<generator object <genexpr> at 0x7fa0bd599840>
