In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from sklearn.metrics.pairwise import cosine_similarity
import spacy
nlp = spacy.load('en')
from spacy.symbols import nsubj, dobj, VERB

In [2]:
from settings import *
from utils import *

### Loads GloVe Embeddings

In [3]:
from gloveEmbeddings import *
loadGloveEmbeddings(gloveFile)

### Creates Keyword Vectors

In [94]:
sourcesKeywords = [nlp(x)[0].lemma_ for x in ['paper', 'report', 'study', 'analysis', 'research', 'survey', 'release']]
peopleKeywords = [nlp(x)[0].lemma_ for x in ['expert', 'scientist']]
actionsKeywords = [nlp(x)[0].lemma_ for x in ['prove', 'demonstrate', 'reveal', 'state', 'mention', 'report', 'say', 'show', 'announce', 'claim', 'suggest', 'argue', 'predict', 'believe', 'think']]

sourcesKeywordsVec = [word2vec(x) for x in sourcesKeywords]
peopleKeywordsVec = [word2vec(x) for x in peopleKeywords]
actionsKeywordsVec = [word2vec(x) for x in actionsKeywords]

### Searches (on the vector space) for sentences containing the given subject and predicate keywords.

In [92]:
def keywordSearch(title, body):
    subjectThreshold = 0.9
    predicateThreshold = 0.9

    claims = []
    for s in sent_tokenize(body):
        subjectFound = predicateFound = False
        claim = ""
        for w in wordpunct_tokenize(s):

            if predicateFound == True:
                claim = s
                claims.append(claim)
                break

            wVec = word2vec(w)

            if subjectFound == False:
                for sVec in sourcesKeywordsVec+peopleKeywordsVec:
                    if abs(cosine_similarity(sVec.reshape(1, -1), wVec.reshape(1, -1))) > subjectThreshold:
                        subjectFound = True
                        break

            if subjectFound == True:
                for pVec in actionsKeywordsVec:
                    if abs(cosine_similarity(pVec.reshape(1, -1), wVec.reshape(1, -1))) > predicateThreshold:
                        predicateFound = True
                        break
    return claims


In [96]:
def quoteExtraction(limitDocuments=10):
    query = createQuery(limitDocuments, 'web')
    documents = queryDB(query)

    global cc 
    cc = 0
    
    def checkAction(w):
        enableEmbeddings = False
        threshold = 0.9
        
        for a in actionsKeywords:
            if w is not None and w.lemma_ == a:
                return True
        
        if (enableEmbeddings):
            wVec = word2vec(w.text)

            for aVec in actionsKeywordsVec:
                if abs(cosine_similarity(aVec.reshape(1, -1), wVec.reshape(1, -1))) > threshold:
                    print ("new action",w)
                    return True
        return False
    
    def acronym(phrase):
        fullAcronym = ''
        compactAcronym = ''
        upperAccronym = ''
        
        if len(phrase.split()) > 1:
            for w in phrase.split():
                for l in w:
                    if (l.isupper()):
                        upperAccronym += l
                if w not in stopWords:
                    compactAcronym += w[0]
                fullAcronym += w[0]
                
        return fullAcronym.lower(), compactAcronym.lower(), upperAccronym.lower()
            
    
    def improveQuoteeEntity(quotee, quoteeType, allEntities):

        if len(quotee.split()) == 1:
            #case where quotee is referred to with his/her first or last name.    
            for e in allEntities:
                if quotee in e.text.split() and quoteeType in ['PERSON']:
                    return e.text, e.label_
            
            #case where quotee is referred to with an acronym.
            for e in allEntities:
                if quotee.lower() in acronym(e.text)  and quoteeType in ['ORG']:
                    return e.text, e.label_
        
        return quotee, quoteeType
    
    def resolveQuotee(quotee, sentenceEntities, allEntities):
        
        try:
            c = next(nlp(quotee).noun_chunks)
        except:
            #heuristic: first entity of the sentence or of the whole text.
            for e in sentenceEntities + allEntities:
                if e.label_ in ['PERSON', 'ORG']:
                    return e.text, e.label_         
            return '', 'unknown'        
        
        for e in sentenceEntities:
            if c.text == e.text and e.label_ in ['PERSON', 'ORG']:
                return c.text, e.label_
                   
        for w in sourcesKeywords:
            if c.root.lemma_ == nlp(w)[0].lemma_:
                #heuristic: first entity of the sentence or of the whole text.
                for e in sentenceEntities + allEntities:
                    if e.label_ in ['PERSON', 'ORG']:
                        return e.text, e.label_
                return 'study', 'unknown'

        for w in peopleKeywords:
            if c.root.lemma_ == nlp(w)[0].lemma_:
                #heuristic: first entity of the sentence or of the whole text.
                for e in sentenceEntities + allEntities:
                    if e.label_ in ['ORG']:
                        return e.text, e.label_
                return 'expert', 'unknown'

        return c.text, 'unknown'   
    
    def dependencyGraphSearch(title, body):
        
        
        allEntities = nlp(body).ents + nlp(title).ents
        quotes = []
        
        for s in sent_tokenize(body):
            quoteFound = quoteeFound = False
            quote = quotee = quoteeType = ""
            
            doc = nlp(s)
            
            #find all verbs of the sentence.
            verbs = set()
            for v in doc:
                if v.head.pos == VERB:
                    verbs.add(v.head)
            
            if not verbs:
                continue
 
            rootVerb = ([w for w in doc if w.head is w] or [None])[0]
            
            #check first the root verb and then the others.
            verbs = [rootVerb] + list(verbs)
            
            for v in verbs:
                if checkAction(v):            
                    
                    for np in doc.noun_chunks:
                        if np.root.head == v:

                            if(np.root.dep == nsubj):
                                quotee = np.text
                                quoteeFound = True
                                
                            if(np.root.dep == dobj): #TODO
                                pass
                                
                            quoteFound = True
                    
                    if quoteeFound:
                        break
                            
                    
                
            if quoteFound:
                    quote = s                    
                    quotee, quoteeType = resolveQuotee(quotee, doc.ents, allEntities)
                    quotee, quoteeType = improveQuoteeEntity(quotee, quoteeType, allEntities)                    
        
                    quotes.append({'quote': quote, 'quotee':quotee, 'quoteeType':quoteeType})
                    #print('quote: ', quote)
                    #print('by: ', quotee, '(', quoteeType, ')')
                    #print()
                    continue
                    
        return quotes

    
    documents['quotes'] = documents.apply(lambda d: dependencyGraphSearch(d['title'],d['body']), axis=1)
    
    documents = documents[['title']].join(documents['quotes'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series))
    
    print ('#quotesPerDocument: ',len(documents)/limitDocuments)
    return documents

quoteExtraction(50)    

#quotesPerDocument:  11.08


Unnamed: 0,title,quote,quotee,quoteeType
0,Bladder Infections in Women Linked with Factory Farmed Chicken,"According to ABC’s Senior National Correspondent, “A growing number of medical researchers say more than 8 million women are at risk of difficult-to-treat bladder infections because superbugs – resistant to antibiotics and growing in chickens – are being transmitted to humans in the form of E. coli.”\n\nResearchers from McGill University found the E. coli strain that is responsible for bladder infection closely matches the bacteria found in retail chicken.",A growing number,unknown
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall\n\ni itoggle caption Carolina K. Smith/iStockphoto.com Carolina K. Smith/iStockphoto.com\n\nIf you think you've been hearing more about product recalls lately, you have.",you,unknown
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"But if ""recall fatigue"" is setting in, you need to shake it off for this one: Cargill Beef Solutions is announcing a recall of about 30,000 pounds of fresh ground beef from a Pennsylvania plant because of possible contamination with Salmonella Enteritidis.",Cargill Beef Solutions,ORG
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"""FSIS and the company are concerned that some product may be frozen in consumers' freezers,"" FSIS says.",FSIS and the,WORK_OF_ART
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"So far, FSIS says the retail packages of beef have been traced to Hannaford grocery stores, a chain primarily found in northern New England.",FSIS and the,WORK_OF_ART
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"""Ensuring our beef products are safe is our highest priority and an investigation is underway to determine the source of Salmonella in the animals we purchased for harvest and any actions necessary to prevent this from recurring,"" he says.",he,unknown
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"Yes, maybe too much information, experts say.",Cargill Warns,ORG
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"Just in the last month, the government has announced recalls of infant dietary supplements for possible salmonella contamination and chicken for potential Listeria contamination.",the government,unknown
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"USA Today reported last month that there were 1,460 recalls in 2007.",USA Today,ORG
1,Cargill Warns Of Salmonella-Tainted Ground Beef In Latest Recall : The Salt : NPR,"Stores and government officials say the surge is due to better oversight, better testing, and increased communication about recalls, but some fear consumers may be developing ""recall fatigue"" and are tuning the messages out.",Stores,unknown
