In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from sklearn.metrics.pairwise import cosine_similarity
import spacy
# Shared spaCy pipeline used by every cell below (lemmas, entities, noun chunks, parse).
# NOTE(review): 'en' is a shortcut model name; spaCy v3 removed shortcut names and
# expects the full package name (e.g. 'en_core_web_sm') -- confirm against the
# installed spaCy version before re-running on a fresh environment.
nlp = spacy.load('en')
# Integer symbol IDs compared against Token.dep / Token.pos further down.
from spacy.symbols import nsubj, dobj, VERB

In [2]:
from settings import *
from utils import *

### Loads GloVe Embeddings

In [3]:
from gloveEmbeddings import *
# `gloveFile` is not defined in this notebook; presumably it arrives through one of
# the star imports above (settings / utils) -- TODO confirm.
# NOTE(review): word2vec(), used by later cells, is also not defined here and
# presumably comes from this star import.
loadGloveEmbeddings(gloveFile)

### Creates Keyword Vectors

In [37]:
def _firstLemma(word):
    """Return the lemma spaCy assigns to the first token of `word`."""
    return nlp(word)[0].lemma_


# Seed vocabularies, stored as lemmas so they can be compared with Token.lemma_.
sourcesKeywords = list(map(_firstLemma, [
    'paper', 'report', 'study', 'analysis', 'research', 'survey', 'release',
]))
peopleKeywords = list(map(_firstLemma, [
    'expert', 'scientist',
]))
actionsKeywords = list(map(_firstLemma, [
    'prove', 'demonstrate', 'reveal', 'state', 'mention', 'report', 'say',
    'show', 'announce', 'claim', 'suggest', 'argue', 'predict',
]))

# One embedding per keyword, in the same order as the keyword lists.
sourcesKeywordsVec = list(map(word2vec, sourcesKeywords))
peopleKeywordsVec = list(map(word2vec, peopleKeywords))
actionsKeywordsVec = list(map(word2vec, actionsKeywords))

### Searches (in the embedding vector space) for sentences that contain a subject keyword and, at or after it, a predicate keyword.

In [38]:
def keywordSearch(title, body):
    """Return the sentences of `body` that contain a subject keyword and then a predicate keyword.

    Each sentence is scanned token by token. A token counts as a subject when the
    absolute cosine similarity between its embedding and any vector in
    sourcesKeywordsVec + peopleKeywordsVec exceeds the subject threshold. Once a
    subject has been seen, the same token or any later one counts as a predicate
    when it is similarly close to a vector in actionsKeywordsVec. The first
    predicate hit marks the whole sentence as a claim.

    Parameters
    ----------
    title : unused; kept so the signature matches the other search functions.
    body : str
        Text to split into sentences with sent_tokenize.

    Returns
    -------
    list of str
        Matching sentences, in the order they appear in `body`.
    """
    subjectThreshold = 0.9
    predicateThreshold = 0.9

    def isClose(wVec, keywordVecs, threshold):
        """True when |cosine similarity| between wVec and any keyword vector exceeds threshold."""
        target = wVec.reshape(1, -1)
        for kVec in keywordVecs:
            if abs(cosine_similarity(kVec.reshape(1, -1), target)) > threshold:
                return True
        return False

    # Loop-invariant: build the combined subject vocabulary once, not once per token.
    subjectVecs = sourcesKeywordsVec + peopleKeywordsVec

    claims = []
    for s in sent_tokenize(body):
        subjectFound = False
        for w in wordpunct_tokenize(s):
            wVec = word2vec(w)

            if not subjectFound:
                subjectFound = isClose(wVec, subjectVecs, subjectThreshold)

            # Record the sentence as soon as the predicate matches. Deferring this to
            # the next token's iteration would drop sentences whose final token is
            # the predicate, because no further iteration ever runs.
            if subjectFound and isClose(wVec, actionsKeywordsVec, predicateThreshold):
                claims.append(s)
                break
    return claims


In [75]:
def claimExtraction(limitDocuments=10):
    """Extract claim sentences, and who makes them, from queried documents.

    Builds a query with createQuery(limitDocuments, 'web'), fetches the documents
    with queryDB, and runs a dependency-parse based search over every row. Each
    detected claim and its resolved claimer are printed as they are found, and the
    total number of claims is printed at the end.

    Parameters
    ----------
    limitDocuments : int
        Passed straight to createQuery.

    Returns
    -------
    int
        Number of claims found in the first document (0 when no documents come back).

    Side effects
    ------------
    Resets and updates the module-level counter `cc` (total claims over all
    documents) and prints progress to stdout.

    NOTE(review): `documents` is assumed to be a pandas DataFrame with 'title' and
    'body' columns, since it is consumed via .apply(axis=1) and indexed by those keys.
    """
    query = createQuery(limitDocuments, 'web')
    documents = queryDB(query)

    global cc
    cc = 0

    # Lemmas of the seed keywords, computed once instead of re-parsing every keyword
    # on each resolveClaimer call.
    sourceLemmas = {nlp(w)[0].lemma_ for w in sourcesKeywords}
    peopleLemmas = {nlp(w)[0].lemma_ for w in peopleKeywords}

    def checkAction(w):
        """True when token `w` is a known action verb or is embedding-close to one."""
        threshold = 0.9

        # The root lookup below may hand us None; there is nothing to check then.
        if w is None:
            return False

        if w.lemma_ in actionsKeywords:
            return True

        wVec = word2vec(w.text).reshape(1, -1)
        for aVec in actionsKeywordsVec:
            if abs(cosine_similarity(aVec.reshape(1, -1), wVec)) > threshold:
                print("new action", w)
                return True
        return False

    def acronym(phrase):
        """Return three lowercase acronym candidates for a multi-word phrase.

        The tuple is (initial of every word, initial of every word not in
        stopWords, every upper-case letter in the phrase). A phrase of zero or one
        word yields three empty strings.
        """
        fullAcronym = ''
        compactAcronym = ''
        upperAcronym = ''

        words = phrase.split()
        if len(words) > 1:
            for w in words:
                upperAcronym += ''.join(l for l in w if l.isupper())
                # NOTE(review): this membership test is case-sensitive; if stopWords
                # holds lowercase words, a capitalised "The" is not filtered -- confirm.
                if w not in stopWords:
                    compactAcronym += w[0]
                fullAcronym += w[0]

        return fullAcronym.lower(), compactAcronym.lower(), upperAcronym.lower()

    def improveClaimerEntity(claimer, claimerType, allEntities):
        """Expand a one-word claimer to the full entity it most likely refers to.

        A one-word PERSON is matched against entities containing that word (first
        or last name); a one-word ORG is matched against the acronyms of the
        entities. Anything else is returned unchanged.
        """
        if len(claimer.split()) == 1:
            if claimerType == 'PERSON':
                # Claimer referred to by first or last name only.
                for e in allEntities:
                    if claimer in e.text.split():
                        return e.text, e.label_

            if claimerType == 'ORG':
                # Claimer referred to by an acronym.
                lowered = claimer.lower()
                for e in allEntities:
                    if lowered in acronym(e.text):
                        return e.text, e.label_

        return claimer, claimerType

    def firstEntity(entities, labels):
        """Return (text, label) of the first entity whose label is in `labels`, else None."""
        for e in entities:
            if e.label_ in labels:
                return e.text, e.label_
        return None

    def resolveClaimer(claimer, sentenceEntities, allEntities):
        """Map the raw subject text of a claim to a (name, type) pair.

        `type` is the entity label ('PERSON' / 'ORG') when an entity could be
        attached, otherwise 'unknown'.
        """
        # Sentence-level entities take precedence over document-level ones.
        candidates = tuple(sentenceEntities) + tuple(allEntities)

        try:
            c = next(nlp(claimer).noun_chunks)
        except Exception:
            # Best effort: no usable noun chunk (e.g. empty claimer). This catches
            # Exception rather than everything, so interrupting the kernel during a
            # long run is not swallowed.
            c = None

        if c is None:
            # Heuristic: first entity of the sentence or of the whole text.
            return firstEntity(candidates, ('PERSON', 'ORG')) or ('', 'unknown')

        for e in sentenceEntities:
            if c.text == e.text and e.label_ in ('PERSON', 'ORG'):
                return c.text, e.label_

        if c.root.lemma_ in sourceLemmas:
            # A generic source ("the study"): attribute it to the first entity found.
            return firstEntity(candidates, ('PERSON', 'ORG')) or ('study', 'unknown')

        if c.root.lemma_ in peopleLemmas:
            # A generic person ("experts"): attribute it to the first organisation found.
            return firstEntity(candidates, ('ORG',)) or ('expert', 'unknown')

        return c.text, 'unknown'

    def dependencyGraphSearch(title, body):
        """Return the sentences of `body` in which an action verb governs a noun chunk."""
        global cc

        allEntities = nlp(body).ents + nlp(title).ents
        claims = []

        for s in sent_tokenize(body):
            doc = nlp(s)

            # All tokens that head some token and are verbs, in document order and
            # without duplicates, so the scan order is the same on every run.
            verbs = []
            for tok in doc:
                if tok.head.pos == VERB and tok.head not in verbs:
                    verbs.append(tok.head)

            if not verbs:
                continue

            # The parse root is the token that is its own head.
            rootVerb = next((w for w in doc if w.head is w), None)
            rootIndex = rootVerb.i if rootVerb is not None else None

            # Check the root first and then the remaining verbs, each only once.
            ordered = [rootVerb] + [v for v in verbs if v.i != rootIndex]

            claimFound = claimerFound = False
            claimer = ""

            for v in ordered:
                if not checkAction(v):
                    continue

                for chunk in doc.noun_chunks:
                    if chunk.root.head == v:
                        if chunk.root.dep == nsubj:
                            claimer = chunk.text
                            claimerFound = True
                        # TODO: direct objects (dobj) are not used yet.
                        claimFound = True

                if claimerFound:
                    break

            if claimFound:
                claim = s
                cc += 1
                claims.append(claim)
                claimer, claimerType = resolveClaimer(claimer, doc.ents, allEntities)
                claimer, claimerType = improveClaimerEntity(claimer, claimerType, allEntities)
                print('claim: ', claim)
                print('by: ', claimer, '(', claimerType, ')')
                print()

        return claims

    claims = documents.apply(lambda d: dependencyGraphSearch(d['title'], d['body']), axis=1)

    print(cc)

    # Positional lookup with an empty guard: a label lookup of 0 raises when the
    # query returns nothing or the index does not start at 0.
    if len(claims) == 0:
        return 0
    return len(claims.iloc[0])

# Run the extraction with limitDocuments=2; the claims found are printed as they are
# detected, followed by the total claim count.
claimExtraction(2)    

claim:  Three studies show the benefits of healthier school meals.
by:  The School Nutrition Association ( ORG )

claim:  From The Rudd Center, now at U. Conn: A press release announces publication of its new study in Childhood Obesity demonstrating that the rules have led to an increase in fruit consumption without increasing plate waste.
by:  The Rudd Center ( ORG )

claim:  From the Harvard School of Public Health: It also sends a press release to announce its study demonstrating that an increase in consumption of fruits and vegetables is a direct result of the new USDA standards, and that these also do not increase plate waste.
by:  the Harvard School of Public Health ( ORG )

claim:  From the Union of Concerned Scientists: UCS announces a new position paper, “Lessons from the Lunchroom: Childhood Obesity, School Lunch, and the Way to a Healthier Future,” also documenting why school meals are so important to kids’ health.
by:  the Union of Concerned Scientists ( ORG )

claim:  Post

5