In [46]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Reload edited modules automatically before each cell is executed, so changes
# to the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from sklearn.metrics.pairwise import cosine_similarity
import spacy
# Shared spaCy pipeline; the cells below use it to parse sentences and to
# lemmatize keywords.
# NOTE(review): the 'en' shortcut name is not accepted by newer spaCy
# releases — confirm against the installed version.
nlp = spacy.load('en')
# Symbol constants compared against token.dep / token.pos in claimExtraction.
from spacy.symbols import nsubj, dobj, VERB

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from settings import *
from utils import *

### Loads GloVe Embeddings

In [3]:
from gloveEmbeddings import *
# Load the GloVe vectors before any word2vec() call is made.
# NOTE(review): `gloveFile` is presumably provided by the `settings`
# star-import in the previous cell — confirm.
loadGloveEmbeddings(gloveFile)

### Creates Keyword Vectors

In [4]:
def keywordVectors():
    """Build the subject/predicate keyword lists and their embeddings.

    Returns:
        tuple: ``(subjects, predicates, subjectsVec, predicatesVec)``. The
        first two items are lists of keyword strings; the last two are lists
        holding the ``word2vec`` result of each keyword, in the same order.
    """
    # Subject keywords: nouns naming an information source, then nouns naming
    # a person.
    sourceNouns = ['paper', 'report', 'study', 'analysis', 'research', 'survey']
    personNouns = ['expert', 'scientist']

    # Predicate keywords.
    predicateKeywords = [
        'prove', 'demonstrate', 'reveal', 'state', 'mention', 'report',
        'say', 'show', 'announce', 'claim', 'suggest', 'argue',
    ]

    subjectKeywords = sourceNouns + personNouns

    # Embed every keyword, subjects first.
    subjectEmbeddings = list(map(word2vec, subjectKeywords))
    predicateEmbeddings = list(map(word2vec, predicateKeywords))

    return subjectKeywords, predicateKeywords, subjectEmbeddings, predicateEmbeddings

### Searches (in the embedding vector space) for sentences that contain a subject keyword and, at or after it, a predicate keyword.

In [5]:
def keywordSearch(title, body, subjectVec, predicateVec):
    """Find sentences that mention a subject keyword and then a predicate keyword.

    Each sentence of ``body`` is scanned word by word. A word matches a keyword
    when the absolute cosine similarity between their embeddings exceeds 0.9.
    A sentence is reported as a claim once a subject match has been seen and
    the current word (possibly the very same word) matches a predicate.

    Args:
        title: Unused; kept so the signature stays compatible with callers.
        body: Text to search; split into sentences with ``sent_tokenize``.
        subjectVec: Iterable of subject keyword embeddings (each must support
            ``reshape``).
        predicateVec: Iterable of predicate keyword embeddings.

    Returns:
        list of str: The matching sentences, in the order they appear in
        ``body``; each sentence is included at most once.
    """
    subjectThreshold = 0.9
    predicateThreshold = 0.9

    def matchesAny(wordRow, keywordVecs, threshold):
        """Return True when ``wordRow`` is close enough to any keyword vector."""
        for keywordVec in keywordVecs:
            similarity = cosine_similarity(keywordVec.reshape(1, -1), wordRow)[0, 0]
            if abs(similarity) > threshold:
                return True
        return False

    claims = []
    for s in sent_tokenize(body):
        subjectFound = False
        for w in wordpunct_tokenize(s):
            # Reshape once per word instead of once per keyword comparison.
            wordRow = word2vec(w).reshape(1, -1)

            if not subjectFound:
                subjectFound = matchesAny(wordRow, subjectVec, subjectThreshold)

            # Record the claim as soon as the predicate matches. Deferring this
            # to the next loop iteration would lose sentences whose predicate
            # is the last token.
            if subjectFound and matchesAny(wordRow, predicateVec, predicateThreshold):
                claims.append(s)
                break
    return claims


In [56]:
def claimExtraction(limitDocuments=10):
    """Extract claim sentences from documents using spaCy dependency parses.

    A sentence counts as a claim when one of its candidate verbs has the same
    lemma as a predicate keyword from ``keywordVectors`` and at least one noun
    chunk hangs directly off that verb. The noun chunk that is the verb's
    nominal subject, if any, is reported as the claimer.

    Every claim and its claimer are printed as they are found.

    Args:
        limitDocuments: Forwarded to ``createQuery`` (presumably caps how many
            documents are fetched — confirm against ``createQuery``).

    Returns:
        list of str: The claim sentences found in the first fetched document,
        or an empty list when the query returns no documents.
    """
    query = createQuery(limitDocuments, 'web')
    documents = queryDB(query)

    # Only the predicate keywords are needed by the dependency-based search.
    _, predicates, _, _ = keywordVectors()

    # Lemmatize each predicate keyword once, rather than re-parsing all of
    # them for every verb that gets checked.
    predicateLemmas = {nlp(p)[0].lemma_ for p in predicates}

    def checkPredicate(w):
        """Return True when token ``w`` shares its lemma with a predicate keyword."""
        return w is not None and w.lemma_ in predicateLemmas

    def dependencyGraphSearch(title, body):
        """Return the claim sentences found in ``body`` (``title`` is unused)."""
        claims = []
        for s in sent_tokenize(body):
            doc = nlp(s)

            # Every verb that is the head of at least one token, visited in
            # sentence order so the chosen claimer does not depend on set
            # iteration order.
            verbIndices = sorted({tok.head.i for tok in doc if tok.head.pos == VERB})
            if not verbIndices:
                continue

            # The root is the token that is its own head. Compare by equality:
            # an identity check only works if spaCy hands back the same Python
            # Token object on every access.
            roots = [tok for tok in doc if tok.head == tok]

            # Check the root first and then the other verbs.
            candidates = roots[:1] + [doc[i] for i in verbIndices]

            claimFound = claimerFound = False
            claimer = ""
            for v in candidates:
                if not checkPredicate(v):
                    continue

                for chunk in doc.noun_chunks:
                    if chunk.root.head == v:
                        if chunk.root.dep == nsubj:
                            claimer = chunk.text
                            claimerFound = True
                        # TODO: handle direct objects (chunk.root.dep == dobj).
                        claimFound = True

                # Stop at the first predicate verb that has an explicit subject.
                if claimerFound:
                    break

            if claimFound:
                claims.append(s)
                print('claim:', s)
                print('by:', claimer)
                print()

        return claims

    # A plain loop avoids DataFrame.apply's handling of list-valued results.
    claimsPerDocument = [
        dependencyGraphSearch(title, body)
        for title, body in zip(documents['title'], documents['body'])
    ]
    return claimsPerDocument[0] if claimsPerDocument else []

# Run the claim extraction with limitDocuments=1.
claimExtraction(1)    

claim: Three studies show the benefits of healthier school meals.
by: Three studies

claim: From The Rudd Center, now at U. Conn: A press release announces publication of its new study in Childhood Obesity demonstrating that the rules have led to an increase in fruit consumption without increasing plate waste.
by: A press release

claim: From the Harvard School of Public Health: It also sends a press release to announce its study demonstrating that an increase in consumption of fruits and vegetables is a direct result of the new USDA standards, and that these also do not increase plate waste.
by: 

claim: From the Union of Concerned Scientists: UCS announces a new position paper, “Lessons from the Lunchroom: Childhood Obesity, School Lunch, and the Way to a Healthier Future,” also documenting why school meals are so important to kids’ health.
by: UCS

claim: Postscript: Dana Woldow argues that the school food scene would be much easier if schools actually got enough money to pay for wh