In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from sklearn.metrics.pairwise import cosine_similarity
import spacy
# Load the English spaCy pipeline once; `nlp` is used later to parse sentences.
# NOTE(review): 'en' is a shortcut model name. Newer spaCy releases (3.x)
# reportedly no longer resolve shortcuts and expect a full package name such
# as 'en_core_web_sm' -- confirm against the installed spaCy version.
nlp = spacy.load('en')
from spacy.symbols import nsubj

In [2]:
from settings import *
from utils import *

### Loads GloVe Embeddings

In [3]:
from gloveEmbeddings import *
# `gloveFile` is not defined in any visible cell; presumably it is provided by
# one of the star-imports (`settings` / `utils`) -- TODO confirm.
loadGloveEmbeddings(gloveFile)

### Creates Keyword Vectors

In [4]:
def keywordVectors():
    """Build the embedding vectors for the subject and predicate keywords.

    Returns:
        A 2-tuple ``(subjectVec, predicateVec)``. Each element is a list with
        the ``word2vec`` result of every keyword, in the order the keywords
        are listed below.
    """
    # Subject keywords: information sources first, followed by people.
    subjectWords = (
        'paper', 'report', 'study', 'analysis', 'research', 'survey',
        'expert', 'scientist',
    )
    # Predicate keywords.
    predicateWords = (
        'prove', 'demonstrate', 'reveal', 'state', 'mention', 'report',
        'say', 'show', 'announce', 'claim', 'suggest',
    )

    subjectVec = list(map(word2vec, subjectWords))
    predicateVec = list(map(word2vec, predicateWords))
    return subjectVec, predicateVec

### Searches (in the vector space) for sentences containing the given subject and predicate keywords.

In [5]:
def _isSimilarToAny(wordVec, keywordVecs, threshold):
    """Return True if |cosine similarity| of wordVec to any keyword vector exceeds threshold."""
    # cosine_similarity expects 2-D inputs, hence the (1, n) reshape; the
    # result is a 1x1 matrix from which the scalar is extracted.
    wordRow = wordVec.reshape(1, -1)
    return any(
        abs(cosine_similarity(kVec.reshape(1, -1), wordRow)[0, 0]) > threshold
        for kVec in keywordVecs
    )


def keywordSearch(title, body, subjectVec, predicateVec,
                  subjectThreshold=0.9, predicateThreshold=0.9):
    """Collect the sentences of ``body`` that contain a subject and a predicate keyword.

    Each sentence is scanned token by token. Once a token is similar enough
    to one of the subject vectors, that token and every following token are
    compared against the predicate vectors; on the first predicate match the
    whole sentence is recorded as a claim. A single token may satisfy both
    checks.

    Args:
        title: Unused; kept so the signature stays unchanged.
        body: Text that is split into sentences with ``sent_tokenize``.
        subjectVec: Iterable of subject keyword vectors.
        predicateVec: Iterable of predicate keyword vectors.
        subjectThreshold: Absolute cosine similarity a token must exceed to
            count as a subject keyword (default 0.9).
        predicateThreshold: Absolute cosine similarity a token must exceed to
            count as a predicate keyword (default 0.9).

    Returns:
        List of matching sentences, in the order they occur in ``body``.
    """
    claims = []
    for sentence in sent_tokenize(body):
        subjectFound = False
        for word in wordpunct_tokenize(sentence):
            wVec = word2vec(word)

            if not subjectFound:
                subjectFound = _isSimilarToAny(wVec, subjectVec, subjectThreshold)

            # Record the sentence as soon as the predicate matches. Deferring
            # this to the next token's iteration would drop sentences whose
            # predicate is the final token.
            if subjectFound and _isSimilarToAny(wVec, predicateVec, predicateThreshold):
                claims.append(sentence)
                break
    return claims


In [6]:
def claimExtraction(limitDocuments=10):
    """Extract claim sentences from queried documents using the dependency parse.

    A query is built with ``createQuery(limitDocuments, 'web')`` and executed
    with ``queryDB``. For every returned row, the ``body`` is split into
    sentences and each sentence is parsed with spaCy. A sentence counts as a
    claim when the root token of its parse is similar to a predicate keyword
    and a noun chunk that is the nominal subject of that root is similar to a
    subject keyword.

    Args:
        limitDocuments: Value forwarded to ``createQuery`` (default 10).

    Returns:
        The result of ``documents.apply(..., axis=1)``: one list of claim
        sentences per row. Each list holds at most one sentence, because the
        search stops at the first claim found in a document.
    """
    # Absolute cosine-similarity cut-offs for the root token and for the
    # subject noun-chunk root, respectively.
    predicateThreshold = 0.9
    subjectThreshold = 0.5

    query = createQuery(limitDocuments, 'web')
    # Presumably a pandas DataFrame with 'title' and 'body' columns -- confirm
    # against queryDB.
    documents = queryDB(query)

    subjectVec, predicateVec = keywordVectors()

    def isSimilarToAny(wordVec, keywordVecs, threshold):
        """Return True if |cosine similarity| to any keyword vector exceeds threshold."""
        wordRow = wordVec.reshape(1, -1)
        return any(
            abs(cosine_similarity(kVec.reshape(1, -1), wordRow)[0, 0]) > threshold
            for kVec in keywordVecs
        )

    def dependencyGraphSearch(title, body, subjectVec, predicateVec):
        """Return the first sentence of ``body`` whose root/subject match the keywords.

        ``title`` is accepted but not used. The result is a list with zero or
        one sentence.
        """
        claims = []
        for s in sent_tokenize(body):
            try:
                doc = nlp(s)
                # The parse root is the token that is its own head.
                rootVerb = [w for w in doc if w.head is w][0]
            except Exception:
                # Best effort: skip sentences that cannot be parsed or have no
                # root, but let KeyboardInterrupt / SystemExit propagate.
                continue

            if not isSimilarToAny(word2vec(str(rootVerb)), predicateVec, predicateThreshold):
                continue

            subjectFound = False
            # `chunk` instead of `np`, so the numpy alias is not shadowed.
            for chunk in doc.noun_chunks:
                if chunk.root.dep == nsubj and chunk.root.head == rootVerb:
                    chunkVec = word2vec(str(chunk.root.text))
                    if isSimilarToAny(chunkVec, subjectVec, subjectThreshold):
                        subjectFound = True
                        break

            if subjectFound:
                claims.append(s)
                # NOTE(review): this stops at the first claim per document,
                # matching the existing behaviour; remove the break if every
                # matching sentence should be collected.
                break

        return claims

    claims = documents.apply(
        lambda d: dependencyGraphSearch(d['title'], d['body'], subjectVec, predicateVec),
        axis=1,
    )
    return claims

# Run the extraction with limitDocuments=100; as the last expression of the
# cell, the returned value is what the notebook displays below.
claimExtraction(100)    

0                                                                                                                                                                                                                                                                                                                                             [Three studies show the benefits of healthier school meals.]
1                                                                                                                                                                                                                                                                                                                                  [Many studies show that calorie intake is higher when people dine out.]
2                                                                                                                                                                                                                 