In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy.symbols import nsubj, dobj, VERB
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext
# Reuse the already-running context when this cell is executed again in the same
# kernel; calling the SparkContext constructor a second time raises an error.
sc = SparkContext.getOrCreate(SparkConf().setAppName('quoteExtraction').setMaster('local'))

In [2]:
from settings import *
from utils import *

### Loads GloVe Embeddings

In [3]:
# NOTE(review): `loadGloveEmbeddings` presumably comes from this star import and
# `gloveFile` from the `settings` star import in the previous cell -- confirm.
from gloveEmbeddings import *
# Load the GloVe embeddings from `gloveFile`. Presumably this has to run before
# any `word2vec(...)` lookup in the cells below -- verify against gloveEmbeddings.
loadGloveEmbeddings(gloveFile)

### Creates Keyword Vectors

In [4]:
# Seed vocabularies. Every seed word is parsed with `nlp` and replaced by the
# lemma of its first token, so the lists can be compared against token lemmas.
sourcesKeywords = [
    parsed[0].lemma_
    for parsed in map(nlp, ('paper', 'report', 'study', 'analysis', 'research', 'survey', 'release'))
]
peopleKeywords = [
    parsed[0].lemma_
    for parsed in map(nlp, ('expert', 'scientist'))
]
actionsKeywords = [
    parsed[0].lemma_
    for parsed in map(nlp, ('prove', 'demonstrate', 'reveal', 'state', 'mention', 'report', 'say', 'show',
                            'announce', 'claim', 'suggest', 'argue', 'predict', 'believe', 'think'))
]

# One embedding per lemma, in the same order as the keyword lists above.
sourcesKeywordsVec = list(map(word2vec, sourcesKeywords))
peopleKeywordsVec = list(map(word2vec, peopleKeywords))
actionsKeywordsVec = list(map(word2vec, actionsKeywords))

### Searches (in the embedding vector space) for sentences that contain both a subject keyword and a predicate keyword.

In [5]:
def keywordSearch(title, body):
    """Return the sentences of `body` that contain a subject and a predicate keyword.

    A sentence is scanned token by token. A token counts as a subject when the
    absolute cosine similarity between its embedding and any vector in
    `sourcesKeywordsVec + peopleKeywordsVec` exceeds 0.9. Once a subject has
    been seen, a token counts as a predicate when it is that close to any
    vector in `actionsKeywordsVec`. The sentence is collected as soon as a
    predicate is found.

    Parameters
    ----------
    title : unused; kept so the signature matches the other search function.
    body : text that is split into sentences with `sent_tokenize`.

    Returns
    -------
    list of str
        The matching sentences, in document order.
    """
    subjectThreshold = 0.9
    predicateThreshold = 0.9

    def _matchesAny(wordVec, keywordVecs, threshold):
        # True when the word is within `threshold` of at least one keyword vector.
        wordRow = wordVec.reshape(1, -1)
        for keywordVec in keywordVecs:
            if abs(cosine_similarity(keywordVec.reshape(1, -1), wordRow)) > threshold:
                return True
        return False

    # Loop-invariant: build the combined subject vocabulary once.
    subjectVecs = sourcesKeywordsVec + peopleKeywordsVec

    claims = []
    for s in sent_tokenize(body):
        subjectFound = False
        for w in wordpunct_tokenize(s):
            wVec = word2vec(w)

            if not subjectFound:
                subjectFound = _matchesAny(wVec, subjectVecs, subjectThreshold)

            # NOTE(review): the token that just matched as subject is also tested
            # as predicate (as before), so a word present in both vocabularies
            # satisfies both conditions on its own -- confirm this is intended.
            if subjectFound and _matchesAny(wVec, actionsKeywordsVec, predicateThreshold):
                # Record the sentence right away. Previously this happened at the
                # start of the following iteration, so a predicate sitting on the
                # last token of a sentence was never recorded.
                claims.append(s)
                break
    return claims


In [None]:
def quoteExtraction(limitDocuments=10, useSpark=False):
    """Extract reported statements ("quotes") and their authors from stored documents.

    Documents are fetched with ``queryDB(createQuery(limitDocuments, 'web'))``.
    ``dependencyGraphSearch`` is run on every (title, body) pair and the
    extracted quotes are joined back onto the documents.

    Parameters
    ----------
    limitDocuments : int
        Passed to ``createQuery``; also the divisor of the printed
        '#quotesPerDocument' figure.
    useSpark : bool
        When True the extraction is mapped over an RDD built with the
        module-level SparkContext ``sc``; otherwise (default) it runs through
        ``DataFrame.apply``. Earlier both paths ran back-to-back and only the
        pandas result was kept, so the default yields the same output without
        the discarded Spark pass.

    Returns
    -------
    The queried documents, with a 'quotes' column (list of dicts per document)
    joined with the expanded 'quote', 'quotee' and 'quoteeType' columns.
    """
    query = createQuery(limitDocuments, 'web')
    documents = queryDB(query)

    # NOTE(review): `cc` is not read anywhere in this function; the reset is kept
    # only in case another cell relies on the module-level name.
    global cc
    cc = 0

    # The spaCy model is loaded on first use and then reused, instead of being
    # reloaded for every document and again for every quote. Nothing is loaded
    # up front, so the closures handed to Spark capture only an empty dict.
    nlpCache = {}

    def getNlp():
        """Return the spaCy English pipeline, loading it on first use."""
        if 'nlp' not in nlpCache:
            nlpCache['nlp'] = spacy.load('en')
        return nlpCache['nlp']

    def checkAction(w):
        """Return True when token `w` is a reporting verb.

        `w` may be None (no root token found), which is never an action. The
        embedding-based fallback is switched off by `enableEmbeddings`.
        """
        enableEmbeddings = False
        threshold = 0.9

        if w is None:
            return False

        if w.lemma_ in actionsKeywords:
            return True

        if enableEmbeddings:
            wVec = word2vec(w.text).reshape(1, -1)
            for aVec in actionsKeywordsVec:
                if abs(cosine_similarity(aVec.reshape(1, -1), wVec)) > threshold:
                    print ("new action",w)
                    return True
        return False

    def acronym(phrase):
        """Return three lower-cased acronym candidates for a multi-word phrase.

        The candidates are: the initials of all words, the initials of the words
        not in `stopWords`, and all upper-case letters of the phrase. A phrase
        with fewer than two words yields three empty strings.
        """
        words = phrase.split()
        if len(words) < 2:
            return '', '', ''

        fullAcronym = ''.join(w[0] for w in words)
        compactAcronym = ''.join(w[0] for w in words if w not in stopWords)
        upperAcronym = ''.join(l for w in words for l in w if l.isupper())
        return fullAcronym.lower(), compactAcronym.lower(), upperAcronym.lower()

    def improveQuoteeEntity(quotee, quoteeType, allEntities):
        """Expand a single-word quotee to the first entity it appears to refer to.

        Returns the (text, label) of the matched entity, or the inputs unchanged
        when the quotee is not a single word or nothing matches.
        """
        if len(quotee.split()) != 1:
            return quotee, quoteeType

        if quoteeType == 'PERSON':
            # Quotee referred to by first or last name only.
            for e in allEntities:
                if quotee in e.text.split():
                    return e.text, e.label_
        elif quoteeType == 'ORG':
            # Quotee referred to by an acronym.
            lowered = quotee.lower()
            for e in allEntities:
                if lowered in acronym(e.text):
                    return e.text, e.label_

        return quotee, quoteeType

    def firstNamedEntity(entities):
        """Return (text, label) of the first PERSON/ORG entity, or None."""
        for e in entities:
            if e.label_ in ('PERSON', 'ORG'):
                return e.text, e.label_
        return None

    def resolveQuotee(quotee, sentenceEntities, allEntities):
        """Map the raw subject text of a quote to a (quotee, quoteeType) pair."""
        # Only the "no noun chunk" case is handled here (e.g. an empty quotee);
        # other errors are no longer swallowed by a bare except.
        c = next(iter(getNlp()(quotee).noun_chunks), None)
        if c is None:
            # Heuristic: first entity of the sentence or of the whole text.
            return firstNamedEntity(sentenceEntities + allEntities) or ('', 'unknown')

        for e in sentenceEntities:
            if c.text == e.text and e.label_ in ('PERSON', 'ORG'):
                return c.text, e.label_

        rootLemma = c.root.lemma_
        if rootLemma in sourcesKeywords:
            # Heuristic: first entity of the sentence.
            return firstNamedEntity(sentenceEntities) or ('study', 'unknown')
        if rootLemma in peopleKeywords:
            # Heuristic: first entity of the sentence.
            return firstNamedEntity(sentenceEntities) or ('expert', 'unknown')

        return c.text, 'unknown'

    def dependencyGraphSearch(title, body):
        """Return a list of {'quote', 'quotee', 'quoteeType'} dicts for one document."""
        nlp = getNlp()
        allEntities = nlp(body).ents + nlp(title).ents
        quotes = []

        for s in sent_tokenize(body):
            doc = nlp(s)

            # Every token head that is tagged as a verb.
            verbs = {t.head for t in doc if t.head.pos == VERB}
            if not verbs:
                continue

            # The root is the token that is its own head. Compared by index
            # rather than by object identity, which only works when the library
            # returns the very same Python object for a token.
            rootVerb = next((t for t in doc if t.head.i == t.i), None)

            quoteFound = quoteeFound = False
            quotee = ""

            # Check the root first and then the other verbs.
            for v in [rootVerb] + list(verbs):
                if not checkAction(v):
                    continue

                for chunk in doc.noun_chunks:
                    if chunk.root.head == v:
                        if chunk.root.dep == nsubj:
                            quotee = chunk.text
                            quoteeFound = True
                        # TODO: also make use of the direct object (dobj).
                        quoteFound = True

                if quoteeFound:
                    break

            if quoteFound:
                quotee, quoteeType = resolveQuotee(quotee, doc.ents, allEntities)
                quotee, quoteeType = improveQuoteeEntity(quotee, quoteeType, allEntities)
                quotes.append({'quote': s, 'quotee': quotee, 'quoteeType': quoteeType})

        return quotes

    if useSpark:
        rddd = SQLContext(sc).createDataFrame(documents[['title','body']]).rdd
        documents['quotes'] = rddd.map(lambda s: dependencyGraphSearch(s.title, s.body)).collect()
    else:
        documents['quotes'] = documents.apply(lambda d: dependencyGraphSearch(d['title'],d['body']), axis=1)

    # Expand the per-document quote lists into one row per quote.
    # NOTE(review): if no document yields any quote the stacked series is empty;
    # confirm that the join below behaves as wanted in that case.
    quoteRows = (
        documents['quotes']
        .apply(pd.Series)
        .stack()
        .reset_index(level=1, drop=True)
        .apply(pd.Series)
    )
    documents = documents.join(quoteRows)

    print ('#quotesPerDocument: ',len(documents)/limitDocuments)
    return documents

# Run the extraction for 20 documents and persist the resulting frame.
documents = quoteExtraction(limitDocuments=20)
documents.to_pickle('quotes.pkl')

# Cell output: number of rows per topic label.
(
    documents[['topic_label', 'quotee']]
    .groupby('topic_label')
    .size()
)

rdd created
