In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext
# Create the SQLContext used by the Spark code path of this notebook.
# SparkContext.getOrCreate reuses a context that is already running in this kernel, so
# re-running the cell no longer fails with "Cannot run multiple SparkContexts at once".
# NOTE: when an existing context is reused, the settings below are not re-applied;
# restart the kernel to change them.
ctx = SQLContext(
    SparkContext.getOrCreate(
        conf=(
            SparkConf()
            .setMaster('local[*]')
            .setAppName('quoteExtraction')
            .set('spark.executor.memory', '2G')
            .set('spark.driver.memory', '40G')
            .set('spark.driver.maxResultSize', '10G')
        )
    )
)

In [3]:
from settings import *
from utils import *
from quoteExtraction import *

In [4]:
def quoteExtraction(limitDocuments=10):
    """Query documents, extract quotes from each one and return one row per quote.

    A query is built with ``createQuery(limitDocuments, 'web')`` and loaded with
    ``queryDB``. ``dependencyGraphSearch`` is then applied to every document's
    title and body, through Spark when ``useSpark`` is set and with a plain
    pandas ``apply`` otherwise. The per-document results are unpacked into rows
    and joined back onto ``topic_label``.

    Args:
        limitDocuments: Limit forwarded to ``createQuery``.

    Returns:
        A DataFrame holding ``topic_label`` plus the columns unpacked from each
        quote. Assumes ``dependencyGraphSearch`` returns a list of dict-like
        quote records per document -- TODO confirm. Because the join is a left
        join, a document without quotes keeps one row with missing quote fields.
    """
    query = createQuery(limitDocuments, 'web')
    documents = queryDB(query)
    # Remember how many documents were really fetched: the frame is reshaped to
    # one row per quote below, and the DB may return a different number of
    # documents than the requested limit.
    fetchedDocuments = len(documents)

    if useSpark:
        textRdd = ctx.createDataFrame(documents[['title','body']]).rdd
        documents['quotes'] = textRdd.map(lambda s: dependencyGraphSearch(s.title, s.body)).collect()
    else:
        documents['quotes'] = documents.apply(lambda d: dependencyGraphSearch(d['title'],d['body']), axis=1)

    # Spread each document's quotes over columns, ...
    quotesWide = documents['quotes'].apply(pd.Series)
    # ... stack them into one entry per quote (still indexed by document), ...
    quotesLong = quotesWide.stack().reset_index(level=1, drop=True)
    # ... and expand every quote record into its own columns.
    quoteFields = quotesLong.apply(pd.Series)
    documents = documents[['topic_label']].join(quoteFields)

    # Divide by the number of documents actually processed (not the requested
    # limit) and avoid a ZeroDivisionError when nothing was fetched.
    quotesPerDocument = len(documents)/fetchedDocuments if fetchedDocuments else 0
    print ('#quotesPerDocument: ', quotesPerDocument)
    return documents

# Either rebuild the quotes DataFrame and refresh the pickle cache, or load the
# previously cached frame, depending on the useCache setting.
# NOTE(review): unpickling can execute arbitrary code -- only load cache files
# that were written by this project.
if not useCache:
    documents = quoteExtraction(20)
    documents.to_pickle(cachedDataFrame)
else:
    documents = pd.read_pickle(cachedDataFrame)

In [5]:
# Count the extracted quote rows that belong to each topic label.
(
    documents
    .loc[:, ['topic_label', 'quotee']]
    .groupby('topic_label')
    .size()
)

topic_label
Addiction                                          32366
Additives, Preservatives                           38270
Bacteria, Disease                                  43713
Boycott, Ban, Campaign, Action                      7920
Children                                           50886
Concern, Risk, Worry                               34133
Congress, Government                               31849
Consumer Awareness, Transparency, Trust            17741
Consumer, Demand, Supply                           94979
Crop, Contamination                                61700
Diet, Healthy Eating                               59251
Evaluation, Examination, Test                      30218
Farm Bill, GMO Bill                                15835
Farm, Agriculture, Farmer                          72443
Fortification, Pasteurization, -Medieval             636
Fresh Food, Real Food, Healthy Food                71651
GMO, Bioengineering                                79528
Health, Well-Being,