In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import seaborn as sns

from settings import *
from utils import *
from quoteAnalysis import *

In [None]:
#Run the quote pipeline once; it returns the two datasets every plot below is built from.
#NOTE(review): quotePipeline comes from one of the star imports above (presumably quoteAnalysis).
#Both results are later converted with .toPandas(), so they look like Spark DataFrames - confirm.
documents, topics = quotePipeline()

In [None]:
#Bring the topics result into pandas for plotting.
#Guarded so that re-running this cell is a no-op: once converted, the pandas frame
#no longer has .toPandas() and the unguarded call would raise AttributeError.
if hasattr(topics, 'toPandas'):
    topics = topics.toPandas()

In [None]:
def plotNumOfQuotes(countLimit=0):
    """Plot, per article topic, how many quotes come from each kind of quotee.

    Reads the notebook-level ``topics`` DataFrame (columns used: ``articleTopic``,
    ``quoteeType``, ``quotee``) and draws four overlaid horizontal bar layers on a
    single axes, from widest to narrowest selection:

    * "total"         - every quote of the topic
    * "authority"     - quotee == 'authority' or quoteeType in PERSON/ORG
    * "organizations" - quoteeType in PERSON/ORG
    * "persons"       - quoteeType == PERSON

    Each selection is a subset of the previous one, so later layers paint over
    the earlier ones. The figure is written to ``plots/CountQuotes.png`` and shown.

    Parameters
    ----------
    countLimit : int, optional
        Only topics with strictly more than this many quotes are plotted
        (default 0, i.e. every topic that has at least one quote).
    """
    import os  #local import: only needed to make sure the output folder exists

    #Data manipulation
    total = topics.groupby('articleTopic').size().sort_values().rename('count').reset_index()
    total = total[total['count'] > countLimit]

    #One topic order (ascending by total count) shared by every layer. Without an
    #explicit order each layer would be positioned by its own ranking, so bars of
    #different layers could end up on different rows than their topic label.
    topicOrder = total['articleTopic'].tolist()
    inPlot = topics['articleTopic'].isin(topicOrder)
    isPersonOrOrg = topics['quoteeType'].isin(['PERSON', 'ORG'])

    def _countPerTopic(mask):
        #Number of quotes per plotted topic among the rows selected by mask
        return topics[mask & inPlot].groupby('articleTopic').size().rename('count').reset_index()

    layers = [
        ("total", total),
        ("authority", _countPerTopic((topics['quotee'] == 'authority') | isPersonOrOrg)),
        ("organizations", _countPerTopic(isPersonOrOrg)),
        ("persons", _countPerTopic(topics['quoteeType'] == 'PERSON')),
    ]

    #Aesthetics
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(6, 6))
    colors = sns.color_palette("Blues", n_colors=len(layers))

    #Barplots (light to dark, widest selection first)
    for (label, counts), color in zip(layers, colors):
        sns.barplot(x="count", y="articleTopic", data=counts, order=topicOrder,
                    ax=ax, label=label, color=color)

    #Labels
    ax.set(ylabel="Topic of Article", xlabel="Number of Quotes")
    ax.legend(ncol=1, loc="upper right", frameon=True)
    sns.despine()
    #Relabel the x ticks with human_format (star-imported helper; presumably
    #abbreviates large numbers - confirm).
    locs, _ = plt.xticks()
    plt.xticks(locs, [human_format(loc) for loc in locs])

    #Depiction
    outPath = 'plots/CountQuotes.png'
    #savefig does not create missing folders; make sure the target exists
    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    fig.savefig(outPath, dpi=1200, bbox_inches='tight')
    plt.show()
plotNumOfQuotes()

In [None]:
def plotHeatMap(simThreshold=0.1):
    """Plot a row-normalised heat map of article topic versus quote topic.

    Reads the notebook-level ``topics`` DataFrame (columns used: ``quoteSim``,
    ``articleTopic``, ``quoteTopic``). Rows whose ``quoteSim`` is not above
    ``simThreshold`` are discarded, the remaining (articleTopic, quoteTopic)
    pairs are counted, and every article-topic row is divided by its row sum so
    that each row adds up to 1. The figure is written to
    ``plots/TopicsHeatMap.png`` and shown.

    Parameters
    ----------
    simThreshold : float, optional
        Strict lower bound on ``quoteSim`` for a quote to be counted
        (default 0.1, the value previously hard-coded).
    """
    import os  #local import: only needed to make sure the output folder exists

    #Data manipulation
    similar = topics[topics['quoteSim'] > simThreshold]
    pairCounts = similar.groupby(['articleTopic', 'quoteTopic']).size().reset_index(name='counts')
    #Pairs that never occur become 0 instead of NaN
    data = pairCounts.pivot(index='articleTopic', columns='quoteTopic', values='counts').fillna(0)
    #normalization: share of each quote topic within an article topic
    data = data.div(data.sum(axis=1), axis=0)

    #Aesthetics
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.set_color_codes("pastel")

    #Plot (draw explicitly on the axes created above rather than on "current" axes)
    sns.heatmap(data, ax=ax)

    #Labels
    ax.set(ylabel="Topic of Article", xlabel="Topic of Quote")

    #Depiction
    outPath = 'plots/TopicsHeatMap.png'
    #savefig does not create missing folders; make sure the target exists
    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    fig.savefig(outPath, dpi=1200, bbox_inches='tight')
    plt.show()
plotHeatMap()

In [None]:
#Keep only the 'quotes' column and bring it into pandas.
#Guarded so that re-running this cell is a no-op: once converted, the pandas frame
#has neither .select() nor .toPandas() and the unguarded call would raise AttributeError.
if hasattr(documents, 'toPandas'):
    documents = documents.select('quotes').toPandas()

In [None]:
#Flatten the per-document 'quotes' collections into one row per quote.
#Guarded on the 'quotes' column so that re-running this cell after the reshape is a
#no-op instead of a KeyError (the reshaped frame replaces 'quotes' with per-quote columns).
#NOTE(review): assumes each quote is a dict-like record whose keys become the columns
#used by the plots below (quotee, quoteeType, quoteeAffiliation) - confirm.
if 'quotes' in documents.columns:
    documents = (
        documents['quotes']
        .apply(pd.Series)                  #one column per position in the quotes list
        .stack()                           #long format: one entry per (document, position)
        .reset_index(level=1, drop=True)   #drop the position level, keep the document index
        .apply(pd.Series)                  #expand every quote record into its own columns
    )

In [None]:
def plotTopQuotees(topN=20, dropRows=(3,)):
    """Plot the most frequently quoted persons as a horizontal bar chart.

    Reads the notebook-level ``documents`` DataFrame (columns used:
    ``quoteeType``, ``quotee``), counts the quotes of every quotee whose type is
    'PERSON', keeps the ``topN`` most frequent ones and removes the ranks listed
    in ``dropRows``. The figure is written to ``plots/TopQuotees.png`` and shown.

    Parameters
    ----------
    topN : int, optional
        Number of top quotees taken before ``dropRows`` is applied (default 20).
    dropRows : iterable of int, optional
        Zero-based ranks to remove from the top list. The default ``(3,)``
        reproduces the previously hard-coded removal of the 4th entry, which the
        original note describes as a mis-encoded quotee ('â'). Ranks that do not
        exist are ignored. Pass ``()`` to keep every row.
        NOTE(review): this is positional and tied to the current dataset; if the
        data changes the default may remove a legitimate quotee - confirm.
    """
    import os  #local import: only needed to make sure the output folder exists

    #Data manipulation
    persons = documents[documents['quoteeType'] == 'PERSON']
    data = persons['quotee'].value_counts().reset_index()
    data.columns = ['quotee', 'count']
    data = data.head(topN)
    #errors='ignore': a short list (fewer rows than the rank) must not raise KeyError
    data = data.drop(index=list(dropRows or ()), errors='ignore')

    #Aesthetics
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(6, 6))

    #Barplots
    sns.set_color_codes("pastel")
    sns.barplot(x="count", y="quotee", data=data, ax=ax, label="total", color='b')

    #Labels
    ax.set(ylabel="Quotee Name", xlabel="Number of Quotes")
    sns.despine()
    #Relabel the x ticks with human_format (star-imported helper; presumably
    #abbreviates large numbers - confirm).
    locs, _ = plt.xticks()
    plt.xticks(locs, [human_format(loc) for loc in locs])

    #Depiction
    outPath = 'plots/TopQuotees.png'
    #savefig does not create missing folders; make sure the target exists
    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    fig.savefig(outPath, dpi=1200, bbox_inches='tight')
    plt.show()
plotTopQuotees()

In [None]:
def plotTopOrganizations(topN=20):
    """Plot the most frequently quoted organizations as a horizontal bar chart.

    Reads the notebook-level ``documents`` DataFrame (columns used:
    ``quoteeType``, ``quoteeAffiliation``), counts the ``quoteeAffiliation``
    values of all quotes whose quotee type is 'ORG' and keeps the ``topN`` most
    frequent ones. The figure is written to ``plots/TopOrganizations.png`` and
    shown.

    Parameters
    ----------
    topN : int, optional
        Number of organizations to show (default 20).
    """
    import os  #local import: only needed to make sure the output folder exists

    #Data manipulation
    orgQuotes = documents[documents['quoteeType'] == 'ORG']
    data = orgQuotes['quoteeAffiliation'].value_counts().reset_index()
    data.columns = ['organization', 'count']
    data = data.head(topN)

    #Aesthetics
    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(6, 6))

    #Barplots
    sns.set_color_codes("pastel")
    sns.barplot(x="count", y="organization", data=data, ax=ax, label="total", color='b')

    #Labels
    ax.set(ylabel="Organization Name", xlabel="Number of Quotes")
    sns.despine()
    #Relabel the x ticks with human_format (star-imported helper; presumably
    #abbreviates large numbers - confirm).
    locs, _ = plt.xticks()
    plt.xticks(locs, [human_format(loc) for loc in locs])

    #Depiction
    outPath = 'plots/TopOrganizations.png'
    #savefig does not create missing folders; make sure the target exists
    os.makedirs(os.path.dirname(outPath), exist_ok=True)
    fig.savefig(outPath, dpi=1200, bbox_inches='tight')
    plt.show()
plotTopOrganizations()