#### Updating data

In [None]:
from scrapers import *
from utilities import *

In [4]:
# Refresh the local Guardian archive. Per the recorded output below, the
# scraper resumes from the latest stored article date and appends the newly
# found articles to Guardian.csv.
guardianScraper()

-> CSV file found with 2712 articles! Latest article date: 2022-06-22
-> Checking articles from latest date onward...
-> API Query |████████████████████████████████████████| (!) 140 in 31.9s (4.39/s) 
-> 135 new articles saved to Guardian.csv! Total articles: 2847


In [6]:
# Refresh the local NYT archive. Per the recorded output below, the scraper
# resumes from the latest stored article date and appends the newly found
# articles to NYT.csv.
NYTScraper()

-> CSV file found with 3111 articles! Latest article date: 20220622
-> Checking articles from latest date onward...
-> API Query |████████████████████████████████████████| (!) 245 in 2:53.9 (1.41/s) 
-> 205 new articles saved to NYT.csv! Total articles: 3316


In [None]:
# Load both article archives (Guardian first, then NYT) from the data folder.
guardian_data, nyt_data = (
    pd.read_csv(PARENT_DIR + csv_name)
    for csv_name in ("/data/Guardian.csv", "/data/NYT.csv")
)

#### Stopword removal with Gensim

In [4]:
import spacy
import gensim
from spacy.lang.en import English
from gensim.parsing.preprocessing import remove_stopwords
#!python -m spacy download en_core_web_sm

In [29]:
def text_without_stopwords(articles):
    """
    Return a copy of ``articles`` whose ``Text`` column has stopwords removed.

    Rows with a missing ``Text`` value are dropped first, because
    ``remove_stopwords`` only operates on strings.  The input frame is left
    untouched: ``assign`` builds a new frame, which also avoids the
    SettingWithCopyWarning that assigning a column onto a ``dropna`` result
    can trigger.

    Args:
      articles (pandas.DataFrame): frame with a ``Text`` column of strings

    Returns:
      pandas.DataFrame: the surviving rows, with ``Text`` replaced by its
        stopword-free version

    """
    with_text = articles.dropna(subset=['Text'])
    return with_text.assign(Text=with_text['Text'].map(remove_stopwords))


# Both corpora go through exactly the same cleaning step.
guardian_data = text_without_stopwords(guardian_data)
nyt_data = text_without_stopwords(nyt_data)

# Keep the cleaned corpora on disk next to the raw ones.
guardian_data.to_csv(PARENT_DIR + "/data/Guardian_no_sw.csv", index=False)
nyt_data.to_csv(PARENT_DIR + "/data/NYT_no_sw.csv", index=False)


#### LDA topic identification

In [26]:
import argparse
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from lda import LDA

In [None]:
def learn_topics(dataframe, topicnum, n_iter=8000, refresh=2000,
                 max_features=None, random_state=None):
    """
    Fit an LDA topic model to the ``Text`` column of ``dataframe``.

    Args:
      dataframe (pandas.DataFrame): frame with a ``Text`` column of strings
      topicnum (int): number of topics to learn
      n_iter (int): number of sampling iterations handed to ``LDA``
      refresh (int): ``LDA`` refresh interval, which controls how often
        progress is reported during fitting
      max_features (int or None): vocabulary cap handed to
        ``CountVectorizer``; ``None`` (the default) keeps every token that
        matches the pattern, an integer keeps only that many of the most
        frequent terms
      random_state (int or None): seed handed to ``LDA``; pass an integer to
        make the learned topics reproducible between runs

    Returns:
      tuple: ``(doc_topic, topic_word, vocabulary)`` where ``doc_topic`` is
        the result of ``LDA.fit_transform`` on the count matrix (one topic
        mixture per document), ``topic_word`` is the fitted model's
        ``topic_word_`` attribute, and ``vocabulary`` lists the terms in the
        column order of the count matrix

    """

    texts = dataframe['Text'].tolist()

    # Get vocabulary and word counts.  Tokens are lowercase unigrams with at
    # least 3 alphabetical, non-numeric characters; punctuation is treated as
    # a separator.  The vocabulary is only truncated when max_features is set.
    CVzer = CountVectorizer(token_pattern=r"(?u)\b[^\W\d]{3,}\b",
                            max_features=max_features,
                            lowercase=True)
    doc_vcnts = CVzer.fit_transform(texts)
    vocabulary = CVzer.get_feature_names_out()

    # Learn topics.  Refresh controls print frequency.
    lda_model = LDA(topicnum, n_iter=n_iter, refresh=refresh,
                    random_state=random_state)
    doc_topic = lda_model.fit_transform(doc_vcnts)
    topic_word = lda_model.topic_word_

    return doc_topic, topic_word, vocabulary

# Fit a 10-topic model to the Guardian articles.
doc_topic, topic_word, vocabulary = learn_topics(guardian_data, 10) 

In [None]:
def save_topicmodel(doc_topic, topic_word, vocabulary, outdir=None,
                    prefix="Guardian"):
    """
    Write a fitted topic model to three text files.

    Args:
      doc_topic (numpy.ndarray): per-document topic mixtures, written with
        ``np.savetxt`` to ``<prefix>TopicMixtures.txt``
      topic_word (numpy.ndarray): per-topic word weights, written with
        ``np.savetxt`` to ``<prefix>Topics.txt``
      vocabulary (iterable of str): terms, written one per line (in the given
        order) to ``<prefix>Vocab.txt``
      outdir (str or path-like, optional): directory for the files; defaults
        to ``PARENT_DIR + "/results"``.  It is created if it does not exist.
      prefix (str): file-name prefix identifying the corpus

    Returns:
      tuple: ``(topicmixture_outpath, topic_outpath, vocab_outpath)``

    """
    import os  # local import: only needed to create the output directory

    if outdir is None:
        outdir = PARENT_DIR + "/results"

    # A fresh checkout may not have the results directory yet.
    os.makedirs(outdir, exist_ok=True)

    ## Topic mixtures.
    topicmixture_outpath = f"{outdir}/{prefix}TopicMixtures.txt"
    np.savetxt(topicmixture_outpath, doc_topic)

    ## Topics.
    topic_outpath = f"{outdir}/{prefix}Topics.txt"
    np.savetxt(topic_outpath, topic_word)

    ## Vocabulary order.
    vocab_outpath = f"{outdir}/{prefix}Vocab.txt"
    with open(vocab_outpath, mode="w", encoding="utf-8") as f:
        f.writelines(f"{term}\n" for term in vocabulary)

    return topicmixture_outpath, topic_outpath, vocab_outpath

# Persist the fitted model; as the last expression in the cell, the returned
# output paths are displayed.
save_topicmodel(doc_topic, topic_word, vocabulary)

In [39]:

def KLdivergence_from_probdist_arrays(pdists0, pdists1):
    """
    Calculate KL divergence (in bits) between probability distributions held
    along the last axis of two equally shaped arrays.

    For each pair of distributions the result is
    ``sum(pdists1 * log2(pdists1 / pdists0))``, i.e. the divergence of
    ``pdists1`` from ``pdists0``.

    NOTE: elements of pdist* are assumed to be positive (non-zero), a
    necessary condition for using Kullback-Leibler Divergence.

    Args:
      pdists0 (array-like): distributions whose probabilities appear in the
        denominator of the log in the KL divergence summand
      pdists1 (array-like): distributions whose probabilities appear in the
        numerator of the log and weight each summand; must have the same
        shape as ``pdists0``

    Returns:
      numpy scalar or numpy.ndarray: a single divergence for 1-D input, one
        divergence per row for 2-D input, and in general an array with the
        last axis summed out

    Raises:
      AssertionError: if the two inputs differ in shape
      ValueError: if the inputs are 0-dimensional

    """
    pdists0 = np.asarray(pdists0)
    pdists1 = np.asarray(pdists1)

    assert pdists0.shape == pdists1.shape, 'pdist* shapes must be identical'

    if pdists0.ndim == 0:
        raise ValueError('pdist* must have at least one dimension')

    # Summing over the last axis covers the 1-D and 2-D cases alike and also
    # works for higher-rank stacks of distributions.
    return (pdists1 * np.log2(pdists1 / pdists0)).sum(axis=-1)

In [40]:
def novelty_transience_resonance(thetas_arr, scale):
    """
    Compute novelty, transience and resonance for every row of ``thetas_arr``
    that has ``scale`` rows before it and ``scale`` rows after it.

    Novelty is the mean KL divergence of the center row from each row in the
    preceding window, transience is the same mean over the following window,
    and resonance is novelty minus transience.

    Args:
      thetas_arr (numpy.ndarray): rows are topic mixtures
      scale (int): positive integer giving the window size on each side

    Returns:
      tuple: three lists ``(novelties, transiences, resonances)`` with one
        entry per eligible center row, in row order

    """
    row_count = thetas_arr.shape[0]

    novelties, transiences, resonances = [], [], []

    # Only rows with a complete window on both sides can act as a center.
    for center in range(scale, row_count - scale):
        theta = thetas_arr[center]

        # Window of rows leading up to the center, paired with as many copies
        # of the center mixture so the shapes line up row for row.
        past = thetas_arr[center - scale:center]
        past_center = np.tile(theta, reps=(past.shape[0], 1))

        # Window of rows following the center, paired the same way.
        future = thetas_arr[center + 1:center + scale + 1]
        future_center = np.tile(theta, reps=(future.shape[0], 1))

        # Average divergence of the center from each side.
        novelty = np.mean(
            KLdivergence_from_probdist_arrays(past, past_center))
        transience = np.mean(
            KLdivergence_from_probdist_arrays(future, future_center))

        novelties.append(novelty)
        transiences.append(transience)
        resonances.append(novelty - transience)

    return novelties, transiences, resonances

In [41]:
def save_novel_trans_reson(novelties, transiences, resonances, outdir=None,
                           prefix="Guardian"):
    """
    Write novelty, transience and resonance to a three-column text file.

    Each output row holds ``novelty transience resonance`` for one document,
    in the order the values were given.

    Args:
      novelties (sequence of float): novelty per document
      transiences (sequence of float): transience per document
      resonances (sequence of float): resonance per document
      outdir (str or path-like, optional): directory for the file; defaults
        to ``PARENT_DIR + "/results"``.  It is created if it does not exist.
      prefix (str): file-name prefix identifying the corpus; the file is
        named ``<prefix>NovelTransReson.txt``

    """
    import os  # local import: only needed to create the output directory

    if outdir is None:
        outdir = PARENT_DIR + "/results"

    # A fresh checkout may not have the results directory yet.
    os.makedirs(outdir, exist_ok=True)

    outpath = f"{outdir}/{prefix}NovelTransReson.txt"

    # np.vstack(zip(...)) hands NumPy an iterator, which current releases
    # reject; stacking the three sequences as columns yields the same
    # (n_documents, 3) table.
    np.savetxt(outpath, np.column_stack((novelties, transiences, resonances)))

In [42]:
# Score each document's topic mixture against windows of 10 documents on
# either side of it.
novelties, transiences, resonances = novelty_transience_resonance(doc_topic, 10)

In [None]:
# Write the three measures to the results directory.
save_novel_trans_reson(novelties, transiences, resonances)