In [23]:
import pandas as pd
import os
import regex as re
from datetime import datetime
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr, spearmanr
from metrics import utils
import numpy as np
from lda import LDA
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer
ROOT_DIR = os.path.dirname(os.path.abspath("__file__"))


In [24]:
def query_string_gen(file_dir, words):
    l=[]

    with open(file_dir, 'r') as file:
        lines = file.readlines()
    for line in lines:
        for word in words:
            if word in line:
                l.append(re.search(r'\d+', line).group())
    l = list(dict.fromkeys(l))

    string = ""
    for i,j in enumerate(l):
        if i==len(l)-1:
            string += f"Topic == {j}"
        else:
            string += f"Topic == {j} | "
    return string

In [25]:
utils.unite_sources()

-> Current Dataset:
AP: 6623 Articles
Fox: 5346 Articles
CNN: 3205 Articles
ABC: 1794 Articles
CBS: 3784 Articles
NYT: 3148 Articles
Mirror: 2053 Articles
Reuters: 18213 Articles
Express: 10804 Articles
HuffPost: 682 Articles
Guardian: 5293 Articles
DailyMail: 8239 Articles
-> Saved CSV with 69184 articles.



In [56]:
class NTR:
    def __init__(self) -> None:
        self.sources = ["AP", "Fox", "CNN", "ABC", "CBS", "NYT", "Mirror", "Reuters", "Express", "HuffPost", "Guardian", "DailyMail"]

    def kld_window(self, dataframe, date_start, date_end, kld_days_window):
        data = dataframe
        data["Date"] = pd.to_datetime(data["Date"])
        df_split = data.loc[(data["Date"] >= date_start) & (data["Date"] < date_end)]
        df_count = df_split.resample("D", on="Date").apply({"URL": "count"})
        daily_count = int(sum(df_count["URL"].tolist()) / len(df_count["URL"].tolist()))
        print(f"-> This dataset has an average of {daily_count} daily stories from {date_start} to {date_end}.")
        print(f"-> KLD window will be of {kld_days_window}*{daily_count} = {kld_days_window*daily_count} articles.\n")
        return kld_days_window * daily_count

    def learn_topics(self, dataframe, topicnum, vocabsize, num_iter):
        # Removes stopwords
        texts = dataframe["Text"].tolist()
        texts_no_sw = []
        for text in texts:
            text_no_sw = remove_stopwords(text)
            texts_no_sw.append(text_no_sw)

        # Get vocab and word counts. Use the top 10k most frequent
        # lowercase unigrams with at least 2 alphabetical, non-numeric characters,
        # punctuation treated as separators.
        texts = texts_no_sw
        count_vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\W\d]{2,}\b", max_features=vocabsize, lowercase=True)
        doc_vcnts = count_vectorizer.fit_transform(texts)
        vocabulary = count_vectorizer.get_feature_names_out()

        # Learn topics.
        lda_model = LDA(topicnum, n_iter=num_iter, refresh=100)
        doc_topic = lda_model.fit_transform(doc_vcnts)
        topic_word = lda_model.topic_word_

        return doc_topic, topic_word, vocabulary

    def save_topicmodel(self, path, doc_topic, topic_word, vocabulary, source):

        if not os.path.exists(path):
            os.makedirs(path)

        topicmixture_outpath = os.path.join(path, source + "_TopicMixtures.txt")
        np.savetxt(topicmixture_outpath, doc_topic)
        topic_outpath = os.path.join(path, source + "_Topics.txt")
        np.savetxt(topic_outpath, topic_word)
        vocab_outpath = os.path.join(path, source + "_Vocab.txt")
        with open(vocab_outpath, mode="w", encoding="utf-8") as file:
            for word in vocabulary:
                file.write(word + "\n")

        return topicmixture_outpath, topic_outpath, vocab_outpath

    def kld_from_probdists(self, pdists0, pdists1):

        assert pdists0.shape == pdists1.shape, "pdist* shapes must be identical"
        if len(pdists0.shape) == 1:
            kl_divergences = (pdists1 * np.log2(pdists1 / pdists0)).sum()
        elif len(pdists0.shape) == 2:
            kl_divergences = (pdists1 * np.log2(pdists1 / pdists0)).sum(axis=1)

        return kl_divergences

    def novelty_transience_resonance(self, thetas_arr, scale):

        speechstart = scale
        speechend = thetas_arr.shape[0] - scale
        novelties = []
        transiences = []
        resonances = []
        for j in range(speechstart, speechend, 1):
            center_theta = thetas_arr[j]
            after_boxend = j + scale + 1
            before_boxstart = j - scale
            before_theta_arr = thetas_arr[before_boxstart:j]
            beforenum = before_theta_arr.shape[0]
            before_centertheta_arr = np.tile(center_theta, reps=(beforenum, 1))
            after_theta_arr = thetas_arr[j + 1 : after_boxend]
            afternum = after_theta_arr.shape[0]
            after_centertheta_arr = np.tile(center_theta, reps=(afternum, 1))
            before_klds = self.kld_from_probdists(before_theta_arr, before_centertheta_arr)
            after_klds = self.kld_from_probdists(after_theta_arr, after_centertheta_arr)
            novelty = np.mean(before_klds)
            transience = np.mean(after_klds)
            novelties.append(novelty)
            transiences.append(transience)
            resonances.append(novelty - transience)
        for _ in range(0, scale):
            transiences.insert(0, 0)
            transiences.append(0)
            novelties.insert(0, 0)
            novelties.append(0)
            resonances.insert(0, 0)
            resonances.append(0)

        return novelties, transiences, resonances

    def save_novel_trans_reson(self, path, novelties, transiences, resonances, source):

        outpath = os.path.join(path, source + "_NovelTransReson.txt")
        np.savetxt(outpath, np.vstack(list(zip(novelties, transiences, resonances))))

    def routine(self, date_start, date_end, kld_days_window, vocabsize, num_iter):
        
        print("-> Reading first topic modeling results (LDA)...")
        results = pd.read_csv(os.path.join(ROOT_DIR,"results","All_Results.csv"))
        query_string = query_string_gen(os.path.join(ROOT_DIR,"results","All_TopicsWords.txt"), 
            words=["drones", "troops", "strike", "killed", "attack", "shelling", "strikes"]
            )
        print(f"-> Conflict-related topics on first LDA analysis: {query_string}\n")
        results = results.query(query_string)
        results = results.drop(columns=["Novelty","Transience","Resonance","Comments"])

        source_dfs = []
        for i, source in enumerate(self.sources):
            query = results.query(f"Source == '{str(source)}'")
            source_dfs.append(query.copy())
            print(f"{source} length: {len(query)}")
        results["Source"] = "All"
        source_dfs.append(results.copy())
        print(f"All length: {len(results)}\n")


        for i, j in enumerate(source_dfs):
            data = source_dfs[i]
            source = data.at[data.index[0], 'Source']
            print(f"-> Starting {source} second topic modeling (LDA)...")
            scale = self.kld_window(data, date_start, date_end, kld_days_window)

            doc_topic, topic_word, vocabulary = self.learn_topics(data, 30, vocabsize, num_iter)

            # getting topic of each text
            topics = []
            for i in range(len(data)):
                topics.append(doc_topic[i].argmax())

            self.save_topicmodel(os.path.join(ROOT_DIR, "results_two_lda"), doc_topic, topic_word, vocabulary, source)
            novelties, transiences, resonances = self.novelty_transience_resonance(doc_topic, scale)
            self.save_novel_trans_reson(os.path.join(ROOT_DIR, "results_two_lda"), novelties, transiences, resonances, source)
            ntr_data = data
            ntr_data["Novelty"] = novelties
            ntr_data["Transience"] = transiences
            ntr_data["Resonance"] = resonances
            ntr_data["Topic2"] = topics
            ntr_data.to_csv(os.path.join(ROOT_DIR, "results_two_lda", source + "_Results.csv"), index=False)

            # geting words of each topic
            words = []
            for i, topic_dist in enumerate(topic_word):
                topic_words = np.array(vocabulary)[np.argsort(topic_dist)][:-16:-1]
                words.append(f"Topic {i}: {' '.join(topic_words)}")
            with open(os.path.join(ROOT_DIR, "results_two_lda", source + "_TopicsWords.txt"), "w") as file:
                file.write("\n".join(map(str, words)))

            print("")

        print("-> All LDA data saved.\n")


In [57]:
NTR().routine(date_start="2022-03-01", date_end="2022-08-01", kld_days_window=2, vocabsize=10000, num_iter=300)

-> Reading first topic modeling results (LDA)...
-> Conflict-related topics on first LDA analysis: Topic == 4 | Topic == 7 | Topic == 20 | Topic == 45 | Topic == 47 | Topic == 49

AP length: 663
Fox length: 938
CNN length: 515
ABC length: 260
CBS length: 543
NYT length: 596
Mirror length: 623
Reuters length: 2736
Express length: 3004
HuffPost length: 81
Guardian length: 767
DailyMail length: 1758
All length: 12484

-> Starting AP second topic modeling (LDA)...
-> This dataset has an average of 1 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*1 = 2 articles.



INFO:lda:n_documents: 663
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 522391
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -6207189
INFO:lda:<100> log likelihood: -4255133
INFO:lda:<200> log likelihood: -4220150
INFO:lda:<299> log likelihood: -4204303



-> Starting Fox second topic modeling (LDA)...
-> This dataset has an average of 3 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*3 = 6 articles.



INFO:lda:n_documents: 938
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 300467
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -3676800
INFO:lda:<100> log likelihood: -2479846
INFO:lda:<200> log likelihood: -2460378
INFO:lda:<299> log likelihood: -2450753



-> Starting CNN second topic modeling (LDA)...
-> This dataset has an average of 1 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*1 = 2 articles.



INFO:lda:n_documents: 515
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 270644
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -3300390
INFO:lda:<100> log likelihood: -2231738
INFO:lda:<200> log likelihood: -2212356
INFO:lda:<299> log likelihood: -2202899
INFO:lda:n_documents: 260
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 117162
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300



-> Starting ABC second topic modeling (LDA)...
-> This dataset has an average of 0 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*0 = 0 articles.



INFO:lda:<0> log likelihood: -1481115
INFO:lda:<100> log likelihood: -975691
INFO:lda:<200> log likelihood: -965249
INFO:lda:<299> log likelihood: -962364
INFO:lda:n_documents: 543
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 195211
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300



-> Starting CBS second topic modeling (LDA)...
-> This dataset has an average of 1 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*1 = 2 articles.



INFO:lda:<0> log likelihood: -2438649
INFO:lda:<100> log likelihood: -1622772
INFO:lda:<200> log likelihood: -1608721
INFO:lda:<299> log likelihood: -1602729



-> Starting NYT second topic modeling (LDA)...
-> This dataset has an average of 1 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*1 = 2 articles.



INFO:lda:n_documents: 596
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 382326
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -4658176
INFO:lda:<100> log likelihood: -3196102
INFO:lda:<200> log likelihood: -3170519
INFO:lda:<299> log likelihood: -3159295
INFO:lda:n_documents: 623
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 198041
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300



-> Starting Mirror second topic modeling (LDA)...
-> This dataset has an average of 3 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*3 = 6 articles.



INFO:lda:<0> log likelihood: -2427817
INFO:lda:<100> log likelihood: -1660150
INFO:lda:<200> log likelihood: -1645075
INFO:lda:<299> log likelihood: -1641339



-> Starting Reuters second topic modeling (LDA)...
-> This dataset has an average of 8 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*8 = 16 articles.



INFO:lda:n_documents: 2736
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 463909
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -5621162
INFO:lda:<100> log likelihood: -3766377
INFO:lda:<200> log likelihood: -3736863
INFO:lda:<299> log likelihood: -3728293



-> Starting Express second topic modeling (LDA)...
-> This dataset has an average of 9 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*9 = 18 articles.



INFO:lda:n_documents: 3004
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 857281
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -10178323
INFO:lda:<100> log likelihood: -7187780
INFO:lda:<200> log likelihood: -7122041
INFO:lda:<299> log likelihood: -7098328
INFO:lda:n_documents: 81
INFO:lda:vocab_size: 5199
INFO:lda:n_words: 24393
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -310446



-> Starting HuffPost second topic modeling (LDA)...
-> This dataset has an average of 0 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*0 = 0 articles.



INFO:lda:<100> log likelihood: -205362
INFO:lda:<200> log likelihood: -204087
INFO:lda:<299> log likelihood: -203376



-> Starting Guardian second topic modeling (LDA)...
-> This dataset has an average of 2 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*2 = 4 articles.



INFO:lda:n_documents: 767
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 363801
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -4445640
INFO:lda:<100> log likelihood: -3072140
INFO:lda:<200> log likelihood: -3048979
INFO:lda:<299> log likelihood: -3041124



-> Starting DailyMail second topic modeling (LDA)...
-> This dataset has an average of 6 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*6 = 12 articles.



INFO:lda:n_documents: 1758
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 1655278
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -19240875
INFO:lda:<100> log likelihood: -14072162
INFO:lda:<200> log likelihood: -13932201
INFO:lda:<299> log likelihood: -13882973



-> Starting All second topic modeling (LDA)...
-> This dataset has an average of 40 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 2*40 = 80 articles.



INFO:lda:n_documents: 12484
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 5277969
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -60545641
INFO:lda:<100> log likelihood: -44574670
INFO:lda:<200> log likelihood: -44288865
INFO:lda:<299> log likelihood: -44213028



-> All LDA data saved.



# Correlation

In [51]:
def spearman_pearson(dir,source,filter=False):
    events = pd.read_csv(os.path.join(ROOT_DIR,"data","Ukraine_Black_Sea_2020_2022_Nov18.csv"), parse_dates=["EVENT_DATE"])
    events = events[["EVENT_DATE","EVENT_TYPE","FATALITIES"]].rename(columns={"EVENT_DATE":"Date","EVENT_TYPE":"Count","FATALITIES":"Fatalities"})  # pegando somente colunas relevantes
    events = events.set_index("Date") # convertendo coluna de datas pra datetime e setando indice
    events = events.resample('D').agg({'Count': 'count', 'Fatalities': 'sum'}) # resample: contagem de eventos, soma de fatalidades

    results = pd.read_csv(os.path.join(dir,f"{source}_Results.csv"), parse_dates=["Date"],index_col=["Date"])
    if filter == True:
        results = results.query(query_string_gen(os.path.join(dir,f"{source}_TopicsWords.txt"), words=["missiles", "strike", "killed", "attack", "shelling", "missile"]))
    results = results[["Resonance","Novelty","Transience"]]
    results = results.sort_index()
    results = results.loc["2018-01-01":"2022-11-18"] # Matching other dataframe
    results = results.resample('D').sum()
    results[["Count","Fatalities"]] = events[["Count","Fatalities"]].copy()
    results = results.loc["2020-01-01":"2022-11-16"] 

    s_fatalities, s_fatalities_p = spearmanr(results['Fatalities'], results['Resonance'])
    p_fatalities, p_fatalities_p = pearsonr(results['Fatalities'], results['Resonance'])
    s_events, s_events_p = spearmanr(results['Count'], results['Resonance'])
    p_events, p_events_p = pearsonr(results['Count'], results['Resonance'])

    if s_fatalities_p <= 0.05:
        print(f'{source} (Filter = {filter}) R X FAT: Spearman: {s_fatalities:.4f} p-value: {s_fatalities_p:.4f}')
    if p_fatalities_p <= 0.05:
        print(f'{source} (Filter = {filter}) R X FAT: Pearson: {p_fatalities:.4f} p-value: {p_fatalities_p:.4f}')
    if s_events_p <= 0.05:
        print(f'{source} (Filter = {filter}) R X EVE: Spearman: {s_events:.4f} p-value: {s_events_p:.4f}')
    if p_events_p <= 0.05:
        print(f'{source} (Filter = {filter}) R X EVE: Pearson: {p_events:.4f} p-value: {p_events_p:.4f}')

### 1 LDA

In [52]:
sources = ["AP", "Fox", "CNN", "ABC", "CBS", "NYT", "Mirror", "Reuters", "Express", "HuffPost", "Guardian", "DailyMail", "All"]
for source in sources:
    for i in [True,False]:
        spearman_pearson(os.path.join(ROOT_DIR,"results"),source,i)

AP (Filter = False) R X EVE: Spearman: 0.0649 p-value: 0.0364
ABC (Filter = True) R X FAT: Spearman: 0.0606 p-value: 0.0497
ABC (Filter = True) R X EVE: Spearman: 0.0755 p-value: 0.0144
ABC (Filter = True) R X EVE: Pearson: 0.0628 p-value: 0.0419
Express (Filter = True) R X EVE: Pearson: -0.0951 p-value: 0.0034
HuffPost (Filter = True) R X FAT: Spearman: 0.0932 p-value: 0.0030
DailyMail (Filter = False) R X EVE: Spearman: -0.0649 p-value: 0.0389


### 2 LDA

In [53]:
for source in sources:
    spearman_pearson(os.path.join(ROOT_DIR,"results_two_lda"),source)

CNN (Filter = False) R X FAT: Pearson: 0.0769 p-value: 0.0126
CBS (Filter = False) R X FAT: Pearson: 0.0748 p-value: 0.0157
DailyMail (Filter = False) R X FAT: Pearson: 0.0968 p-value: 0.0024
