In [165]:
import pandas as pd
import os
import regex as re
from datetime import datetime
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr, spearmanr
from metrics import utils
import numpy as np
from lda import LDA
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer
ROOT_DIR = os.path.dirname(os.path.abspath("__file__"))


In [166]:
def query_string_gen(file_dir, words):
    l=[]

    with open(file_dir, 'r') as file:
        lines = file.readlines()
    for line in lines:
        for word in words:
            if word in line:
                l.append(re.search(r'\d+', line).group())
    l = list(dict.fromkeys(l))

    string = ""
    for i,j in enumerate(l):
        if i==len(l)-1:
            string += f"Topic == {j}"
        else:
            string += f"Topic == {j} | "
    return string

In [167]:
# utils.unite_sources()

In [168]:
class NTR:
    def __init__(self) -> None:
        self.sources = ["AP", "Fox", "CNN", "ABC", "CBS", "NYT", "Mirror", "Reuters", "Express", "HuffPost", "Guardian", "DailyMail"]

    def kld_window(self, dataframe, date_start, date_end, kld_days_window):
        data = dataframe
        data["Date"] = pd.to_datetime(data["Date"])
        df_split = data.loc[(data["Date"] >= date_start) & (data["Date"] < date_end)]
        df_count = df_split.resample("D", on="Date").apply({"URL": "count"})
        daily_count = int(sum(df_count["URL"].tolist()) / len(df_count["URL"].tolist()))
        print(f"-> This dataset has an average of {daily_count} daily stories from {date_start} to {date_end}.")
        print(f"-> KLD window will be of {kld_days_window}*{daily_count} = {kld_days_window*daily_count} articles.\n")
        return kld_days_window * daily_count

    def learn_topics(self, dataframe, topicnum, vocabsize, num_iter):
        # Removes stopwords
        texts = dataframe["Text"].tolist()
        texts_no_sw = []
        for text in texts:
            text_no_sw = remove_stopwords(text)
            texts_no_sw.append(text_no_sw)

        # Get vocab and word counts. Use the top 10k most frequent
        # lowercase unigrams with at least 2 alphabetical, non-numeric characters,
        # punctuation treated as separators.
        texts = texts_no_sw
        count_vectorizer = CountVectorizer(token_pattern=r"(?u)\b[^\W\d]{2,}\b", max_features=vocabsize, lowercase=True)
        doc_vcnts = count_vectorizer.fit_transform(texts)
        vocabulary = count_vectorizer.get_feature_names_out()

        # Learn topics.
        lda_model = LDA(topicnum, n_iter=num_iter, refresh=100)
        doc_topic = lda_model.fit_transform(doc_vcnts)
        topic_word = lda_model.topic_word_

        return doc_topic, topic_word, vocabulary

    def save_topicmodel(self, path, doc_topic, topic_word, vocabulary, source):

        if not os.path.exists(path):
            os.makedirs(path)

        topicmixture_outpath = os.path.join(path, source + "_TopicMixtures.txt")
        np.savetxt(topicmixture_outpath, doc_topic)
        topic_outpath = os.path.join(path, source + "_Topics.txt")
        np.savetxt(topic_outpath, topic_word)
        vocab_outpath = os.path.join(path, source + "_Vocab.txt")
        with open(vocab_outpath, mode="w", encoding="utf-8") as file:
            for word in vocabulary:
                file.write(word + "\n")

        return topicmixture_outpath, topic_outpath, vocab_outpath

    def kld_from_probdists(self, pdists0, pdists1):

        assert pdists0.shape == pdists1.shape, "pdist* shapes must be identical"
        if len(pdists0.shape) == 1:
            kl_divergences = (pdists1 * np.log2(pdists1 / pdists0)).sum()
        elif len(pdists0.shape) == 2:
            kl_divergences = (pdists1 * np.log2(pdists1 / pdists0)).sum(axis=1)

        return kl_divergences

    def novelty_transience_resonance(self, thetas_arr, scale):

        speechstart = scale
        speechend = thetas_arr.shape[0] - scale
        novelties = []
        transiences = []
        resonances = []
        for j in range(speechstart, speechend, 1):
            center_theta = thetas_arr[j]
            after_boxend = j + scale + 1
            before_boxstart = j - scale
            before_theta_arr = thetas_arr[before_boxstart:j]
            beforenum = before_theta_arr.shape[0]
            before_centertheta_arr = np.tile(center_theta, reps=(beforenum, 1))
            after_theta_arr = thetas_arr[j + 1 : after_boxend]
            afternum = after_theta_arr.shape[0]
            after_centertheta_arr = np.tile(center_theta, reps=(afternum, 1))
            before_klds = self.kld_from_probdists(before_theta_arr, before_centertheta_arr)
            after_klds = self.kld_from_probdists(after_theta_arr, after_centertheta_arr)
            novelty = np.mean(before_klds)
            transience = np.mean(after_klds)
            novelties.append(novelty)
            transiences.append(transience)
            resonances.append(novelty - transience)
        for _ in range(0, scale):
            transiences.insert(0, 0)
            transiences.append(0)
            novelties.insert(0, 0)
            novelties.append(0)
            resonances.insert(0, 0)
            resonances.append(0)

        return novelties, transiences, resonances

    def save_novel_trans_reson(self, path, novelties, transiences, resonances, source):

        outpath = os.path.join(path, source + "_NovelTransReson.txt")
        np.savetxt(outpath, np.vstack(list(zip(novelties, transiences, resonances))))

    def routine(self, date_start, date_end, kld_days_window, vocabsize, num_iter):
        
        print("-> Reading first topic modeling results (LDA)...")
        results = pd.read_csv(os.path.join(ROOT_DIR,"results","All_Results.csv"))
        query_string = query_string_gen(os.path.join(ROOT_DIR,"results","All_TopicsWords.txt"), 
            words=["drones", "troops", "strike", "killed", "attack", "shelling", "strikes"]
            )
        print(f"-> Conflict-related topics on first LDA analysis: {query_string}\n")
        results = results.query(query_string)
        results = results.drop(columns=["Novelty","Transience","Resonance","Comments"])

        source_dfs = []
        for i, source in enumerate(self.sources):
            query = results.query(f"Source == '{str(source)}'")
            source_dfs.append(query.copy())
            print(f"{source} length: {len(query)}")
        results["Source"] = "All"
        source_dfs.append(results.copy())
        print(f"All length: {len(results)}\n")


        for i, j in enumerate(source_dfs):
            data = source_dfs[i]
            source = data.at[data.index[0], 'Source']
            print(f"-> Starting {source} second topic modeling (LDA)...")
            scale = self.kld_window(data, date_start, date_end, kld_days_window)

            doc_topic, topic_word, vocabulary = self.learn_topics(data, 30, vocabsize, num_iter)

            # getting topic of each text
            topics = []
            for i in range(len(data)):
                topics.append(doc_topic[i].argmax())

            self.save_topicmodel(os.path.join(ROOT_DIR, "results_two_lda"), doc_topic, topic_word, vocabulary, source)
            novelties, transiences, resonances = self.novelty_transience_resonance(doc_topic, scale)
            self.save_novel_trans_reson(os.path.join(ROOT_DIR, "results_two_lda"), novelties, transiences, resonances, source)
            ntr_data = data
            ntr_data["Novelty"] = novelties
            ntr_data["Transience"] = transiences
            ntr_data["Resonance"] = resonances
            ntr_data["Topic2"] = topics
            ntr_data.to_csv(os.path.join(ROOT_DIR, "results_two_lda", source + "_Results.csv"), index=False)

            # geting words of each topic
            words = []
            for i, topic_dist in enumerate(topic_word):
                topic_words = np.array(vocabulary)[np.argsort(topic_dist)][:-16:-1]
                words.append(f"Topic {i}: {' '.join(topic_words)}")
            with open(os.path.join(ROOT_DIR, "results_two_lda", source + "_TopicsWords.txt"), "w") as file:
                file.write("\n".join(map(str, words)))

            print("")

        print("-> All LDA data saved.\n")


In [169]:
NTR().routine(date_start="2022-03-01", date_end="2022-08-01", kld_days_window=1, vocabsize=10000, num_iter=300)

-> Reading first topic modeling results (LDA)...
-> Conflict-related topics on first LDA analysis: Topic == 3 | Topic == 13 | Topic == 14 | Topic == 17 | Topic == 29

AP length: 1161
Fox length: 1110
CNN length: 518
ABC length: 189
CBS length: 547
NYT length: 688
Mirror length: 908
Reuters length: 4610
Express length: 3903
HuffPost length: 83
Guardian length: 1344
DailyMail length: 2210
All length: 17271

-> Starting AP second topic modeling (LDA)...
-> This dataset has an average of 3 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*3 = 3 articles.



INFO:lda:n_documents: 1161
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 894200
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -10501778
INFO:lda:<100> log likelihood: -7257301
INFO:lda:<200> log likelihood: -7200165
INFO:lda:<299> log likelihood: -7170132



-> Starting Fox second topic modeling (LDA)...
-> This dataset has an average of 4 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*4 = 4 articles.



INFO:lda:n_documents: 1110
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 330320
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -4035057
INFO:lda:<100> log likelihood: -2718888
INFO:lda:<200> log likelihood: -2694723
INFO:lda:<299> log likelihood: -2687732
INFO:lda:n_documents: 518
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 271472
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300



-> Starting CNN second topic modeling (LDA)...
-> This dataset has an average of 1 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*1 = 1 articles.



INFO:lda:<0> log likelihood: -3338562
INFO:lda:<100> log likelihood: -2260786
INFO:lda:<200> log likelihood: -2242463
INFO:lda:<299> log likelihood: -2233604
INFO:lda:n_documents: 189
INFO:lda:vocab_size: 9501
INFO:lda:n_words: 84065
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300



-> Starting ABC second topic modeling (LDA)...
-> This dataset has an average of 0 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*0 = 0 articles.



INFO:lda:<0> log likelihood: -1073548
INFO:lda:<100> log likelihood: -706160
INFO:lda:<200> log likelihood: -698766
INFO:lda:<299> log likelihood: -695801
INFO:lda:n_documents: 547
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 201187
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300



-> Starting CBS second topic modeling (LDA)...
-> This dataset has an average of 1 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*1 = 1 articles.



INFO:lda:<0> log likelihood: -2518275
INFO:lda:<100> log likelihood: -1677125
INFO:lda:<200> log likelihood: -1661079
INFO:lda:<299> log likelihood: -1656249



-> Starting NYT second topic modeling (LDA)...
-> This dataset has an average of 2 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*2 = 2 articles.



INFO:lda:n_documents: 688
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 412261
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -5030803
INFO:lda:<100> log likelihood: -3493707
INFO:lda:<200> log likelihood: -3464476
INFO:lda:<299> log likelihood: -3450698
INFO:lda:n_documents: 908
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 278036
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300



-> Starting Mirror second topic modeling (LDA)...
-> This dataset has an average of 4 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*4 = 4 articles.



INFO:lda:<0> log likelihood: -3401406
INFO:lda:<100> log likelihood: -2336320
INFO:lda:<200> log likelihood: -2315928
INFO:lda:<299> log likelihood: -2312221



-> Starting Reuters second topic modeling (LDA)...
-> This dataset has an average of 14 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*14 = 14 articles.



INFO:lda:n_documents: 4610
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 736852
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -8859137
INFO:lda:<100> log likelihood: -5982031
INFO:lda:<200> log likelihood: -5931792
INFO:lda:<299> log likelihood: -5914231



-> Starting Express second topic modeling (LDA)...
-> This dataset has an average of 12 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*12 = 12 articles.



INFO:lda:n_documents: 3903
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 1100784
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -13034087
INFO:lda:<100> log likelihood: -9254729
INFO:lda:<200> log likelihood: -9184887
INFO:lda:<299> log likelihood: -9150466
INFO:lda:n_documents: 83
INFO:lda:vocab_size: 5243
INFO:lda:n_words: 22655
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -289851



-> Starting HuffPost second topic modeling (LDA)...
-> This dataset has an average of 0 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*0 = 0 articles.



INFO:lda:<100> log likelihood: -193871
INFO:lda:<200> log likelihood: -191776
INFO:lda:<299> log likelihood: -191233



-> Starting Guardian second topic modeling (LDA)...
-> This dataset has an average of 4 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*4 = 4 articles.



INFO:lda:n_documents: 1344
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 637303
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -7680631
INFO:lda:<100> log likelihood: -5431263
INFO:lda:<200> log likelihood: -5387461
INFO:lda:<299> log likelihood: -5375603



-> Starting DailyMail second topic modeling (LDA)...
-> This dataset has an average of 8 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*8 = 8 articles.



INFO:lda:n_documents: 2210
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 1862966
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -21693098
INFO:lda:<100> log likelihood: -15913299
INFO:lda:<200> log likelihood: -15783823
INFO:lda:<299> log likelihood: -15739493



-> Starting All second topic modeling (LDA)...
-> This dataset has an average of 58 daily stories from 2022-03-01 to 2022-08-01.
-> KLD window will be of 1*58 = 58 articles.



INFO:lda:n_documents: 17271
INFO:lda:vocab_size: 10000
INFO:lda:n_words: 6747630
INFO:lda:n_topics: 30
INFO:lda:n_iter: 300
INFO:lda:<0> log likelihood: -77389354
INFO:lda:<100> log likelihood: -56929592
INFO:lda:<200> log likelihood: -56622665
INFO:lda:<299> log likelihood: -56525380



-> All LDA data saved.



# Correlation

In [170]:
def spearman_pearson(dir,source,resample,pval):
    events = pd.read_csv(os.path.join(ROOT_DIR,"data","Ukraine_Black_Sea_2020_2022_Nov18.csv"), parse_dates=["EVENT_DATE"])
    events = events[["EVENT_DATE","EVENT_TYPE","FATALITIES"]].rename(columns={"EVENT_DATE":"Date","EVENT_TYPE":"Count","FATALITIES":"Fatalities"})  # pegando somente colunas relevantes
    events = events.set_index("Date") # convertendo coluna de datas pra datetime e setando indice
    events = events.resample(resample).agg({'Count': 'count', 'Fatalities': 'sum'}) # resample: contagem de eventos, soma de fatalidades

    results = pd.read_csv(os.path.join(dir,f"{source}_Results.csv"), parse_dates=["Date"],index_col=["Date"])
    results = results[["Resonance","Novelty","Transience"]]
    results = results.sort_index()
    results = results.loc["2018-01-01":"2022-11-18"] # Matching other dataframe
    results = results.resample(resample).sum()
    results[["Count","Fatalities"]] = events[["Count","Fatalities"]].copy()
    results = results.loc["2020-01-01":"2022-11-16"] 

    s_fatalities, s_fatalities_p = spearmanr(results['Fatalities'], results['Resonance'])
    p_fatalities, p_fatalities_p = pearsonr(results['Fatalities'], results['Resonance'])
    s_events, s_events_p = spearmanr(results['Count'], results['Resonance'])
    p_events, p_events_p = pearsonr(results['Count'], results['Resonance'])

    if s_fatalities_p <= pval:
        print(f'{source} (R X F) Spearman: {s_fatalities:.4f} p-value: {s_fatalities_p:.4f}')
    if p_fatalities_p <= pval:
        print(f'{source} (R X F) Pearson: {p_fatalities:.4f} p-value: {p_fatalities_p:.4f}')
    if s_events_p <= pval:
        print(f'{source} (R X CE) Spearman: {s_events:.4f} p-value: {s_events_p:.4f}')
    if p_events_p <= pval:
        print(f'{source} (R X CE) Pearson: {p_events:.4f} p-value: {p_events_p:.4f}')

### 1 LDA

In [171]:
sources = ["AP", "Fox", "CNN", "ABC", "CBS", "NYT", "Mirror", "Reuters", "Express", "HuffPost", "Guardian", "DailyMail", "All"]
for source in sources:
    spearman_pearson(os.path.join(ROOT_DIR,"results"),source,'W',0.05)

Fox (R X F) Pearson: 0.1852 p-value: 0.0233
NYT (R X F) Spearman: 0.1731 p-value: 0.0341
NYT (R X F) Pearson: 0.2085 p-value: 0.0105
NYT (R X CE) Pearson: 0.1818 p-value: 0.0260
Reuters (R X F) Pearson: 0.1934 p-value: 0.0177
Express (R X F) Pearson: 0.1919 p-value: 0.0247
All (R X F) Pearson: 0.2724 p-value: 0.0007


### 2 LDA

In [173]:
for source in sources:
    spearman_pearson(os.path.join(ROOT_DIR,"results_two_lda"),source,'W',0.05)

Express (R X F) Spearman: -0.2437 p-value: 0.0047
Express (R X CE) Spearman: -0.2148 p-value: 0.0131
Express (R X CE) Pearson: -0.2176 p-value: 0.0119
Guardian (R X F) Spearman: -0.1641 p-value: 0.0448
