In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from tqdm import tqdm
from collections import defaultdict
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller
from statsmodels.formula.api import glm
from datetime import datetime
from urllib.request import urlopen, Request
from urllib.parse import quote
from sklearn.metrics import mean_squared_error
import json
import time
import ast
import locale
locale.setlocale(locale.LC_ALL, "nl_NL")

tqdm.pandas()

In [2]:
with open('paper_selectors.txt', 'r') as file:
    papers = ast.literal_eval(file.read())

with open('party_selectors.txt', 'r') as file:
    parties = ast.literal_eval(file.read())

with open('party_synonyms.txt', 'r') as file:
    party_synonyms = ast.literal_eval(file.read())

with open('months.txt', 'r') as file:
    months = ast.literal_eval(file.read())

with open('min_mentions_per_article.txt', 'r') as file:
    min_val = int(file.read())

for synonym in party_synonyms:
    parties = [x for x in parties if x not in synonym[1:]]

body_sentiments = ['body_neu', 'body_pos', 'body_neg', 'body_compound']
title_sentiments = ['title_neu', 'title_pos', 'title_neg', 'title_compound']

In [3]:
df = pd.read_json("../../data/02_assigned_articles.json")

Remove all synonym names

In [4]:
def replace(name):
    synonyms = {
        "AD/Algemeen Dagblad.nl": "AD",
        "De Telegraaf.nl" : "Telegraaf",
        "De Volkskrant.nl" : "Volkskrant",
        "Het Parool.nl": "Parool",
        "Trouw.nl": "Trouw",
        'de Volkskrant': 'Volkskrant',
        'NRC Handelsblad': 'NRC',
        'Het Parool': 'Parool',
        'De Telegraaf': 'Telegraaf',
        'AD/Algemeen Dagblad': 'AD'
    }
    if name in synonyms.keys():
        return synonyms[name]
    else:
        return name
    
df['Newspaper'] = df['Newspaper'].apply(lambda x: replace(x))
display(df.head())


papers = list(df['Newspaper'].unique())
papers.remove(None)
print(papers)

Unnamed: 0,Newspaper,Title,Date,Author,Section,Body,VVD,CDA,D66,SGP,...,PvdD,PVV,50Plus,Denk,FvD,Bij1,BBB,Volt,BVNL,NSC
0,AD,Omtzigt mengt zich in strijd met boek vol plan...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,5,0,1,...,0,3,0,0,1,0,4,0,0,4
1,Parool,Plofpartij of gamechanger: wat kunnen we verwa...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,6,0,1,...,0,3,0,0,1,0,6,0,0,6
2,Volkskrant,Kaarten op rechts zijn opnieuw geschud,2023-08-21,RAOUL DU PRÉ,Ten Eerste,"Analyse VVD sluit PVV niet uit Frans Weisglas,...",15,2,2,0,...,0,15,0,0,1,0,1,0,0,0
3,Trouw,Niet met de PVV,2023-08-21,STEVO AKKERMAN,Vandaag,Toen Dilan Yesilgöz zich opwierp als opvolger...,6,0,0,0,...,0,5,0,0,0,0,0,0,0,0
4,AD,VVD-kamerlid en woordvoerder sport Rudmer Heer...,2023-08-21,Politieke redactie,,VVD-Kamerlid Rudmer Heerema keert na de aankom...,4,1,1,0,...,0,0,0,0,0,0,0,0,0,0


['AD', 'Parool', 'Volkskrant', 'Trouw', 'Telegraaf', 'NRC']


In [5]:
df["Body"] = df["Body"].replace(r"'", "", regex=True)
df["Body"] = df["Body"].replace(r'"', "", regex=True)
df.iloc[519]['Body']

'Campagne: Eerste treffen van lijsttrekkers in de Kamer Al voor het officiële begin van het debat over de Algemene Politieke Beschouwingen geeft Pieter Omtzigt zijn visitekaartje af. Op zn Omtzigts. De parlementaire eenpitter - tevens virtueel de leider van op dit moment de grootste partij - speurt nog naar een specifiek tabelletje in de stukken van Prinsjesdag: We hebben nog geen informatie over de marginale druk voor eenverdieners.   Later blijkt dat de bewuste tabel toch echt met het parlement gedeeld is, maar Omtzigts punt is gemaakt: door allerlei fiscale regelingen houden mensen soms bar weinig extra geld over als ze flink meer gaan werken. Wie zijn brutoloon ziet stijgen van 37.000 naar 47.000 euro, houdt netto maar 900 euro extra over. Partijen hebben het hier over accijnsverlaging, doceert Omtzigt aan VVD-fractieleider Sophie Hermans. Maar eenverdieners met een inkomen van 47.000 euro kunnen niet eens een auto rijden, dat is te duur. Flirt met de PVV Zo drukt Omtzigt direct o

In [6]:
class Translator:
    @staticmethod
    def get_response_from_request(request):
        response = urlopen(request)
        return response.read()

    @staticmethod
    def get_translation_from_response(response):
        lists = json.loads(response)
        translation_sentences = []
        for list_ in lists[0]:
            translation_sentences.append(list_[0])
        translation = ''.join(translation_sentences)
        return translation

    @staticmethod
    def translate(input_text, target_lang, source_lang='auto'):
        url = f"https://translate.googleapis.com/translate_a/single?client=gtx&sl={source_lang}&tl={target_lang}&dt=t&q={quote(input_text)}"
        response = Translator.get_response_from_request(Request(url=url))
        return Translator.get_translation_from_response(response)

Assign sentiment score to each article

In [7]:
# max batch size = 10 per second before google API blocks
vader_model = SentimentIntensityAnalyzer()

scores = []
for idx, corpus in enumerate(tqdm(df['Body'])):
    if idx // 9 == 0:
        time.sleep(1)
    
    en_corpus = Translator.translate(corpus[:10000], 'en') # I don't know why that is the max length for a text, but otherwise it throws a HTTP error 400.

    if en_corpus[-3:] == ' no':
        en_corpus = en_corpus[:-3]

    scores = [vader_model.polarity_scores(en_corpus)] + scores 

df['body_vader_scores'] = scores

# since we are only interested in the content of the tweets, we will select it
df["Body"]

for key in df['body_vader_scores'].iloc[0].keys():
    df[f"body_{key}"] = df["body_vader_scores"].apply(
        lambda score_dict: score_dict[key]
)

df.head()

  0%|          | 0/6769 [00:00<?, ?it/s]

100%|██████████| 6769/6769 [20:08<00:00,  5.60it/s]


Unnamed: 0,Newspaper,Title,Date,Author,Section,Body,VVD,CDA,D66,SGP,...,Bij1,BBB,Volt,BVNL,NSC,body_vader_scores,body_neg,body_neu,body_pos,body_compound
0,AD,Omtzigt mengt zich in strijd met boek vol plan...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,5,0,1,...,0,4,0,0,4,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp...",0.0,0.935,0.065,0.4019
1,Parool,Plofpartij of gamechanger: wat kunnen we verwa...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,6,0,1,...,0,6,0,0,6,"{'neg': 0.046, 'neu': 0.833, 'pos': 0.121, 'co...",0.046,0.833,0.121,0.995
2,Volkskrant,Kaarten op rechts zijn opnieuw geschud,2023-08-21,RAOUL DU PRÉ,Ten Eerste,"Analyse VVD sluit PVV niet uit Frans Weisglas,...",15,2,2,0,...,0,1,0,0,0,"{'neg': 0.055, 'neu': 0.823, 'pos': 0.123, 'co...",0.055,0.823,0.123,0.9955
3,Trouw,Niet met de PVV,2023-08-21,STEVO AKKERMAN,Vandaag,Toen Dilan Yesilgöz zich opwierp als opvolger...,6,0,0,0,...,0,0,0,0,0,"{'neg': 0.136, 'neu': 0.808, 'pos': 0.056, 'co...",0.136,0.808,0.056,-0.9788
4,AD,VVD-kamerlid en woordvoerder sport Rudmer Heer...,2023-08-21,Politieke redactie,,VVD-Kamerlid Rudmer Heerema keert na de aankom...,4,1,1,0,...,0,0,0,0,0,"{'neg': 0.081, 'neu': 0.757, 'pos': 0.161, 'co...",0.081,0.757,0.161,0.9943


Assign sentiment score each article title

In [8]:
# max batch size = 10 per second before google API blocks
vader_model = SentimentIntensityAnalyzer()

scores = []
for idx, corpus in enumerate(tqdm(df['Title'])):
    if idx // 9 == 0:
        time.sleep(1)
    
    if isinstance(corpus, str):
        en_corpus = Translator.translate(corpus, 'en')
    else:
        en_corpus = ''

    scores = [vader_model.polarity_scores(en_corpus)] + scores 

df['title_vader_scores'] = scores

for key in df['title_vader_scores'].iloc[0].keys():
    df[f"title_{key}"] = df["title_vader_scores"].apply(
        lambda score_dict: score_dict[key]
)

df.head()

100%|██████████| 6769/6769 [16:08<00:00,  6.99it/s]  


Unnamed: 0,Newspaper,Title,Date,Author,Section,Body,VVD,CDA,D66,SGP,...,body_vader_scores,body_neg,body_neu,body_pos,body_compound,title_vader_scores,title_neg,title_neu,title_pos,title_compound
0,AD,Omtzigt mengt zich in strijd met boek vol plan...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,5,0,1,...,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp...",0.0,0.935,0.065,0.4019,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
1,Parool,Plofpartij of gamechanger: wat kunnen we verwa...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,6,0,1,...,"{'neg': 0.046, 'neu': 0.833, 'pos': 0.121, 'co...",0.046,0.833,0.121,0.995,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
2,Volkskrant,Kaarten op rechts zijn opnieuw geschud,2023-08-21,RAOUL DU PRÉ,Ten Eerste,"Analyse VVD sluit PVV niet uit Frans Weisglas,...",15,2,2,0,...,"{'neg': 0.055, 'neu': 0.823, 'pos': 0.123, 'co...",0.055,0.823,0.123,0.9955,"{'neg': 0.121, 'neu': 0.823, 'pos': 0.056, 'co...",0.121,0.823,0.056,-0.3612
3,Trouw,Niet met de PVV,2023-08-21,STEVO AKKERMAN,Vandaag,Toen Dilan Yesilgöz zich opwierp als opvolger...,6,0,0,0,...,"{'neg': 0.136, 'neu': 0.808, 'pos': 0.056, 'co...",0.136,0.808,0.056,-0.9788,"{'neg': 0.402, 'neu': 0.598, 'pos': 0.0, 'comp...",0.402,0.598,0.0,-0.8126
4,AD,VVD-kamerlid en woordvoerder sport Rudmer Heer...,2023-08-21,Politieke redactie,,VVD-Kamerlid Rudmer Heerema keert na de aankom...,4,1,1,0,...,"{'neg': 0.081, 'neu': 0.757, 'pos': 0.161, 'co...",0.081,0.757,0.161,0.9943,"{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compou...",0.0,0.69,0.31,0.2023


In [1]:
df.iloc[0]

NameError: name 'df' is not defined

In [12]:
descriptives = pd.DataFrame()

for key in df['body_vader_scores'].iloc[0]:
    descriptives[f"body {key}"] = df['body_vader_scores'].apply(lambda score_dict: score_dict[key]).describe()
    descriptives[f"title {key}"] = df['title_vader_scores'].apply(lambda score_dict: score_dict[key]).describe()
    
descriptives

Unnamed: 0,body neg,title neg,body neu,title neu,body pos,title pos,body compound,title compound
count,6769.0,6769.0,6769.0,6769.0,6769.0,6769.0,6769.0,6769.0
mean,0.064161,0.089124,0.834893,0.821103,0.10095,0.089183,0.501837,0.004961
std,0.037254,0.133842,0.052837,0.173175,0.038873,0.128796,0.742851,0.359454
min,0.0,0.0,0.61,0.0,0.0,0.0,-0.9999,-0.9313
25%,0.038,0.0,0.8,0.708,0.072,0.0,0.34,-0.2263
50%,0.059,0.0,0.835,0.833,0.099,0.0,0.9547,0.0
75%,0.084,0.166,0.872,1.0,0.127,0.167,0.9922,0.25
max,0.294,1.0,1.0,1.0,0.307,1.0,0.9999,0.9209


Writing away the calculated sentiment scores for each article and title. Grouped and seperate.

In [16]:
df.to_csv("../../data/03_data_scored.csv")

index = pd.date_range("2023-08-21", datetime.now().strftime("%Y-%m-%d"))

party_articles = {}

for party in parties:
    party_articles[party] = df[df[party].ge(min_val)]

party_sentiment = defaultdict()

for party in parties:
    party_sentiment[party] = defaultdict()
    for key in body_sentiments + title_sentiments:
        party_sentiment[party][key] = party_articles[party].groupby(party_articles[party]['Date'])[key].mean().reset_index()
        party_sentiment[party][key] = party_sentiment[party][key].set_index('Date')
        party_sentiment[party][key] = party_sentiment[party][key].reindex(index)
    # party_sentiment[party] = party_sentiment[party].interpolate(limit_direction="both")
        party_sentiment[party][key].index = party_sentiment[party][key].index.strftime("%Y-%m-%d")

display(party_sentiment['NSC'])

party_dict = {}

for party_key in party_sentiment.keys():
    sentiment_dict = {}
    for sentiment_key in party_sentiment[party_key]:
        sentiment_dict.update(party_sentiment[party_key][sentiment_key].to_dict())
    party_dict.update({party_key: sentiment_dict})

print(party_dict)

with open("../../data/04_sentiment_per_party.json", 'w+') as file:
    file.write(json.dumps(party_dict, indent=4, sort_keys=True))

defaultdict(None,
            {'body_neu':             body_neu
             2023-08-21  0.851250
             2023-08-22  0.831700
             2023-08-23  0.795333
             2023-08-24  0.867000
             2023-08-25  0.781000
             ...              ...
             2023-12-03  0.841000
             2023-12-04  0.833000
             2023-12-05  0.835857
             2023-12-06  0.829467
             2023-12-07       NaN
             
             [109 rows x 1 columns],
             'body_pos':             body_pos
             2023-08-21  0.095750
             2023-08-22  0.119300
             2023-08-23  0.091000
             2023-08-24  0.065500
             2023-08-25  0.090000
             ...              ...
             2023-12-03  0.097000
             2023-12-04  0.115727
             2023-12-05  0.112429
             2023-12-06  0.090867
             2023-12-07       NaN
             
             [109 rows x 1 columns],
             'body_neg':             bod

{'VVD': {'body_neu': {'2023-08-21': 0.8351538461538461, '2023-08-22': 0.8405384615384615, '2023-08-23': 0.8213636363636364, '2023-08-24': 0.8278000000000001, '2023-08-25': 0.8193888888888889, '2023-08-26': 0.8139230769230769, '2023-08-27': 0.8273333333333333, '2023-08-28': 0.8380000000000001, '2023-08-29': 0.8077272727272727, '2023-08-30': 0.7892727272727273, '2023-08-31': 0.8303846153846154, '2023-09-01': 0.8406, '2023-09-02': 0.8406785714285715, '2023-09-03': 0.874, '2023-09-04': 0.8520416666666666, '2023-09-05': 0.8437777777777778, '2023-09-06': 0.8308695652173913, '2023-09-07': 0.8309, '2023-09-08': 0.8494074074074075, '2023-09-09': 0.831375, '2023-09-10': 0.8125, '2023-09-11': 0.8407272727272727, '2023-09-12': 0.83, '2023-09-13': 0.8470714285714286, '2023-09-14': 0.8478333333333333, '2023-09-15': 0.8171052631578948, '2023-09-16': 0.836, '2023-09-17': 0.8216666666666667, '2023-09-18': 0.7933636363636364, '2023-09-19': 0.8429655172413794, '2023-09-20': 0.8222592592592592, '2023-09-2

In [24]:
index = pd.date_range("2023-08-21", datetime.now().strftime("%Y-%m-%d"))

party_articles = {}

for party in parties:
    party_articles[party] = df[df[party].ge(min_val)]

party_sentiment_per_paper = defaultdict()

for party in parties:
    party_sentiment_per_paper[party] = defaultdict()
    for paper in papers:
        party_sentiment_per_paper[party][paper] = defaultdict()
        for key in body_sentiments:
            party_sentiment_per_paper[party][paper][key] = party_articles[party].groupby(party_articles[party][party_articles[party]['Newspaper'] == paper]['Date'])[key].mean().reset_index()
            party_sentiment_per_paper[party][paper][key] = party_sentiment_per_paper[party][paper][key].set_index('Date')
            party_sentiment_per_paper[party][paper][key] = party_sentiment_per_paper[party][paper][key].reindex(index)
            # party_sentiment[party] = party_sentiment[party].interpolate(limit_direction="both")
            party_sentiment_per_paper[party][paper][key].index = party_sentiment_per_paper[party][paper][key].index.strftime("%Y-%m-%d")

display(party_sentiment_per_paper['NSC'])

party_dict = {}

for party_key in party_sentiment_per_paper.keys():
    paper_dict = {}
    for paper_key in party_sentiment_per_paper[party_key]:
        sentiment_dict = {}
        for sentiment_key in party_sentiment_per_paper[party_key][paper_key]:
            sentiment_dict.update(party_sentiment_per_paper[party_key][paper_key][sentiment_key].to_dict())
            paper_dict.update({paper_key: sentiment_dict})
    party_dict.update({party_key: paper_dict})

print(party_dict['VVD'].keys())

with open("../../data/05_sentiment_per_party_per_paper.json", 'w+') as file:
    file.write(json.dumps(party_dict, indent=4, sort_keys=True))

defaultdict(None,
            {'AD': defaultdict(None, {'body_neu':             body_neu
                          2023-08-21  0.832800
                          2023-08-22  0.823667
                          2023-08-23  0.861000
                          2023-08-24  0.889000
                          2023-08-25       NaN
                          ...              ...
                          2023-12-03       NaN
                          2023-12-04  0.835000
                          2023-12-05  0.844500
                          2023-12-06  0.855500
                          2023-12-07       NaN
                          
                          [109 rows x 1 columns],
                          'body_pos':             body_pos
                          2023-08-21  0.101400
                          2023-08-22  0.127667
                          2023-08-23  0.067000
                          2023-08-24  0.049000
                          2023-08-25       NaN
                       

dict_keys(['AD', 'Parool', 'Volkskrant', 'Trouw', 'Telegraaf', 'NRC'])


In [None]:
# party_dict = {}

# for party_key in party_sentiment_per_paper.keys():
#     paper_dict = {}
#     for paper_key in party_sentiment_per_paper[party_key]:
#         sentiment_dict = {}
#         for sentiment_key in party_sentiment_per_paper[party_key][paper_key]:
#             sentiment_dict.update(party_sentiment_per_paper[party_key][paper_key][sentiment_key].to_dict())
#             paper_dict.update({paper_key: sentiment_dict})
#     party_dict.update({party_key: paper_dict})

# print(party_dict['VVD'].keys())

# with open("../../data/sentiment_per_party_per_paper.json", 'w+') as file:
#     file.write(json.dumps(party_dict, indent=4, sort_keys=True))

dict_keys(['AD', 'Parool', 'Volkskrant', 'Trouw', 'Telegraaf', 'NRC'])


In [None]:
# party_dict = {}

# for party_key in party_sentiment.keys():
#     sentiment_dict = {}
#     for sentiment_key in party_sentiment[party_key]:
#         sentiment_dict.update(party_sentiment[party_key][sentiment_key].to_dict())
#     party_dict.update({party_key: sentiment_dict})

# print(party_dict)

# with open("../../data/sentiment_per_party.json", 'w+') as file:
#     file.write(json.dumps(party_dict, indent=4, sort_keys=True))

{'VVD': {'body_neu': {'2023-08-21': 0.8317692307692308, '2023-08-22': 0.8303076923076923, '2023-08-23': 0.8320000000000001, '2023-08-24': 0.8133000000000001, '2023-08-25': 0.8297222222222222, '2023-08-26': 0.8100769230769231, '2023-08-27': 0.8106666666666666, '2023-08-28': 0.8443333333333333, '2023-08-29': 0.7991818181818182, '2023-08-30': 0.7938181818181818, '2023-08-31': 0.839153846153846, '2023-09-01': 0.83512, '2023-09-02': 0.8511428571428572, '2023-09-03': 0.8688, '2023-09-04': 0.8409166666666666, '2023-09-05': 0.8277222222222222, '2023-09-06': 0.8316956521739131, '2023-09-07': 0.8369, '2023-09-08': 0.8511851851851852, '2023-09-09': 0.825125, '2023-09-10': 0.8745, '2023-09-11': 0.8160909090909091, '2023-09-12': 0.8364782608695652, '2023-09-13': 0.8579285714285714, '2023-09-14': 0.81725, '2023-09-15': 0.8314736842105264, '2023-09-16': 0.8188181818181818, '2023-09-17': 0.8038333333333334, '2023-09-18': 0.8207727272727272, '2023-09-19': 0.8370689655172413, '2023-09-20': 0.81374074074

In [17]:
from statsmodels.formula.api import ols

temp = ols(formula='body_compound ~ Newspaper', data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,body_compound,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.3413
Date:,"do, 07 dec 2023",Prob (F-statistic):,0.888
Time:,16:27:21,Log-Likelihood:,-7588.8
No. Observations:,6768,AIC:,15190.0
Df Residuals:,6762,BIC:,15230.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5025,0.013,38.239,0.000,0.477,0.528
Newspaper[T.NRC],0.4836,0.525,0.920,0.357,-0.546,1.514
Newspaper[T.Parool],0.0060,0.037,0.161,0.872,-0.067,0.079
Newspaper[T.Telegraaf],0.0108,0.026,0.422,0.673,-0.039,0.061
Newspaper[T.Trouw],-0.0180,0.027,-0.669,0.504,-0.071,0.035
Newspaper[T.Volkskrant],-0.0011,0.027,-0.040,0.968,-0.054,0.052

0,1,2,3
Omnibus:,1095.637,Durbin-Watson:,1.813
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1690.59
Skew:,-1.216,Prob(JB):,0.0
Kurtosis:,2.723,Cond. No.,60.5


In [18]:
temp = glm(formula="body_neu ~ Newspaper", 
               data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,body_neu,No. Observations:,6768.0
Model:,GLM,Df Residuals:,6762.0
Model Family:,Gaussian,Df Model:,5.0
Link Function:,Identity,Scale:,0.0027896
Method:,IRLS,Log-Likelihood:,10304.0
Date:,"do, 07 dec 2023",Deviance:,18.863
Time:,16:27:25,Pearson chi2:,18.9
No. Iterations:,3,Pseudo R-squ. (CS):,0.001602
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.8370,0.001,895.944,0.000,0.835,0.839
Newspaper[T.NRC],-0.0290,0.037,-0.778,0.437,-0.102,0.044
Newspaper[T.Parool],-0.0042,0.003,-1.602,0.109,-0.009,0.001
Newspaper[T.Telegraaf],-0.0033,0.002,-1.821,0.069,-0.007,0.000
Newspaper[T.Trouw],-0.0046,0.002,-2.410,0.016,-0.008,-0.001
Newspaper[T.Volkskrant],-0.0043,0.002,-2.217,0.027,-0.008,-0.000


In [19]:
temp = glm(formula="body_pos ~ Newspaper", 
               data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,body_pos,No. Observations:,6768.0
Model:,GLM,Df Residuals:,6762.0
Model Family:,Gaussian,Df Model:,5.0
Link Function:,Identity,Scale:,0.0015099
Method:,IRLS,Log-Likelihood:,12381.0
Date:,"do, 07 dec 2023",Deviance:,10.21
Time:,16:27:27,Pearson chi2:,10.2
No. Iterations:,3,Pseudo R-squ. (CS):,0.001498
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0996,0.001,144.924,0.000,0.098,0.101
Newspaper[T.NRC],0.0309,0.027,1.124,0.261,-0.023,0.085
Newspaper[T.Parool],0.0038,0.002,1.960,0.050,7.68e-07,0.008
Newspaper[T.Telegraaf],0.0022,0.001,1.632,0.103,-0.000,0.005
Newspaper[T.Trouw],0.0015,0.001,1.038,0.299,-0.001,0.004
Newspaper[T.Volkskrant],0.0034,0.001,2.404,0.016,0.001,0.006


In [20]:
temp = glm(formula="body_neg ~ Newspaper", 
               data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,body_neg,No. Observations:,6768.0
Model:,GLM,Df Residuals:,6762.0
Model Family:,Gaussian,Df Model:,5.0
Link Function:,Identity,Scale:,0.0013872
Method:,IRLS,Log-Likelihood:,12668.0
Date:,"do, 07 dec 2023",Deviance:,9.3803
Time:,16:27:29,Pearson chi2:,9.38
No. Iterations:,3,Pseudo R-squ. (CS):,0.0008292
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0633,0.001,96.137,0.000,0.062,0.065
Newspaper[T.NRC],-0.0018,0.026,-0.070,0.944,-0.053,0.050
Newspaper[T.Parool],0.0004,0.002,0.232,0.816,-0.003,0.004
Newspaper[T.Telegraaf],0.0012,0.001,0.903,0.367,-0.001,0.004
Newspaper[T.Trouw],0.0032,0.001,2.333,0.020,0.001,0.006
Newspaper[T.Volkskrant],0.0009,0.001,0.639,0.523,-0.002,0.004


In [None]:
# max batch size = 10 per second before google API blocks
vader_model = SentimentIntensityAnalyzer()

scores = []
for idx, corpus in enumerate(tqdm(df['Title'])):
    if idx // 9 == 0:
        time.sleep(1)
    
    if isinstance(corpus, str):
        en_corpus = Translator.translate(corpus, 'en')
    else:
        en_corpus = ''

    scores = [vader_model.polarity_scores(en_corpus)] + scores 

df['title_vader_scores'] = scores
df

100%|██████████| 6405/6405 [16:40<00:00,  6.40it/s]  


Unnamed: 0,Newspaper,Title,Date,Author,Section,Body,VVD,CDA,D66,SGP,...,BBB,Volt,BVNL,NSC,body_vader_scores,body_neg,body_neu,body_pos,body_compound,title_vader_scores
0,AD,Omtzigt mengt zich in strijd met boek vol plan...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,5,0,1,...,4,0,0,4,"{'neg': 0.081, 'neu': 0.757, 'pos': 0.161, 'co...",0.081,0.757,0.161,0.9943,"{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compou..."
1,Parool,Plofpartij of gamechanger: wat kunnen we verwa...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,6,0,1,...,6,0,0,6,"{'neg': 0.096, 'neu': 0.769, 'pos': 0.135, 'co...",0.096,0.769,0.135,0.9844,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp..."
2,Volkskrant,Kaarten op rechts zijn opnieuw geschud,2023-08-21,RAOUL DU PRÉ,Ten Eerste,"Analyse VVD sluit PVV niet uit Frans Weisglas,...",15,2,2,0,...,1,0,0,0,"{'neg': 0.038, 'neu': 0.883, 'pos': 0.079, 'co...",0.038,0.883,0.079,0.9776,"{'neg': 0.355, 'neu': 0.645, 'pos': 0.0, 'comp..."
3,Trouw,Niet met de PVV,2023-08-21,STEVO AKKERMAN,Vandaag,Toen Dilan Yesilgöz zich opwierp als opvolger...,6,0,0,0,...,0,0,0,0,"{'neg': 0.053, 'neu': 0.851, 'pos': 0.096, 'co...",0.053,0.851,0.096,0.9751,"{'neg': 0.0, 'neu': 0.867, 'pos': 0.133, 'comp..."
4,AD,VVD-kamerlid en woordvoerder sport Rudmer Heer...,2023-08-21,Politieke redactie,,VVD-Kamerlid Rudmer Heerema keert na de aankom...,4,1,1,0,...,0,0,0,0,"{'neg': 0.15, 'neu': 0.731, 'pos': 0.119, 'com...",0.150,0.731,0.119,-0.9725,"{'neg': 0.0, 'neu': 0.89, 'pos': 0.11, 'compou..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7461,Telegraaf,'Amsterdam vindt dat burger 't verkeerd deed';...,2023-11-24,Marijn Schrijver,REGIO,Niet alle ex-raadsleden zeker van zetel in Den...,1,0,1,0,...,0,0,0,3,"{'neg': 0.004, 'neu': 0.922, 'pos': 0.075, 'co...",0.004,0.922,0.075,0.9677,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
7462,AD,Jongerenclubs Winterswijk en Meddo springleven...,2023-11-25,Tanja Kits,,De twee jongerenclubs in de gemeente Winterswi...,0,0,0,0,...,0,0,0,2,"{'neg': 0.123, 'neu': 0.774, 'pos': 0.103, 'co...",0.123,0.774,0.103,-0.9460,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
7463,AD,"Verslag doen van de verkiezingsuitslag, dat is...",2023-11-25,Leo van Raaij,,Op de redactie stroomde vorige week de adrenal...,0,0,0,0,...,0,0,0,2,"{'neg': 0.08, 'neu': 0.809, 'pos': 0.112, 'com...",0.080,0.809,0.112,0.9903,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
7469,Trouw,Ook vervuiling tast onze bestaanszekerheid aan...,2023-11-26,Sijas Akkerman,,Jarenlang konden industriële bedrijven Nederl...,0,0,0,0,...,0,0,0,5,"{'neg': 0.042, 'neu': 0.804, 'pos': 0.154, 'co...",0.042,0.804,0.154,0.9995,"{'neg': 0.0, 'neu': 0.803, 'pos': 0.197, 'comp..."


In [None]:
for key in df['title_vader_scores'].iloc[0].keys():
    df[f"title_{key}"] = df["title_vader_scores"].apply(
        lambda score_dict: score_dict[key]
)

df.head()

Unnamed: 0,Newspaper,Title,Date,Author,Section,Body,VVD,CDA,D66,SGP,...,body_vader_scores,body_neg,body_neu,body_pos,body_compound,title_vader_scores,title_neg,title_neu,title_pos,title_compound
0,AD,Omtzigt mengt zich in strijd met boek vol plan...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,5,0,1,...,"{'neg': 0.081, 'neu': 0.757, 'pos': 0.161, 'co...",0.081,0.757,0.161,0.9943,"{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compou...",0.0,0.69,0.31,0.2023
1,Parool,Plofpartij of gamechanger: wat kunnen we verwa...,2023-08-21,Niels Klaassen,,Pieter Omtzigt mikt op gematigde groei met zij...,2,6,0,1,...,"{'neg': 0.096, 'neu': 0.769, 'pos': 0.135, 'co...",0.096,0.769,0.135,0.9844,"{'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'comp...",0.0,0.769,0.231,0.2023
2,Volkskrant,Kaarten op rechts zijn opnieuw geschud,2023-08-21,RAOUL DU PRÉ,Ten Eerste,"Analyse VVD sluit PVV niet uit Frans Weisglas,...",15,2,2,0,...,"{'neg': 0.038, 'neu': 0.883, 'pos': 0.079, 'co...",0.038,0.883,0.079,0.9776,"{'neg': 0.355, 'neu': 0.645, 'pos': 0.0, 'comp...",0.355,0.645,0.0,-0.5106
3,Trouw,Niet met de PVV,2023-08-21,STEVO AKKERMAN,Vandaag,Toen Dilan Yesilgöz zich opwierp als opvolger...,6,0,0,0,...,"{'neg': 0.053, 'neu': 0.851, 'pos': 0.096, 'co...",0.053,0.851,0.096,0.9751,"{'neg': 0.0, 'neu': 0.867, 'pos': 0.133, 'comp...",0.0,0.867,0.133,0.3818
4,AD,VVD-kamerlid en woordvoerder sport Rudmer Heer...,2023-08-21,Politieke redactie,,VVD-Kamerlid Rudmer Heerema keert na de aankom...,4,1,1,0,...,"{'neg': 0.15, 'neu': 0.731, 'pos': 0.119, 'com...",0.15,0.731,0.119,-0.9725,"{'neg': 0.0, 'neu': 0.89, 'pos': 0.11, 'compou...",0.0,0.89,0.11,0.2732


In [None]:
from statsmodels.formula.api import ols

temp = ols(formula='title_compound ~ Newspaper', data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,title_compound,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.737
Date:,"di, 05 dec 2023",Prob (F-statistic):,0.123
Time:,12:26:36,Log-Likelihood:,-2547.5
No. Observations:,6404,AIC:,5107.0
Df Residuals:,6398,BIC:,5148.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0153,0.006,2.359,0.018,0.003,0.028
Newspaper[T.NRC],-0.0153,0.255,-0.060,0.952,-0.515,0.484
Newspaper[T.Parool],-0.0075,0.019,-0.401,0.689,-0.044,0.029
Newspaper[T.Telegraaf],-0.0356,0.013,-2.787,0.005,-0.061,-0.011
Newspaper[T.Trouw],-0.0026,0.014,-0.192,0.847,-0.029,0.024
Newspaper[T.Volkskrant],-0.0185,0.014,-1.359,0.174,-0.045,0.008

0,1,2,3
Omnibus:,25.597,Durbin-Watson:,1.887
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21.387
Skew:,-0.075,Prob(JB):,2.27e-05
Kurtosis:,2.76,Cond. No.,58.8


In [None]:
from statsmodels.formula.api import ols

temp = ols(formula='title_pos ~ Newspaper', data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,title_pos,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.732
Date:,"di, 05 dec 2023",Prob (F-statistic):,0.124
Time:,12:26:44,Log-Likelihood:,4028.6
No. Observations:,6404,AIC:,-8045.0
Df Residuals:,6398,BIC:,-8005.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0913,0.002,39.250,0.000,0.087,0.096
Newspaper[T.NRC],-0.0518,0.091,-0.568,0.570,-0.231,0.127
Newspaper[T.Parool],0.0074,0.007,1.113,0.266,-0.006,0.020
Newspaper[T.Telegraaf],-0.0110,0.005,-2.402,0.016,-0.020,-0.002
Newspaper[T.Trouw],-0.0003,0.005,-0.071,0.944,-0.010,0.009
Newspaper[T.Volkskrant],-0.0024,0.005,-0.503,0.615,-0.012,0.007

0,1,2,3
Omnibus:,1936.607,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5462.867
Skew:,1.601,Prob(JB):,0.0
Kurtosis:,6.196,Cond. No.,58.8


In [None]:
from statsmodels.formula.api import ols

temp = ols(formula='title_neg ~ Newspaper', data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,title_neg,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.003
Date:,"di, 05 dec 2023",Prob (F-statistic):,0.414
Time:,12:27:09,Log-Likelihood:,3782.3
No. Observations:,6404,AIC:,-7553.0
Df Residuals:,6398,BIC:,-7512.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0855,0.002,35.371,0.000,0.081,0.090
Newspaper[T.NRC],-0.0460,0.095,-0.485,0.627,-0.232,0.140
Newspaper[T.Parool],0.0100,0.007,1.449,0.147,-0.004,0.024
Newspaper[T.Telegraaf],0.0075,0.005,1.587,0.113,-0.002,0.017
Newspaper[T.Trouw],0.0058,0.005,1.154,0.249,-0.004,0.016
Newspaper[T.Volkskrant],0.0063,0.005,1.243,0.214,-0.004,0.016

0,1,2,3
Omnibus:,2028.056,Durbin-Watson:,1.816
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5792.123
Skew:,1.678,Prob(JB):,0.0
Kurtosis:,6.232,Cond. No.,58.8


In [None]:
from statsmodels.formula.api import ols

temp = ols(formula='title_neu ~ Newspaper', data=df).fit()

temp.summary()

0,1,2,3
Dep. Variable:,title_neu,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.0
Method:,Least Squares,F-statistic:,1.157
Date:,"di, 05 dec 2023",Prob (F-statistic):,0.328
Time:,12:28:06,Log-Likelihood:,2130.3
No. Observations:,6404,AIC:,-4249.0
Df Residuals:,6398,BIC:,-4208.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.8221,0.003,262.644,0.000,0.816,0.828
Newspaper[T.NRC],0.0989,0.123,0.805,0.421,-0.142,0.340
Newspaper[T.Parool],-0.0165,0.009,-1.837,0.066,-0.034,0.001
Newspaper[T.Telegraaf],0.0044,0.006,0.720,0.472,-0.008,0.016
Newspaper[T.Trouw],-0.0045,0.007,-0.690,0.490,-0.017,0.008
Newspaper[T.Volkskrant],-0.0040,0.007,-0.604,0.546,-0.017,0.009

0,1,2,3
Omnibus:,669.1,Durbin-Watson:,1.844
Prob(Omnibus):,0.0,Jarque-Bera (JB):,907.637
Skew:,-0.852,Prob(JB):,8.11e-198
Kurtosis:,3.703,Cond. No.,58.8


In [None]:
# index = pd.date_range("2023-08-21", datetime.now().strftime("%Y-%m-%d"))
index = pd.date_range("2023-08-21", datetime.now().strftime("%Y-%m-%d"))

party_articles = {}

for party in parties:
    party_articles[party] = df[df[party].ge(min_val)]

party_sentiment = defaultdict()

for party in parties:
    party_sentiment[party] = defaultdict()
    for key in title_sentiments:
        party_sentiment[party][key] = party_articles[party].groupby(party_articles[party]['Date'])[key].mean().reset_index()
        party_sentiment[party][key] = party_sentiment[party][key].set_index('Date')
        party_sentiment[party][key] = party_sentiment[party][key].reindex(index)
        # party_sentiment[party][key] = party_sentiment[party][key].interpolate(limit_direction="both")
        party_sentiment[party][key].index = party_sentiment[party][key].index.strftime("%Y-%m-%d")

display(party_sentiment['NSC'])

for party_key in party_sentiment.keys():
    sentiment_dict = {}
    for sentiment_key in title_sentiments:
        sentiment_dict.update(party_sentiment[party_key][sentiment_key].to_dict())
    party_dict.update({party_key: sentiment_dict})

print(party_dict)

with open("../../data/title_sentiment_per_party.json", 'w+') as file:
    file.write(json.dumps(party_dict, indent=4, sort_keys=True))

defaultdict(None,
            {'title_neu':             title_neu
             2023-08-21   0.794750
             2023-08-22   0.736800
             2023-08-23   0.880333
             2023-08-24   0.774500
             2023-08-25   0.734500
             ...               ...
             2023-12-01        NaN
             2023-12-02        NaN
             2023-12-03        NaN
             2023-12-04        NaN
             2023-12-05        NaN
             
             [107 rows x 1 columns],
             'title_pos':             title_pos
             2023-08-21     0.1420
             2023-08-22     0.1514
             2023-08-23     0.0580
             2023-08-24     0.2255
             2023-08-25     0.1740
             ...               ...
             2023-12-01        NaN
             2023-12-02        NaN
             2023-12-03        NaN
             2023-12-04        NaN
             2023-12-05        NaN
             
             [107 rows x 1 columns],
             '

{'VVD': {'title_neu': {'2023-08-21': 0.7753076923076924, '2023-08-22': 0.824923076923077, '2023-08-23': 0.8776363636363637, '2023-08-24': 0.9448000000000001, '2023-08-25': 0.7687777777777778, '2023-08-26': 0.8111538461538461, '2023-08-27': 0.9119999999999999, '2023-08-28': 0.7956666666666666, '2023-08-29': 0.7684545454545454, '2023-08-30': 0.647090909090909, '2023-08-31': 0.8423846153846154, '2023-09-01': 0.85116, '2023-09-02': 0.8586785714285714, '2023-09-03': 0.9850999999999999, '2023-09-04': 0.8472083333333332, '2023-09-05': 0.8232777777777778, '2023-09-06': 0.847304347826087, '2023-09-07': 0.8147666666666668, '2023-09-08': 0.7111851851851853, '2023-09-09': 0.7614375, '2023-09-10': 0.7915, '2023-09-11': 0.8719090909090909, '2023-09-12': 0.8405217391304348, '2023-09-13': 0.9051428571428571, '2023-09-14': 0.6578333333333334, '2023-09-15': 0.7949473684210526, '2023-09-16': 0.7502272727272726, '2023-09-17': 0.8006666666666667, '2023-09-18': 0.7684090909090909, '2023-09-19': 0.7453448275