# TODO CLEANING

- stopwords removal
- punctuation removal
- lowercasing
- lemmatizing

# TODO vectorization

LDA
vectorizer
- naive: bag of words (CountVectorizer)
- TF-IDF
- Word2Vec
- w

In [46]:
from tqdm import tqdm
from pandarallel import pandarallel
from nltk.corpus import stopwords
from gensim import models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import LdaModel, CoherenceModel
import pandas as pd
import string
import ast
import spacy

tqdm.pandas()

In [2]:
with open('paper_selectors.txt', 'r') as file:
    papers = ast.literal_eval(file.read())

with open('party_selectors.txt', 'r') as file:
    parties = ast.literal_eval(file.read())

with open('party_synonyms.txt', 'r') as file:
    party_synonyms = ast.literal_eval(file.read())

with open('months.txt', 'r') as file:
    months = ast.literal_eval(file.read())

with open('min_mentions_per_article.txt', 'r') as file:
    min_val = int(file.read())

for synonym in party_synonyms:
    parties = [x for x in parties if x not in synonym[1:]]

In [3]:
df = pd.read_csv("../../data/03_data_scored.csv")

In [18]:
def df_apply(df: pd.DataFrame, column: str, function) -> None:
    """Wrapper to apply a function in place on a DataFrame.

    Allows for a shorter representation of applying a function on a dataframe
    with a progress bar. tqdm.progress_apply does not support
    in-place modification. By wrapping it, the code becomes more readable.

    Args:
        df (pd.DataFrame): DataFrame to be modified
        column (str): Column to be modified.
        function (str): Function to apply to column.
    """
    df[column] = df[column].progress_apply(function)

def remove_stopwords(text: str) -> str:
    """Return a text with all stopwords removed.

    Args:
        text (str): Text to remove stopwords from.
    """
    filler: list[str] = stopwords.words("dutch")
    return " ".join([word for word in text.split() if word not in filler])

def remove_punctuation(text: str) -> str:
    """Return a text with all punctuation removed.

    Args:
        text (str): Text to remove punctuation from.
    """
    additional_punct: str = string.punctuation + '"“‘—’”"'
    return text.translate(str.maketrans("", "", additional_punct))

def lemmatizer(text: str) -> str:
    """Return a given string in its lemmatized form."""
    nlp = spacy.load("nl_core_news_sm", disable=["parser", "ner"])
    doc = nlp(text)
    return " ".join([word.lemma_ for word in doc])

First we clean the data by removing punctuation, casing and lemmatizing the words.

In [5]:
for corpus in ['Title', 'Body']:
    df_apply(df, corpus, lambda x: x.lower() if isinstance(x, str) else x)
    df_apply(df, corpus, lambda x: remove_punctuation(x) if isinstance(x, str) else x)
    df_apply(df, corpus, lambda x: remove_stopwords(x) if isinstance (x,str) else x)

100%|██████████| 6769/6769 [00:00<00:00, 772803.19it/s]
100%|██████████| 6769/6769 [00:00<00:00, 302591.41it/s]
  0%|          | 0/6769 [00:00<?, ?it/s]

100%|██████████| 6769/6769 [00:00<00:00, 23952.69it/s]
100%|██████████| 6769/6769 [00:00<00:00, 70058.59it/s]
100%|██████████| 6769/6769 [00:01<00:00, 6753.53it/s]
100%|██████████| 6769/6769 [00:02<00:00, 3202.91it/s]


In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,Newspaper,Title,Date,Author,Section,Body,VVD,CDA,D66,...,body_vader_scores,body_neg,body_neu,body_pos,body_compound,title_vader_scores,title_neg,title_neu,title_pos,title_compound
0,0,AD,omtzigt mengt strijd boek vol plannen drie ton...,2023-08-21,Niels Klaassen,,pieter omtzigt mikt gematigde groei nieuwe par...,2,5,0,...,"{'neg': 0.0, 'neu': 0.935, 'pos': 0.065, 'comp...",0.0,0.935,0.065,0.4019,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
1,1,Parool,plofpartij gamechanger we verwachten nieuwe pa...,2023-08-21,Niels Klaassen,,pieter omtzigt mikt gematigde groei partij nie...,2,6,0,...,"{'neg': 0.046, 'neu': 0.833, 'pos': 0.121, 'co...",0.046,0.833,0.121,0.995,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0,1.0,0.0,0.0
2,2,Volkskrant,kaarten rechts opnieuw geschud,2023-08-21,RAOUL DU PRÉ,Ten Eerste,analyse vvd sluit pvv frans weisglas oudvoorzi...,15,2,2,...,"{'neg': 0.055, 'neu': 0.823, 'pos': 0.123, 'co...",0.055,0.823,0.123,0.9955,"{'neg': 0.121, 'neu': 0.823, 'pos': 0.056, 'co...",0.121,0.823,0.056,-0.3612
3,3,Trouw,pvv,2023-08-21,STEVO AKKERMAN,Vandaag,dilan yesilgöz opwierp opvolger mark rutte vi...,6,0,0,...,"{'neg': 0.136, 'neu': 0.808, 'pos': 0.056, 'co...",0.136,0.808,0.056,-0.9788,"{'neg': 0.402, 'neu': 0.598, 'pos': 0.0, 'comp...",0.402,0.598,0.0,-0.8126
4,4,AD,vvdkamerlid woordvoerder sport rudmer heerema ...,2023-08-21,Politieke redactie,,vvdkamerlid rudmer heerema keert aankomende tw...,4,1,1,...,"{'neg': 0.081, 'neu': 0.757, 'pos': 0.161, 'co...",0.081,0.757,0.161,0.9943,"{'neg': 0.0, 'neu': 0.69, 'pos': 0.31, 'compou...",0.0,0.69,0.31,0.2023


Now we lemmatize the title and body

In [22]:
pandarallel.initialize(progress_bar=True)

df['Title'].parallel_apply(lambda x: lemmatizer(x) if isinstance(x, str) else x)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=847), Label(value='0 / 847'))), HB…

0       omtzigen mengen strijd boek vol plan drie ton ...
1       plofpartij gamechang we verwachten nieuw parti...
2                           kaart rechts opnieuw schudden
3                                                     pvv
4       vvdkamerlid woordvoerder Sport rudm heerema st...
                              ...                        
6764              vervuiling tasten ons bestaanszekerheid
6765    pieter omtzigen veronrustend rapport vuurwerkr...
6766    pensioen stijgen verschil groot zorg verkiezin...
6767    fonds verhogen pensioen intussen bezorgd verki...
6768                     yesilgöz reageren omtzigtnotitie
Name: Title, Length: 6769, dtype: object

In [23]:
df['Body'].parallel_apply(lambda x: lemmatizer(x) if isinstance(x, str) else x)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=847), Label(value='0 / 847'))), HB…

0       pieter omtzigt mikt matigen groei nieuw partij...
1       pieter omtzigt mikt matigen groei partij nieuw...
2       analyse vvd aansluiten pvv Frans weisglas oudv...
3       dilan yesilgöz opwierp opvolg Mark ruten vall...
4       vvdkamerlid rudmer heerema terugkeren aankomen...
                              ...                        
6764    leefomgeving nieuw sociaal contract nsc partij...
6765    „ dit gebeuren wanneer regering stuk achterhou...
6766    pensioengerechtigd zien pensioen volgen jaar w...
6767    elk jaar kijken fonds weer pensioen mee laten ...
6768    vvdpartijleiad dilan yesilgöz Den Haag reager...
Name: Body, Length: 6769, dtype: object

In [48]:
df.to_csv('../../data/06_preprocessed_data.csv')

model source: 
https://github.com/coosto/dutch-word-embeddings