In [None]:
! pip install simplemma

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simplemma
  Downloading simplemma-0.9.0-py3-none-any.whl (76.2 MB)
[K     |████████████████████████████████| 76.2 MB 1.1 MB/s 
[?25hInstalling collected packages: simplemma
Successfully installed simplemma-0.9.0


In [None]:
import re
import string
import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import word_tokenize
import simplemma

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional

In [None]:
from sklearn.datasets import fetch_openml
# Fetch version 1 of the 'Dutch-News-Articles' dataset from OpenML by name.
# The returned bundle is indexed by key below (dutch_news['data']).
dutch_news = fetch_openml(name='Dutch-News-Articles', version=1)

In [None]:
# Work with the tabular 'data' part of the fetched bundle; the later cells
# read its 'category' and 'content' columns.
df = dutch_news['data']

In [None]:
# Number of articles per category, to see how the topics are distributed
# before picking a subset to model.
df['category'].value_counts()

Buitenland          91194
Binnenland          80451
Politiek            21608
Economie            19391
Regionaal nieuws    16069
Koningshuis          3051
Opmerkelijk          2748
Cultuur  Media       2335
Tech                 1014
Name: category, dtype: int64

In [None]:
# Keep only the three topics the classifier is trained on. The per-topic
# slices are stacked in the order listed here, so rows end up grouped by topic.
selected_topics = ('Politiek', 'Economie', 'Regionaal nieuws')
df_nieuws = pd.concat([df[df['category'] == topic] for topic in selected_topics])
# From here on `df` refers to the reduced frame.
df = df_nieuws
# Three random rows as a quick sanity check of the selection.
df.sample(3)

Unnamed: 0,datetime,title,content,category,url
71723,2013-11-25 10:03:00,Omzet chemie opnieuw lager,De positie van de Nederlandse chemie-industrie...,Economie,https://nos.nl/artikel/579005-omzet-chemie-opn...
131724,2016-06-02 19:31:00,Oppositie verbijsterd over gasboringen bij Sch...,Oppositiepartijen in de Tweede Kamer reageren ...,Politiek,https://nos.nl/artikel/2108799-oppositie-verbi...
177892,2018-07-05 11:59:00,Utrecht beveiligt vijf pleinen tegen terreur,De gemeente Utrecht beveiligt vijf pleinen teg...,Regionaal nieuws,https://nos.nl/artikel/2239983-utrecht-beveili...


In [None]:
# Integer class id for each of the three selected topics.
category_map = {'Politiek': 0, 'Economie': 1, 'Regionaal nieuws': 2}
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time (when the column already holds 0/1/2) would
# wipe out all labels. Only encode while the column is not yet made up of the
# integer ids, which makes the cell safe to re-run.
already_encoded = df['category'].isin(list(category_map.values())).all()
if not already_encoded:
    df['category'] = df['category'].map(category_map)

In [None]:
# Check the encoding: only the ids 0, 1 and 2 should remain, with the same
# counts the three topic names had before the mapping.
df['category'].value_counts()

0    21608
1    19391
2    16069
Name: category, dtype: int64

In [None]:
# Download the NLTK resources needed by text_preprocessing: the 'punkt'
# tokenizer data and the 'stopwords' corpus (source of the Dutch stop words).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Matches a token that consists of exactly one ASCII letter.
_SINGLE_LETTER = re.compile(r'[a-zA-Z]')
# Lazily filled cache for the Dutch stop-word set (see _dutch_stop_words).
_DUTCH_STOP_WORDS = None


def _dutch_stop_words():
    """Return NLTK's Dutch stop words as a frozenset, loading them only once."""
    global _DUTCH_STOP_WORDS
    if _DUTCH_STOP_WORDS is None:
        _DUTCH_STOP_WORDS = frozenset(stopwords.words('dutch'))
    return _DUTCH_STOP_WORDS


def text_preprocessing(t):
    """Normalise one article into a space-separated string of lemmas.

    Steps, in order: lower-case the text, replace every run of digits with a
    space, delete ASCII punctuation, tokenize with NLTK's ``word_tokenize``,
    lemmatize each token with simplemma (``lang='nl'``), and drop stop words
    and single-letter tokens.

    Args:
        t: The raw article text. Anything that is not a ``str`` (for example
            a missing value coming out of a DataFrame column) is treated as
            an empty document.

    Returns:
        The kept lemmas joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(t, str):
        return ''

    numbers_removed = re.sub(pattern=r'\d+', repl=" ", string=t.lower())
    punctuations_removed = numbers_removed.translate(_PUNCTUATION_TABLE)
    stop_words = _dutch_stop_words()

    text = []
    for token in word_tokenize(punctuations_removed):
        # Test the surface form first: the stop-word list holds inflected
        # forms, and a lemma-only check lets them through once they have been
        # rewritten to a lemma that is not itself on the list.
        if token in stop_words:
            continue
        lemma_word = simplemma.lemmatize(token, lang='nl')
        if lemma_word in stop_words:
            continue
        # Filter single letters per token. A regex over the joined string
        # that requires (and consumes) whitespace on both sides misses letters
        # at the start/end of the text and every second letter in a run.
        if _SINGLE_LETTER.fullmatch(lemma_word):
            continue
        text.append(lemma_word)
    return " ".join(text)

In [None]:
# Replace every article body with its cleaned, lemmatized form. The function
# is passed directly; wrapping it in a lambda adds nothing.
df['content'] = df['content'].apply(text_preprocessing)

In [None]:
# Hold out 20% of the articles. The split is stratified on the label so both
# parts keep the same topic proportions, and the fixed seed makes it repeatable.
topic_labels = df['category']
news_train, news_test, topic_train, topic_test = train_test_split(
    df['content'],
    topic_labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=topic_labels,
)

In [None]:
# Build the vocabulary from the training articles only; the tokenizer is
# configured with num_words=5000 and '<OOV>' as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(news_train)


def _encode_and_pad(texts):
    """Turn texts into id sequences and pad/truncate them at the end to 200 ids."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=200, padding='post', truncating='post')


train_padded = _encode_and_pad(news_train)
validation_padded = _encode_and_pad(news_test)

In [None]:
# Peek at the first 100 entries of the fitted tokenizer's word index; the
# printed list starts with the '<OOV>' placeholder.
print(list(tokenizer.word_index)[:100])

['<OOV>', 'zeggen', 'gaan', 'jaar', 'moeten', 'komen', 'volgens', 'zullen', 'willen', 'twee', 'we', 'mens', 'maken', 'procent', 'euro', 'groot', 'wel', 'vinden', 'bedrijf', 'krijgen', 'goed', 'nederland', 'nieuw', 'ander', 'minister', 'kamer', 'partij', 'zien', 'kabinet', 'gelden', 'miljoen', 'laten', 'week', 'nemen', 'vorig', 'bank', 'staat', 'blijven', 'onderzoeken', 'nederlands', 'weer', 'heel', 'politie', 'één', 'houden', 'man', 'minder', 'miljard', 'ver', 'land', 'tijd', 'eerder', 'waar', 'geven', 'maand', 'aantal', 'zaak', 'alleen', 'mogen', 'gemeente', 'zonnen', 'afgelopen', 'schrijven', 'lang', 'rutte', 'mogelijk', 'staan', 'weet', 'melden', 'europees', 'betalen', 'vandaag', 'tussen', 'per', 'vraag', 'vooral', 'drie', 'dag', 'werken', 'vvd', 'af', 'steeds', 'noemen', 'probleem', 'uur', 'liggen', 'mee', 'pvda', 'spreken', 'zoals', 'omroep', 'deel', 'snel', 'verkopen', 'bijvoorbeeld', 'bekennen', 'politiek', 'ruim', 'eigen', 'sinds']


In [None]:
# Topic classifier: 64-dimensional word embeddings -> dropout ->
# bidirectional LSTM -> softmax over the topic classes.
model = Sequential([Embedding(5000, 64),  # 5000 matches the tokenizer's num_words
                    Dropout(0.5),
                    Bidirectional(LSTM(64)),
                    # One output unit per topic in category_map. A hard-coded 4
                    # would leave an output that none of the three labels
                    # (0, 1, 2) can ever select.
                    Dense(len(category_map), activation='softmax')])
# The labels are integer class ids, hence the sparse variant of cross-entropy.
# NOTE(review): the `decay` argument may not be accepted by newer Keras
# optimizer implementations; confirm against the installed version.
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=0.001, decay=1e-6), metrics=['accuracy'])
# The held-out split from train_test_split is used as validation data.
model.fit(train_padded, topic_train, epochs=2, validation_data=(validation_padded, topic_test), verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f219d5a0970>