<a href="https://colab.research.google.com/github/stAIner1988/NLP/blob/main/Projekt_XX_German_News_Article1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<img align="right" width="400" src="https://www.fhnw.ch/de/++theme++web16theme/assets/media/img/fachhochschule-nordwestschweiz-fhnw-logo.svg" alt="FHNW Logo">


# German News Articles

by Joel Akeret and Fabian Märki

## Summary
This is a short intro on how to access the *Ten Thousand German News Articles Dataset* for the *default project* (we still encourage you to work with your own dataset).

## Links
- [Ten Thousand German News Articles Dataset](https://tblock.github.io/10kGNAD/)

<a href="https://colab.research.google.com/github/markif/2023_HS_DAS_NLP_Notebooks/blob/master/XX_German_News_Article.ipynb">
  <img align="left" src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [None]:
from google.colab import drive

# drive.mount() expects a local mount point inside the Colab VM, not a sharing URL.
# The shared project folder is
# https://drive.google.com/drive/folders/1kOacBkp9_6oESaMyhYlnO2PL-rke2p48?usp=share_link
# NOTE(review): presumably it has to be added to your own Drive (e.g. as a shortcut)
# to show up below the mount point - confirm.
drive.mount('/content/drive')

# Setup

In [None]:
#%%capture

!pip install 'fhnw-nlp-utils>=0.8.0,<0.9.0'
!pip install transformers
from fhnw.nlp.utils.processing import parallelize_dataframe
from fhnw.nlp.utils.processing import is_iterable
from fhnw.nlp.utils.storage import download
from fhnw.nlp.utils.storage import save_dataframe
from fhnw.nlp.utils.storage import load_dataframe
import pandas as pd
import numpy as np
import re
import tqdm
from datetime import datetime

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import multilabel_confusion_matrix
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

# Register tqdm's progress-bar integration with pandas.
tqdm.tqdm.pandas()
# Use seaborn's 'whitegrid' style for all plots.
sns.set_style('whitegrid')

# Raise pandas display limits so long article texts and longer tables are shown.
pd.options.display.max_colwidth = 600
pd.options.display.max_rows = 400


# Load the data

We recommend using the stratified train/test split proposed by the maintainer of the dataset.

In [None]:
# Download the training split to a local parquet file and load it as a DataFrame.
file = "data/german_news_articles_original_train.parq"
download("https://drive.switch.ch/index.php/s/mRnuzx4BLpMLqyz/download", file)
data_train = load_dataframe(file)

# Same for the test split; note that `file` is reused and ends up pointing at the test file.
file = "data/german_news_articles_original_test.parq"
download("https://drive.switch.ch/index.php/s/DKUnZraeGp3EIK3/download", file)
data_test = load_dataframe(file)

Get a first impression...

In [None]:
# (rows, columns) of the training and the test split.
print(data_train.shape)
print(data_test.shape)

In [None]:
# Tag every row with its split so both frames can be concatenated and separated again later.
data_train["split"] = "train"
data_test["split"] = "test"

In [None]:
# Show the first three rows of the training split.
data_train.head(3)

In [None]:
# Show the first three rows of the test split.
data_test.head(3)

In [None]:
# Stack both splits into one frame; the "split" column records where each row came from.
# NOTE(review): the original row indices are kept, so index labels may repeat - confirm this is fine.
data_all = pd.concat([data_train, data_test])
# `data` is another name for the same object as `data_train` (no copy is made).
data = data_train

In [None]:
# Show the first three rows of `data` (an alias of the training split).
data.head(3)

In [None]:
# Histogram (50 bins) of the article lengths in characters, over both splits.
data_all['text_original'].str.len().plot(kind = 'hist', bins = 50)
# Total number of articles in both splits.
print(len(data_all))

In [None]:
# Bar chart of the number of articles per label, over both splits.
data_all['label'].value_counts().plot(kind = 'bar')

Further notebooks working with this dataset can be found on Kaggle: https://www.kaggle.com/datasets/tblock/10kgnad/code

In [None]:
# Count the missing values of every column once, then report them column by column.
missing_per_column = data_all.isnull().sum()
for col, n_missing in missing_per_column.items():
    print(f"Missing data for column [{col}]: {n_missing}")

In [None]:
# Articles per label for the full data and for each split, sorted by label name.
df_all = data_all["label"].value_counts().sort_index()
df_train = data_train["label"].value_counts().sort_index()
df_test = data_test["label"].value_counts().sort_index()

# Put the three counts side by side; `keys` supplies the column names directly.
df = pd.concat([df_all, df_train, df_test], axis=1, keys=["all", "train", "test"])
print(df)

In [None]:
def clean_text(text, keep_punctuation=False):
    """Clean raw article text before tokenization.

    Removes http(s) URLs, pic.twitter.com links and HTML tags, collapses runs
    of dots and of whitespace, replaces digits and other disallowed characters
    with blanks and drops stand-alone single characters.

    Args:
        text: The raw text to clean.
        keep_punctuation: If True, keep ``, . ! ?`` and the letters a-z plus
            the German ä, ö, ü and ß. If False, drop all punctuation and keep
            the wider Latin letter range À-ž.

    Returns:
        The cleaned text. A single leading or trailing blank is not stripped.
    """

    import re

    # Remove http(s) URLs together with one directly following blank. The blank
    # is optional so that a URL at the very end of the text is removed as well.
    text = re.sub(r"https?://\S+ ?", "", text)

    # Remove Twitter picture links (dots escaped so only the literal host matches)
    text = re.sub(r"pic\.twitter\.com/\w*", "", text)

    # Substitute two or more dots, optionally in parentheses, for a single dot
    text = re.sub(r"\(?[.][.]+\)?", ".", text)

    # Substitute HTML tags for a blank
    text = re.sub(r"<[^>]+>", " ", text)

    # Patterns for the characters to drop, depending on the punctuation rule
    if keep_punctuation:
        # ß is listed explicitly so that words like "Straße" stay intact,
        # consistent with the À-ž range used in the other branch.
        re_disallowed = re.compile(r"[^a-züöäß,.!?]", re.IGNORECASE)
        re_single_char = re.compile(r"\b[a-züöäß,.!?]\b", re.IGNORECASE)
    else:
        re_disallowed = re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE)
        re_single_char = re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE)

    # Replace every disallowed character (digits, symbols, ...) with a blank
    text = re_disallowed.sub(" ", text)
    # Replace single characters that stand between two word boundaries with a blank
    text = re_single_char.sub(" ", text)

    # Substitute any run of whitespace (tabs, newlines, ...) for one blank
    text = re.sub(r"\s+", " ", text)

    # Substitute double punctuation (left over after the previous substitutions) for one dot
    text = re.sub(r" [.,]+ [,.]+", ".", text)

    return text

In [None]:
# Drop a previously computed "text" column, if there is one, so this cell can be re-run.
data_all = data_all.drop(["text"], axis=1, errors='ignore')

# Clean every article with a plain pandas apply (not parallelized), keeping , . ! ? punctuation.
data_all["text"] =  data_all["text_original"].apply(clean_text, keep_punctuation = True)

In [None]:
# Show the first three rows, now including the cleaned "text" column.
data_all.head(3)

In [None]:
# Show only the cleaned text and its label for all rows.
data_all.loc[:,['text','label']]

In [None]:

# NOTE(review): AutoTokenizer and TFAutoModel are not used in the cells visible here - confirm they are needed.
from transformers import AutoTokenizer, TFAutoModel
# Load the pretrained tokenizer of the "bert-base-german-cased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")


In [None]:

# Fixed length (in tokens) that every encoded article is padded or truncated to.
MAXLEN = 192


def tokenize(data_all):
    """Encode one text into a fixed-length list of token ids.

    Args:
        data_all: The text to encode. Despite its name this is a single
            string, not the DataFrame of the same name.

    Returns:
        The ``input_ids`` of the encoding, padded or truncated to ``MAXLEN``.
    """
    encoding_options = dict(
        add_special_tokens=True,  # add `[CLS]` and `[SEP]`
        max_length=MAXLEN,  # length to truncate/pad to
        padding='max_length',  # always pad up to max_length
        truncation=True,
        # Only the token ids are requested; no attention mask, no token type ids.
        return_attention_mask=False,
        return_token_type_ids=False,
    )
    return tokenizer.encode_plus(text=data_all, **encoding_options)['input_ids']


In [None]:
# Rebuild the training split from the combined frame, keeping only the cleaned text and the label.
data_train = data_all[data_all['split'] == 'train'].loc[:,['text','label']]
data_train.head()

In [None]:
# Rebuild the test split from the combined frame, keeping only the cleaned text and the label.
data_test = data_all[data_all['split'] == 'test'].loc[:,['text','label']]
data_test.head()

In [None]:
# Encode every cleaned article into token ids; one row per article in each array.
# The loop variable gets its own name instead of reusing the DataFrame's name.
input_ids_train = np.array(
    [tokenize(article) for article in tqdm.tqdm(data_train['text'])]
)
input_ids_test = np.array(
    [tokenize(article) for article in tqdm.tqdm(data_test['text'])]
)

In [None]:

# Fit the label encoder on the labels of both splits so train and test share one encoding.
label_binarizer = LabelBinarizer()
label_binarizer.fit(data_all["label"])
print(f"Classes: {label_binarizer.classes_}")
# Encode each class once and print the result transposed.
print (f"Encoding:\n {label_binarizer.transform(label_binarizer.classes_).T}")

In [None]:
# Model inputs are the token-id arrays; targets are the binarized labels of each split.
train_ids = input_ids_train
test_ids = input_ids_test
train_labels = label_binarizer.transform(data_train['label'])
test_labels = label_binarizer.transform(data_test['label'])

In [None]:
#train_labels.drop_duplicates()

In [None]:
# Training hyperparameters.
BATCH_SIZE = 16  # examples per batch of the tf.data pipelines
EPOCHS = 8  # maximum number of training epochs
LEARNING_RATE = 1e-5  # learning rate passed to the Adam optimizer

In [None]:
# Training pipeline: shuffle over the whole training set (reshuffled on every pass),
# repeat once per epoch, then cut into batches.
train_dataset = (tf.data.Dataset.from_tensor_slices((train_ids, train_labels))
                    .shuffle(buffer_size=len(train_ids), reshuffle_each_iteration=True)
                    .repeat(EPOCHS)
                    .batch(BATCH_SIZE))

# Evaluation pipeline: batched only, original order kept.
test_dataset = (tf.data.Dataset.from_tensor_slices((test_ids, test_labels))
                    .batch(BATCH_SIZE))


In [None]:
# The output layer needs one unit per column of the binarized label matrix.
# Counting the distinct label rows of the training split instead gives a different
# number when a class is missing from that split or when there are only two classes.
NUM_CLASSES = train_labels.shape[1]
NUM_CLASSES

In [None]:
def build_model(max_len=MAXLEN):
    """Put a multiclass classification head on the pretrained German BERT encoder.

    Args:
        max_len: Length of the token-id sequences the model accepts.

    Returns:
        An uncompiled Keras model that maps int32 token ids of shape
        ``(max_len,)`` to ``NUM_CLASSES`` softmax outputs.
    """
    token_ids = tf.keras.layers.Input(
        shape=(max_len,), dtype=tf.int32, name="input_word_ids"
    )

    encoder = TFBertModel.from_pretrained("bert-base-german-cased")
    # The encoder returns the last hidden state at index 0 and the pooler
    # output at index 1; only the pooler output is used as text embedding.
    # NOTE(review): the encoder is called without an attention mask although the
    # inputs are padded - confirm this is intended.
    text_embedding = encoder(token_ids)[1]

    # Classification head: one hidden ReLU layer followed by a softmax
    # layer with one unit per class.
    hidden_features = tf.keras.layers.Dense(128, activation='relu')(text_embedding)
    class_probabilities = tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden_features)

    return tf.keras.models.Model(inputs=token_ids, outputs=class_probabilities)


In [None]:
# Build the classifier for sequences of MAXLEN token ids and print its layer overview.
model = build_model(max_len=MAXLEN)
model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot label rows and the softmax output.
loss = "categorical_crossentropy"
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"],
    jit_compile=True,
)

In [None]:
from datetime import datetime  # kept here so this cell also runs on its own

# Write the TensorBoard logs of every run to its own timestamped directory.
log_dir = 'logs/' + datetime.now().strftime("%Y%m%d_%H%M%S")

hist = model.fit(
    train_dataset,
    # No batch_size argument: train_dataset is a tf.data.Dataset that is already
    # batched, and Keras documents that batch_size must not be given for datasets.
    epochs=EPOCHS,
    validation_data=test_dataset,
    # Whole batches per pass over the training data (the dataset itself repeats EPOCHS times).
    steps_per_epoch=len(input_ids_train) // BATCH_SIZE,
    verbose=1,
    callbacks=[
        # NOTE(review): early stopping watches the *training* accuracy; a held-out
        # validation split (not the test set) may be the better signal - confirm.
        tf.keras.callbacks.EarlyStopping(monitor="accuracy", verbose=1, patience=1, restore_best_weights=True),
        tf.keras.callbacks.TensorBoard(log_dir=log_dir),
    ],
)

Epoch 1/8
Epoch 2/8
 91/577 [===>..........................] - ETA: 4:49 - loss: 0.2602 - accuracy: 0.9190

In [None]:
# Disabled on purpose: the commands below sit inside a string literal and are not executed.
"""
!pip install session-info
import session_info
session_info.show()
"""


In [None]:
# One row per epoch with every metric recorded during fit, indexed by epoch number.
history = pd.DataFrame({'epoch': hist.epoch, **hist.history}).set_index('epoch')
history

Result of the previous run, for comparison (presumably loss, accuracy, val_loss, val_accuracy): 0.014772, 0.995451, 0.537464, 0.899805

In [None]:
# Plot all recorded metrics over the epochs.
history.plot()

In [None]:
# Model outputs (one softmax row per test article).
result = model.predict(test_ids)


In [None]:
# Turn the model outputs into 0/1 predictions by thresholding at 0.5.
# `result` is overwritten with the boolean mask, so the raw outputs are no longer available afterwards.
# NOTE(review): with softmax outputs a row whose largest value is <= 0.5 gets no predicted
# class at all; taking the argmax per row may be what is intended - confirm.
result = result > .5
y_pred = result.astype(int)
y_true = test_labels

In [None]:
# Class names in the column order used by the label binarizer.
l = list(label_binarizer.classes_)

In [None]:
# One 2x2 confusion matrix per class (each class treated as its own yes/no problem).
conf = multilabel_confusion_matrix(y_true, y_pred)

conf

In [None]:
import pickle

# Bundle the trained model, the thresholded test predictions and the label encoder.
tf_model_from_google = { "model": model, "result_test": result, 'label_binarizer' : label_binarizer }

# The context manager closes the file even if pickling fails.
# NOTE(review): the target is a user-specific absolute Windows path - adjust it before running elsewhere.
with open(r"C:\Users\reto.steiner\Desktop\NLP\nlp.p", "wb") as pickle_file:
    pickle.dump(tf_model_from_google, pickle_file)