In this notebook, the annotated data set is processed for the Exploratory Data Analysis and for the implementation of the automatic detection task using a Keras language model (https://keras.io/guides/sequential_model/).

In [None]:
#importing the necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.utils import set_random_seed

In [None]:
#downloading the large German spaCy pipeline (de_core_news_lg) through the spaCy command line;
#the leading "!" makes the notebook run this line as a shell command
!python -m spacy download de_core_news_lg

In [None]:
#instantiating the German pipeline; nlp is used further below to lemmatize the compounds
nlp = spacy.load(name='de_core_news_lg')

# Preprocessing

## Creating different DataFrames for different tasks

In [None]:
#loading the annotated compounds from the Kaggle input directory
df_og = pd.read_csv(filepath_or_buffer='/kaggle/input/completed-annotation/all_annotated.csv')

#previewing the first rows
df_og.head()

In [None]:
#splitting the annotated data by class so that each group can be handled separately:
#Annotation == 1 -> enigmatic compounds (df_en), Annotation == 0 -> descriptive compounds (df_de)
df_en = df_og.loc[df_og['Annotation'] == 1]
df_de = df_og.loc[df_og['Annotation'] == 0]

#previewing the first rows of both subsets
print("DataFrame with Annotation == 1:")
print(df_en.head())

print("\nDataFrame with Annotation == 0:")
print(df_de.head())


## Cleaning the compounds (removing hyphens, lowercasing and lemmatizing them)

### Modifying the original DataFrame

In [None]:
#cleaning the text, i.e., making it lowercase, removing hyphens from the compounds, etc.
import re

#working on a deep copy so that df_og keeps the compounds as they were annotated
df_mod = df_og.copy(deep=True)

#removing hyphens from hyphenated compounds
def modify_compound(row):
    """Return the normalized spelling of the compound stored in ``row['Compound']``.

    Every hyphen is removed so that the constituents are merged into one word,
    and the result is lowercased with only its first letter capitalized, e.g.
    'Flüchtlings-Krise', 'FLÜCHTLINGSKRISE' -> 'Flüchtlingskrise'.

    Parameters
    ----------
    row : mapping with a 'Compound' key (e.g. a DataFrame row)
        The record whose compound should be normalized.

    Returns
    -------
    str
        The hyphen-free, capitalized compound.
    """
    word = row['Compound']
    #dropping all hyphens at once: compounds with more than one hyphen
    #(e.g. 'Corona-Impf-Pflicht') or with a leading/trailing hyphen are
    #normalized completely, not only around the first hyphen
    merged_word = word.replace('-', '')
    #lowercasing first also handles compounds written in all caps
    return merged_word.lower().capitalize()

#overwriting the 'Compound' column with the normalized spelling, computed row by row
df_mod['Compound'] = df_mod.apply(modify_compound, axis='columns')

print(df_mod)

In [None]:
#replacing each compound by its lemma (underlying form without plural ending, etc.);
#only the lemma of the first token that spaCy finds in the compound is kept
df_mod['Compound'] = df_mod['Compound'].map(lambda compound: nlp(compound)[0].lemma_)
print(df_mod.head())

### modifying the Enigmatic DataFrame

In [None]:
#copying the enigmatic subset so that df_en itself stays untouched
df_mod_en = df_en.copy()

#normalizing the compounds with modify_compound, the helper defined in the cell that
#builds df_mod; it is reused here instead of being defined a second time, so that
#all DataFrames are guaranteed to be normalized by the same code
df_mod_en['Compound'] = df_mod_en.apply(modify_compound, axis=1)

print(df_mod_en)

In [None]:
#replacing each enigmatic compound by its lemma (underlying form without plural ending, etc.);
#only the lemma of the first token that spaCy finds in the compound is kept
df_mod_en['Compound'] = df_mod_en['Compound'].map(lambda compound: nlp(compound)[0].lemma_)
print(df_mod_en)

### Modifying the descriptive DataFrame

In [None]:
#copying the descriptive subset so that df_de itself stays untouched
df_mod_de = df_de.copy()

#normalizing the compounds with modify_compound, the helper defined in the cell that
#builds df_mod; it is reused here instead of being defined a third time, so that
#all DataFrames are guaranteed to be normalized by the same code
df_mod_de['Compound'] = df_mod_de.apply(modify_compound, axis=1)

print(df_mod_de)

In [None]:
#replacing each descriptive compound by its lemma (underlying form without plural ending, etc.);
#only the lemma of the first token that spaCy finds in the compound is kept
df_mod_de['Compound'] = df_mod_de['Compound'].map(lambda compound: nlp(compound)[0].lemma_)
print(df_mod_de)

# Statistical Analysis

## Exploratory Data Analysis

In the following part of the notebook, I am performing an Exploratory Data Analysis (EDA) to get a better insight into the data set and the statistical distribution of the compounds. Specifically, the following distributions will be calculated:

* the total number of compounds (enigmatic vs. descriptive) in each newspaper
* the most frequent compounds (enigmatic vs. descriptive) overall and for each newspaper
* the distribution of compounds (enigmatic vs. descriptive) across the years


As the compounds were normalized above, different forms of the same compound are counted as 1 instance of the base form when calculating the most frequent compounds. This is necessary for an accurate calculation, as for example, Flüchtlingskrise, Flüchtlings-Krise, and FLÜCHTLINGSKRISE should all be counted as an instance of the same token.

# Distribution of enigmatic vs. descriptive

## Total Number of enigmatic vs. descriptive Compounds by Newspaper

In [None]:
#number of enigmatic compounds per newspaper:
#one group per 'Source' value, size() counts the rows of each group
comp_en_news = (
    df_en
    .groupby(by='Source')['Compound']
    .size()
)

print("Total number of enigmatic Compounds from each newspaper:", "\n", comp_en_news)

In [None]:
#number of descriptive compounds per newspaper:
#one group per 'Source' value, size() counts the rows of each group
comp_de_news = (
    df_de
    .groupby(by='Source')['Compound']
    .size()
)

print("Total number of descriptive Compounds from each newspaper:", "\n", comp_de_news)

## Total Number of enigmatic vs. descriptive Compounds by Year

In [None]:
#number of enigmatic compounds per year:
#one group per 'Year' value, size() counts the rows of each group
comp_year_en = (
    df_en
    .groupby(by='Year')['Compound']
    .size()
)

print("Total number of enigmatic Compounds from each year:", "\n", comp_year_en)

In [None]:
#number of descriptive compounds per year:
#one group per 'Year' value, size() counts the rows of each group
comp_year_de = (
    df_de
    .groupby(by='Year')['Compound']
    .size()
)

print("Total number of descriptive Compounds from each year:", "\n", comp_year_de)

## Most frequent compounds (enigmatic vs. descriptive)

### Enigmatic

In [None]:
#frequency of every normalized enigmatic compound
en_counts = df_mod_en['Compound'].value_counts()

#the 10 compounds with the highest frequency
en_all = en_counts.nlargest(n=10)

print("The 10 most frequent enigmatic compounds are:", "\n", en_all)

In [None]:
#visualizing the results using a horizontal bar plot;
#sorting in ascending order places the most frequent compound at the top of the chart
en_all_sorted = en_all.sort_values(ascending=True)

plt.barh(en_all_sorted.index, en_all_sorted.values, color='pink')
plt.xlabel('Frequency')
plt.ylabel('Compound')
#naming the class in the title so that this figure cannot be confused
#with the otherwise identical plot of the descriptive compounds
plt.title('Top 10 Most Frequent enigmatic Compounds')
plt.grid(True)

plt.show()

### Descriptive

In [None]:
#frequency of every normalized descriptive compound
de_counts = df_mod_de['Compound'].value_counts()

#the 10 compounds with the highest frequency
de_all = de_counts.nlargest(n=10)

print("The 10 most frequent descriptive compounds are:", "\n", de_all)

In [None]:
#horizontal bar plot of the 10 most frequent descriptive compounds;
#sorting in ascending order places the most frequent compound at the top of the chart
de_all_sorted = de_all.sort_values(ascending=True)

fig, ax = plt.subplots()
ax.barh(de_all_sorted.index, de_all_sorted.values, color='pink')
ax.set_title('Top 10 Most Frequent descriptive Compounds')
ax.set_xlabel('Frequency')
ax.set_ylabel('Compound')
ax.grid(True)

plt.show()

## Most common compounds by newspaper

### Enigmatic

In [None]:
#counting how often each enigmatic compound occurs within each newspaper
comp_en_news = df_mod_en.groupby(by='Source')['Compound'].value_counts()

#keeping the 5 most frequent compounds of every newspaper; the grouped nlargest adds
#an outermost index level (presumably a repeated newspaper level - verify on the output),
#which is dropped again
year_en = (
    comp_en_news
    .groupby(level=0)
    .nlargest(n=5)
    .droplevel(level=0)
)

print("The most frequent enigmatic compounds for each newspaper:", "\n", year_en)

### Descriptive

In [None]:
#counting how often each descriptive compound occurs within each newspaper
comp_de_news = df_mod_de.groupby(by='Source')['Compound'].value_counts()

#keeping the 5 most frequent compounds of every newspaper; the grouped nlargest adds
#an outermost index level (presumably a repeated newspaper level - verify on the output),
#which is dropped again
year_de = (
    comp_de_news
    .groupby(level=0)
    .nlargest(n=5)
    .droplevel(level=0)
)

print("The most frequent compounds for each newspaper:", "\n", year_de)

# Most common compounds by newspaper and by year

## Enigmatic

In [None]:
#counting how often each enigmatic compound occurs per year and newspaper
comp_en_years = df_mod_en.groupby(['Year', 'Source'])['Compound'].value_counts()

#keeping the 5 most frequent compounds of every (year, newspaper) group, in line with
#the printed label and with the descriptive counterpart of this cell;
#reset_index(drop=True) then removes the two outermost index levels
years_en = comp_en_years.groupby(level=[0, 1]).nlargest(5).reset_index(level=[0, 1], drop=True)

print("The 5 most frequent enigmatic compounds for each newspaper and each year:", years_en)

## Descriptive

In [None]:
#counting how often each descriptive compound occurs per year and newspaper
comp_de_years = df_mod_de.groupby(by=['Year', 'Source'])['Compound'].value_counts()

#keeping the 5 most frequent compounds of every (year, newspaper) group
#and removing the two outermost index levels again
years_de = (
    comp_de_years
    .groupby(level=[0, 1])
    .nlargest(n=5)
    .droplevel(level=[0, 1])
)

print("The 5 most frequent descriptive compounds for each newspaper and each year:", years_de)

## Stat Analysis

In the next step, a chi-square test is performed to determine if there are significant differences in the distribution of enigmatic and descriptive compounds between newspapers. For this, I am using the library SciPy (https://scipy.org/).

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

#cross-tabulating the newspapers (rows) against the annotation classes (columns)
contingency_table = pd.crosstab(index=df_og['Source'], columns=df_og['Annotation'])

#running the Chi-squared test; only the first two returned values
#(test statistic and p-value) are needed here
chi2, p = chi2_contingency(contingency_table)[:2]
print("Chi-squared value:", chi2)
print("p-value:", p)

With an alpha level of 0.05 and a very low p-value of 5.06e-72, there is a significant association between the class of compounds and the newspapers. Thus, the Null Hypothesis (that the class of a compound is independent of the newspaper) can be rejected with very high certainty: if it were true, a distribution at least as uneven as the observed one would be extremely unlikely. This means that the proportion of descriptive and enigmatic compounds differs systematically between the newspapers, i.e., the newspaper is informative about which class of compound is used. This is in line with previous observations of the reporting style of tabloid media, such as BILD, in contrast to the more serious and less sensational reporting style of quality newspapers, such as FAZ and SZ (see Chapter 4). To sum up, this adds a nuance to the differences in the use of enigmatic compounds between the three newspapers.

# The Language Model

# Preprocessing for the Automated Detection Task

Although removing missing values (NAs) is common practice in the preprocessing step of NLP tasks, it is not necessary here: the data was manually annotated, thus there will be no missing values that need to be removed in either DataFrame.

Due to the nature of the ECs, this data does not yield a huge number of them. This became evident already during the manual annotation, but to visualize this, the difference in class balance is illustrated in the following:

In [None]:
#counting the compounds per class (0 = 'descriptive', 1 = 'enigmatic')
#and showing the counts as a bar chart
counts = df_mod['Annotation'].value_counts()
counts.plot.bar(color=['orange', 'blue'])
plt.title('Distribution of Classes')
plt.xlabel('Label')
plt.ylabel('Count')
#keeping the class labels on the x-axis horizontal
plt.xticks(rotation=0)
plt.show()

Working with imbalanced data is generally not recommended, as it can lead to a preference for the majority class (in this case the descriptive compounds) as well as overfitting. 
There are two alternatives to deal with imbalanced data: over- and undersampling. In cases of extreme class imbalances, oversampling would create an impractical number of fabricated examples, thus undersampling is the more practical alternative. This means that the data set of the descriptive compounds was drastically reduced to match the size of the minority class (see also Section 5.3).

In [None]:
#number of rows in the descriptive DataFrame
df_mod_de.shape[0]

In [None]:
#number of rows in the enigmatic DataFrame
df_mod_en.shape[0]

In [None]:
#removing random samples from the descriptive DataFrame to match with the size
#of the enigmatic DataFrame
import pandas as pd
import numpy as np

#target size: as many descriptive compounds as there are enigmatic ones
len_en = df_mod_en.shape[0]

#undersampling only if the descriptive class really is the larger one
if df_mod_de.shape[0] > len_en:
    #seeding NumPy's global random generator as well
    #(the later np.random.shuffle call draws from this generator)
    np.random.seed(42)
    #drawing len_en random rows; random_state makes the draw repeatable
    df_mod_de = df_mod_de.sample(n=len_en, random_state=42)

In [None]:
#number of rows in the descriptive DataFrame after the undersampling step
df_mod_de.shape[0]

As the DataFrames have already been cleaned and lemmatized, they will now be tokenized and further processed so that they can be used as input for the language model. Generally, this includes converting the tokenized text to sequences and then padding those, and splitting the data into training and test sets.

In [None]:
#combining the information from both the descriptive as well as enigmatic 
#DataFrames into the text column of the new DataFrame df_LM
import pandas as pd

# Stack the descriptive rows on top of the enigmatic rows, keeping only the
# columns needed for the model; ignore_index=True numbers the rows anew from 0
df_LM = pd.concat(
    [frame[['Compound', 'Title', 'Annotation']] for frame in (df_mod_de, df_mod_en)],
    ignore_index=True,
)

## Tokenization

For the tokenization, a general process for NLP tasks is implemented, including defining the input dimensions and using the built-in keras Tokenizer (https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer) used for language models.
As the labels are already binary, there is no need to one-hot encode them for this model, which would otherwise be one of the requirements for using them. Thus, the labels are merely transformed into a NumPy array, to ensure that all of the tensors have a compatible shape.

In [None]:
import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#defining the lengths of the input
#using at most 100 tokens of each headline (the headlines are not longer than 100 words)
max_len = 100
#keeping only the 10k most frequent words of the data set
vocab_size = 10000

#providing both the titles as well as the compounds as input data;
#the space in between keeps the compound a token of its own instead of
#gluing it onto the last word of the headline
text = df_LM['Title'] + ' ' + df_LM['Compound']
#transforming the labels from the column 'Annotation' into a numpy array
labels = df_LM['Annotation'].values

#tokenizing with the Tokenizer class imported at the top of this cell
tokenizer = Tokenizer(num_words=vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
#creating the mapping between words and integers
tokenizer.fit_on_texts(text)
#converting each text into a sequence of integers -> model input
sequences = tokenizer.texts_to_sequences(text)

#creating a dictionary where each word is a key
#and the index is the corresponding value
word_index = tokenizer.word_index

#padding (or truncating) the sequences, so that they all have the length max_len
data = pad_sequences(sequences, maxlen=max_len)

#shuffling the samples and the labels with the same permutation so that they stay aligned;
#the generator is seeded right here so that the order does not depend on
#whether (and when) an earlier cell seeded it
np.random.seed(42)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

## Training and test data split

For the data split, the entire data set is first split into training and test sets. Then, the training set is split into a smaller training and validation set. This process is most commonly used for this type of classification task, as it retains comparability. The function train_test_split from the sklearn library (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) is used for this. It has predefined parameters according to which it splits the data.

The random_state parameter was set to 1, so for every iteration of the model, the data split stays the same.

In [None]:
from sklearn.model_selection import train_test_split

#features and targets
X, y = data, labels

#holding out 20% of all samples as the test set
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

#holding out 20% of the remaining samples as the validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=1)

# Model Definition and Compiling
As mentioned before, the data set consists of two classes: enigmatic (1) and descriptive (0). Thus, the output layer of the model is a single sigmoid unit, i.e., a Logistic Regression (LR) on top of the representation learned by the embedding and bidirectional LSTM layers, as LR is specifically designed for binary classification. LR calculates a probability estimate between 0 and 1 for each item, indicating which class it belongs to. The closer the probability score is to 1, the higher the confidence of the model that a specific instance belongs to class 1, i.e., the enigmatic compounds, in this case.
The output of a Logistic Regression is easy to interpret due to this simple classification.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout 

#seeding Python, NumPy and TensorFlow in one call so that the initial weights are
#reproducible (np.random.seed alone does not seed the Keras weight initializers)
set_random_seed(115)

#defining the model
#dimensionality of the embedding vector that is learned for each word
out_dim=20

#using a sequential model
model = Sequential(name = "model")
#embedding layer -> bidirectional LSTM (10 units per direction) -> dense layer with one sigmoid unit
model.add(Embedding(vocab_size,out_dim,input_length=max_len))
model.add(Bidirectional(LSTM(10 )))
model.add(Dense(1,activation='sigmoid'))

#compiling the model, i.e., setting the loss, the optimizer and the reported metric
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

#summary() prints the layer table itself and returns None,
#so wrapping it in print() would only add a stray "None" line
model.summary()

# Fitting the model

In [None]:
#training for 10 epochs in mini-batches of 32 samples; the validation data is passed along
#for monitoring, and the return value of fit is kept in history for the plots below
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

# Plotting the model's performance

In [None]:
#per-epoch metrics recorded during training
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

#x-axis: epoch numbers starting at 1
epochs = range(1, len(loss) + 1)

#first figure: training vs. validation accuracy
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'k', linestyle = 'dashdot', label='Training acc')
ax_acc.plot(epochs, val_acc, 'm', linewidth = '3', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel("epochs")
ax_acc.set_ylabel("accuracy")
ax_acc.legend()

#second figure: training vs. validation loss
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'k', linestyle = 'dashdot', label='Training loss')
ax_loss.plot(epochs, val_loss, 'm', linewidth = '3', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel("epochs")
ax_loss.set_ylabel("loss")
ax_loss.legend()

plt.show()

# Evaluation of the model

In [None]:
#scoring the trained model on the held-out test set; results keeps what evaluate returns
#(presumably the loss and the accuracy, as configured in compile - verify on the output)
print('Evaluating the model on the test data:')
results = model.evaluate(x=X_test, y=y_test)

In [None]:
#predicting on the test data and rounding the sigmoid outputs
#to the nearest integer to obtain the class labels 0 and 1
y_pred = np.round(model.predict(X_test), decimals=0)

print('confusion matrix:',confusion_matrix(y_test, y_pred))
print('classification report:',classification_report(y_test,y_pred, target_names=['0', '1']))