In [1]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import re, random, spacy
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from nltk import word_tokenize
from nltk.util import ngrams
from collections import defaultdict, Counter, OrderedDict
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup, BertTokenizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy.spatial.distance import cosine
from yellowbrick.cluster import SilhouetteVisualizer

2024-04-14 21:50:05.356867: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Load the sentence-level statistics table (tab-separated, UTF-8).
df = pd.read_csv(
    "../data/feature_statistics/sentence_level_statistics.tsv",
    encoding="utf-8",
    sep="\t",
)

# Quick sanity summary: overall size, then how the labels are distributed.
for summary in (df.shape, df["label"].value_counts()):
    print(summary)

df.head()

(472484, 32)
label
0    194969
1    177716
2     99799
Name: count, dtype: int64


Unnamed: 0,id,sentence_index,label,text,text_preprocessed,total_token_count,boosters,hedges,adverbs_for_iteration_or_continuation,scalar_particles,...,legal,morality,policy,politics,public_opinion,security,welfare,topoi_of_natural_disaster,topoi_of_abuse_and_tragedy,every_xth
0,BildBund_09012016_141190358.xml,0,0,Die Berlinale ( 11. - 21. 2. ) wird in diesem ...,der Berlinale -- 11. -- 21. 2. -- werden in di...,18,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,BildBund_09012016_141190358.xml,1,0,Berlinale-Chef Dieter Kosslick zu [ NEWSPAPE...,Berlinale-Chef Dieter Kosslick zu [ NEWSPAPE...,22,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,BildBund_09012016_141190358.xml,2,0,"""",--,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,BildBund_09012016_141190358.xml,3,0,Unter den diesjährigen internationalen Stars w...,unter der diesjährig international Star werden...,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,BildOnline_28092016_0844Uhr441.xml,0,0,In Deutschland leben einem Bericht zufolge geg...,in Deutschland leben ein Bericht zufolge gegen...,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Read the German stopword list, one entry per line; lines starting with ";"
# are treated as comments and skipped.
# The context manager guarantees the file handle is closed, and the explicit
# encoding keeps umlauts intact regardless of the platform default (the other
# files in this notebook are read and written as UTF-8 as well).
with open("../data/german_stopwords_plain.txt", "r", encoding="utf-8") as f:
    german_stopwords = [line.strip("\n") for line in f if not line.startswith(";")]

# Iterative adverbs whose sentences are selected for the analysis below.
keywords = ["schon wieder", "immer wieder", "erneut"]

In [6]:
# Select every sentence whose preprocessed text contains at least one keyword.
# One vectorised, case-sensitive substring search replaces the row-by-row
# Python loop: each row is tested once (so a sentence matching several keywords
# is no longer selected twice), missing texts count as "no match", and the
# boolean mask works for any index, not only a default RangeIndex.
keyword_pattern = "|".join(re.escape(w) for w in keywords)
mask_selected = df["text_preprocessed"].str.contains(keyword_pattern, regex=True, na=False)

# Kept under its original name; now holds each matching index label exactly once.
idx_selected = df.index[mask_selected].tolist()

df_selected = df.loc[mask_selected]
# Rows that are identical in every column are still collapsed, as before.
df_selected = df_selected.drop_duplicates()
print(df_selected.shape)
print(df_selected["label"].value_counts())

(2690, 32)
label
0    1408
1     653
2     629
Name: count, dtype: int64


In [7]:
# Load the spaCy pipeline "de_core_news_sm"; the next cell uses it to obtain
# lemmas (token.lemma_) and fine-grained tags (token.tag_) for each sentence.
nlp = spacy.load("de_core_news_sm")

In [8]:
# Lemmas that are dropped in addition to german_stopwords: the parts of the
# iterative adverbs themselves and stray square brackets.
# ("wieder" is already included in german_stopwords.)
_EXTRA_REMOVALS = {"schon", "immer", "erneut", "[", "]"}

# Tokens whose tag starts with one of these prefixes are dropped.
# "$..." covers the punctuation tags; PAV/PWAV/PRELAT remove words such as
# "deren", "dafür", "warum", "weshalb" that the stopword list does not cover.
_DROPPED_TAG_RE = re.compile(r"(\$|CARD|FM|PAV|PWAV|PRELAT)")


def _find_separated_particle_verbs(lemmata, pos):
    """Locate particle verbs whose particle is separated from the finite verb.

    Scans the tags left to right, remembering the most recent "VVFIN" and
    "PTKVZ" positions. As soon as a particle is found after a finite verb the
    two are paired and the search starts afresh.

    Parameters:
        lemmata: list of lemma strings for one sentence.
        pos: list of tag strings, parallel to ``lemmata``.

    Returns:
        (pairs, consumed): ``pairs`` is a list of ``(particle, verb_stem)``
        lemma tuples in order of discovery; ``consumed`` is the set of token
        positions that belong to those pairs.
    """
    pairs = []
    consumed = set()
    idx_vfin = None
    idx_ptkvz = None

    for i, tag in enumerate(pos):
        if tag == "VVFIN":
            idx_vfin = i
        if tag == "PTKVZ":
            idx_ptkvz = i
        # Compare against None explicitly: position 0 is a valid index and
        # must not be mistaken for "no finite verb seen yet".
        if idx_vfin is not None and idx_ptkvz is not None and idx_ptkvz > idx_vfin:
            pairs.append((lemmata[idx_ptkvz], lemmata[idx_vfin]))
            consumed.update((idx_vfin, idx_ptkvz))
            idx_vfin = None
            idx_ptkvz = None

    return pairs, consumed


def _clean_lemmata(lemmata, pos, stopwords):
    """Return the cleaned lemma list for one sentence.

    The result is the list of lemmata with
        - iterative adverbs removed,
        - stopwords removed,
        - punctuation and the other tags matched by ``_DROPPED_TAG_RE`` removed,
        - lemmas starting with "NEWSPAPER-NAME" removed,
        - separated particle verbs removed from their original positions and
          appended at the end, re-assembled as one word (particle + verb stem).

    Parameters:
        lemmata: list of lemma strings for one sentence.
        pos: list of tag strings, parallel to ``lemmata``.
        stopwords: container of stopword strings (a set is fastest).

    Returns:
        list of the remaining lemma strings.
    """
    pairs, consumed = _find_separated_particle_verbs(lemmata, pos)

    cleaned = [
        lemma
        for i, (lemma, tag) in enumerate(zip(lemmata, pos))
        if i not in consumed
        and lemma not in stopwords
        and lemma not in _EXTRA_REMOVALS
        and not lemma.startswith("NEWSPAPER-NAME")
        and _DROPPED_TAG_RE.match(tag) is None
    ]
    cleaned.extend(particle + stem for particle, stem in pairs)
    return cleaned


# Build the lookup set once instead of scanning the stopword list per token.
stopword_set = set(german_stopwords)

text_preprocessed_cleaned = []

for idx, row in df_selected.iterrows():
    doc = nlp(row["text"])
    lemmata = [token.lemma_ for sent in doc.sents for token in sent]
    pos = [token.tag_ for sent in doc.sents for token in sent]

    lemmata_cleaned_PVassembled = _clean_lemmata(lemmata, pos, stopword_set)

    # ----- Lines for checking results: -----
    #print(row["text"])
    #print(lemmata_cleaned_PVassembled)
    #print("-----\n")

    text_preprocessed_cleaned.append(" ".join(lemmata_cleaned_PVassembled))

# Re-running this cell would otherwise fail because the column already exists.
if "text_preprocessed_cleaned" in df_selected.columns:
    df_selected = df_selected.drop(columns="text_preprocessed_cleaned")
df_selected.insert(5, "text_preprocessed_cleaned", text_preprocessed_cleaned)
df_selected

Unnamed: 0,id,sentence_index,label,text,text_preprocessed,text_preprocessed_cleaned,total_token_count,boosters,hedges,adverbs_for_iteration_or_continuation,...,legal,morality,policy,politics,public_opinion,security,welfare,topoi_of_natural_disaster,topoi_of_abuse_and_tragedy,every_xth
268,SZ_08052014_A57073089.xml,22,2,Bislang hatten die Innenminister immer wieder ...,bislang haben der Innenminister immer wieder z...,Innenminister bestimmt Stichtag Bleiberecht ge...,25,0,0,1,...,2,0,1,0,0,0,0,0,0,0
360,BildOnline_27022017_1538Uhr250.xml,21,0,Der Direktor der EU-Grenzschutzagentur Frontex...,der Direktor der EU-Grenzschutzagentur Frontex...,Direktor EU-Grenzschutzagentur Frontex Fabrice...,41,0,0,1,...,0,0,0,0,0,0,0,0,0,0
458,BildBund_22102015_139422408.xml,1,0,Nachdem die als Zeugen vernommenen Flüchtlinge...,nachdem der als Zeuge vernommen Flüchtling sic...,Zeuge vernommen Flüchtling Widerspruch verstri...,20,0,0,1,...,2,0,0,0,0,0,0,0,0,0
560,BildOnline_12122016_1004Uhr412.xml,8,0,Am Sonntagnachmittag brach erneut eine Schläge...,an Sonntagnachmittag brechen erneut ein Schläg...,Sonntagnachmittag Schlägerei ausbrechen,8,0,0,1,...,0,0,0,0,0,1,0,0,0,0
563,BildOnline_12122016_1004Uhr412.xml,11,0,Nach zwei Stunden kam es erneut zu einer handf...,nach zwei Stunde kommen es erneut zu ein handf...,Stunde handfest Streitigkeit Person Heimbewohn...,20,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471148,BildOnline_19072014_1423Uhr12.xml,0,0,Schon wieder eine Schreckensmeldung von der it...,schon wieder ein Schreckensmeldung von der ita...,Schreckensmeldung italienisch Mittelmeerinsel ...,19,0,0,1,...,0,1,0,0,0,2,0,0,0,0
471777,BildOnline_13092015_2330Uhr579.xml,9,0,Und in der Flüchtlingskrise zeigte sich Kanzle...,und in der Flüchtlingskrise zeigen sich Kanzle...,Flüchtlingskrise zeigen Kanzlerin Merkel Muste...,17,0,0,1,...,0,0,1,1,0,0,0,0,0,0
471797,FAZfaz_17092015_FD2201509174675448.xml,8,1,Das kurze Techtelmechtel der französischen Lin...,der kurz Techtelmechtel der französisch linke ...,kurz Techtelmechtel französisch linke Kanzleri...,20,0,0,1,...,0,0,0,1,0,0,0,0,0,0
471980,FAZfaz_16092015_FD2201509164674661.xml,15,1,Der Zugverkehr nach Deutschland muss immer wie...,der Zugverkehr nach Deutschland muss immer wie...,Zugverkehr Deutschland unterbrechen Flüchtling...,26,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Write the selected sentences (including the text_preprocessed_cleaned column
# inserted above) to a tab-separated UTF-8 file, without the index column.
# NOTE(review): assumes the ./output directory already exists — confirm.
df_selected.to_csv("./output/iteradv_sentences.tsv", sep="\t", encoding="utf-8", index=False)