In [1]:
import torch

# Pick the compute device, falling back to the CPU when CUDA is unavailable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Index of the GPU this notebook is meant to run on.
GPU_INDEX = 1

# Only select that GPU when it actually exists: an unconditional
# torch.cuda.set_device(1) raises on CPU-only and single-GPU machines,
# which defeats the CPU fallback above.
if torch.cuda.is_available() and torch.cuda.device_count() > GPU_INDEX:
    torch.cuda.set_device(GPU_INDEX)

# Display the active CUDA device index (None when running on the CPU).
torch.cuda.current_device() if torch.cuda.is_available() else None

1

In [3]:
import os, re
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from gensim.models import Word2Vec, Phrases, KeyedVectors

# 1. Preprocessing

In [4]:
# Load the document-level feature table (tab-separated, UTF-8).
df = pd.read_csv(
    "../data/feature_statistics/document_level_statistics.tsv",
    sep="\t",
    encoding="utf-8",
)

# Number of documents per label, followed by a preview of the frame itself.
label_counts = df.value_counts("label")
print(label_counts)
df

label
0    12107
1     6686
2     4536
Name: count, dtype: int64


Unnamed: 0,id,label,text,text_preprocessed,total_token_count,boosters,hedges,adverbs_for_iteration_or_continuation,scalar_particles,factive_verbs,...,legal,morality,policy,politics,public_opinion,security,welfare,topoi_of_natural_disaster,topoi_of_abuse_and_tragedy,every_xth
0,BildBund_09012016_141190358.xml,0,Die Berlinale ( 11. - 21. 2. ) wird in diesem ...,der Berlinale -- 11. -- 21. 2. -- werden in di...,53,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,BildOnline_28092016_0844Uhr441.xml,0,In Deutschland leben einem Bericht zufolge geg...,in Deutschland leben ein Bericht zufolge gegen...,119,0,0,0,0,0,...,0,0,1,1,0,2,0,0,0,0
2,SZ_28072015_A60857742.xml,2,Baden-Württembergs grüner Ministerpräsident Wi...,Baden-Württemberg grün Ministerpräsident Winfr...,461,0,2,0,0,0,...,1,2,10,21,2,1,1,0,0,0
3,FAZfaz_08072016_FD2201607084896611.xml,1,Der Zuspruch zu einer Willkommenskultur Einw...,der Zuspruch zu ein Willkommenskultur Einwan...,97,0,0,0,0,0,...,1,3,6,0,0,0,0,0,0,0
4,FAZfaz_23112015_FD1201511234725141.xml,1,Über ihren Leitantrag zur Flüchtlingspolitik...,über ihr Leitantrag zu Flüchtlingspolitik --...,850,3,2,1,3,3,...,2,8,15,34,4,5,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23324,BildOnline_05092016_1748Uhr581.xml,0,Der Leiter des Koordinierungsstabs Flüchtlinge...,der Leiter der Koordinierungsstab Flüchtling i...,68,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
23325,FAZfaz_21122015_FDA201512214747193.xml,1,Vom Kampf gegen Mietwucher und Wohnungen für F...,von Kampf gegen Mietwucher und Wohnung für Flü...,432,4,1,2,2,1,...,1,1,4,0,0,1,0,0,0,0
23326,FAZfaz_10032016_FD3201603104805469.xml,1,"Die Balkan-Route ist geschlossen , die Schenge...",der Balkan-Route sein schließen -- der Schenge...,1093,3,2,2,1,2,...,5,5,33,13,0,14,0,2,0,0
23327,BildOnline_14072016_1614Uhr127.xml,0,Deutschland hat 2015 die höchste Zuwanderung s...,Deutschland haben 2015 der hoch Zuwanderung se...,91,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# German stop words from the NLTK stopwords corpus, collected into a set.
stops = set(stopwords.words('german'))
# string.punctuation is a single str, so `word in puncts` is a substring test
# (it is also true for the empty string).
puncts = string.punctuation

In [6]:
def clear_stopwords_and_puncts(texts):
    """Tokenise each text on single spaces and drop uninformative tokens.

    A token is removed when it
      * is contained in the module-level ``stops`` collection,
      * is contained in the module-level ``puncts`` string (a substring
        test, so the empty tokens produced by consecutive spaces are
        removed as well),
      * is a plain number such as ``2015``, ``11.`` or ``2,5``, or the
        ``--`` token,
      * starts with ``NEWSPAPER-NAME``.

    Args:
        texts: Iterable of strings, one per document.

    Returns:
        A list with one list of kept tokens per input text, in input order.
    """
    # Raw string: in a normal literal "\d" and "\." are invalid escape
    # sequences, which newer Python versions warn about. Compiling once also
    # avoids re-resolving the pattern for every token.
    number_or_dash = re.compile(r"(\d+[\.,]?\d*|--)")

    all_cleaned_texts = []
    for text in texts:
        all_cleaned_texts.append([
            word
            for word in text.split(" ")
            if word not in stops
            and word not in puncts
            and number_or_dash.fullmatch(word) is None
            and not word.startswith("NEWSPAPER-NAME")
        ])

    return all_cleaned_texts

In [7]:
# Split the corpus by label; the variable names map 0 -> BILD, 1 -> FAZ, 2 -> SZ.
label_column = df["label"]
df_bild = df[label_column == 0]
df_faz = df[label_column == 1]
df_sz = df[label_column == 2]

# Clean the preprocessed text of every newspaper with the same filter.
bild_text, faz_text, sz_text = (
    clear_stopwords_and_puncts(frame["text_preprocessed"])
    for frame in (df_bild, df_faz, df_sz)
)

# Tokenised documents keyed by the newspaper name used in the model file names.
text_dict = dict(BILD=bild_text, FAZ=faz_text, SZ=sz_text)

# Document count per newspaper, one per line.
print(len(bild_text), len(faz_text), len(sz_text), sep="\n")

12107
6686
4536


# 2. Train embeddings

In [8]:
# Embedding dimensionality; passed to Word2Vec as vector_size and embedded in the saved model file names.
VECTOR_SIZE = 300
# Passed to Word2Vec as min_count.
MIN_COUNT = 3
# Passed to Word2Vec as seed.
SEED = 42

In [9]:
# Train one Word2Vec model per newspaper and save it under ../output/word2vec.
output_dir = "../output/word2vec"
# Create the target directory up front so that model.save() does not fail
# on a fresh checkout where ../output/word2vec does not exist yet.
os.makedirs(output_dir, exist_ok=True)

for label, text in text_dict.items():
    # Fit a phrase (bigram) model on this corpus and train on the transformed sentences.
    bigram_transformer = Phrases(text)
    # A single worker plus a fixed seed is used for repeatable runs.
    # NOTE(review): the gensim docs also mention PYTHONHASHSEED for full determinism — confirm if exact reproducibility matters.
    model = Word2Vec(bigram_transformer[text], min_count=MIN_COUNT, vector_size=VECTOR_SIZE, workers=1, seed=SEED)
    model.save(os.path.join(output_dir, label+"_"+str(VECTOR_SIZE)+"d_allyears.model"))
    print(label, ": Done with training.")

BILD : Done with training.
FAZ : Done with training.
SZ : Done with training.


# 3. Add refugee-centroid

In [10]:
# Word forms whose vectors are averaged into the refugee centroid below;
# forms missing from a model's vocabulary are skipped there.
fluechtling_keywords = [
    "Flüchtling",
    "Geflüchtete", "Geflüchteten", "Geflüchteter", "Geflüchtetem",
    "Migrant", "Migrantin", "Migranten",
    "Asylant", "Asylanten",
    "Asylwerber", "Asylwerberin",
    "Asylbewerber", "Asylbewerberin",
    "Asylsuchende", "Asylsuchenden", "Asylsuchender", "Asylsuchendem",
]

In [11]:
# Saved models of the three newspapers, in the order BILD, FAZ, SZ.
# The "300d" part has to match the VECTOR_SIZE the models were trained with.
word2vec_model_paths = [
    f"../output/word2vec/{newspaper}_300d_allyears.model"
    for newspaper in ("BILD", "FAZ", "SZ")
]

In [12]:
# Add a "<FLÜCHTLING_CENTROID>" vector (mean of the refugee keyword vectors)
# to every trained model and save the result under save_path.
save_path = "../output/word2vec_with_refugee_centroid"
# Create the target directory up front so that model.save() does not fail
# when it does not exist yet.
os.makedirs(save_path, exist_ok=True)

for path in word2vec_model_paths:
    # NOTE(review): these files were written by Word2Vec.save() and the loaded
    # object is used as a full model (it exposes .wv) — confirm that
    # KeyedVectors.load is the intended loader here.
    model = KeyedVectors.load(path)
    vocab = model.wv.index_to_key

    # Look keywords up in the key -> index dict instead of scanning the vocab list.
    key_to_index = model.wv.key_to_index
    keyword_vectors = [model.wv[k] for k in fluechtling_keywords if k in key_to_index]

    # Fail with a clear message instead of an opaque numpy error when no
    # keyword is part of this model's vocabulary.
    if not keyword_vectors:
        raise ValueError(f"None of the refugee keywords occur in the vocabulary of {path}")

    # One row per found keyword; the width follows the model's own vector size.
    keyword_matrix = np.stack(keyword_vectors)
    keyword_centroid = np.mean(keyword_matrix, axis=0)
    model.wv.add_vector("<FLÜCHTLING_CENTROID>", keyword_centroid)

    # Keep the original file name, only the directory changes.
    model.save(os.path.join(save_path, os.path.basename(path)))

print("Done with adding keyword-centroid.")



Done with adding keyword-centroid.
