
# Code for Master's Thesis: Topic Modeling

## Research Questions

1. Welche Themen können mithilfe von Topic Modeling aus den DHd-Abstracts
der Tagungen zwischen 2014 und 2023 gefunden werden?

2. Welche Themen kommen häufig gemeinsam in einem Dokument vor und weisen
daher eine hohe Themenähnlichkeit (topic similarity) auf?

3. Wie haben sich die Themenschwerpunkte im Verlauf der Jahre verändert -
welche Trends sind zu erkennen?

4. Welche Entwicklungen sind in Bezug auf die Verwendung verschiedener Forschungsmethoden festzustellen?

5. Welche Personen sind besonders häufig mit Abstracts vertreten, in welchen
Autor:innenteams treten sie auf und wie verändern sich diese im Zeitverlauf?

6. Welche Personencluster sind in Bezug auf die Themenschwerpunkte zu erkennen und wie verändern sich diese?

### Necessary imports

In [2]:
#Reading in necessary pdf- and xml-files
import zipfile
from bs4 import BeautifulSoup
import PyPDF2
from io import BytesIO

#(pre)processing the files
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
from langdetect import detect
from gensim.models import TfidfModel

#LDA
import gensim
import gensim.corpora as corpora

#Evaluation
from gensim.models import CoherenceModel

#Visualisation
import pyLDAvis
import pyLDAvis.gensim_models

  from .autonotebook import tqdm as notebook_tqdm


### Preprocessing function: eliminating non-German texts from corpus

In [3]:
def detect_language(text):
    
    #gets text as input
    lang = detect(text)

    #returns the language tag of detected language
    return lang        

### Preprocessing function: cleaning the texts

In [4]:
def clean_text(text):
    
    # lowering text case
    # clean = text.lower()
    
    #filtering weblinks
    clean = re.sub('http(.*?) ', '', str(text))
    
    # filtering numbers
    clean = re.sub(r'\d', '', clean)
    
    # filtering paragraphs
    clean = re.sub(r'\n', '', clean)
    
    # filtering markup from XML
    clean = re.sub(r'<(.*?)>', '', clean)

    # filtering punctuation
    punctuation = '''!“()´`¨[]{};:'"\,<>./?@#$%^&*_~'''
    for word in clean:
        if word in punctuation:
            clean = clean.replace(word, "")
            
    
    # filtering abbreviations
    clean = re.sub('bspw', '', clean)
    clean = re.sub('sog', '', clean)
    clean = re.sub('zb', '', clean)
    clean = re.sub('ua', '', clean)
    clean = re.sub('vgl', '', clean)

    return clean

  punctuation = '''!“()´`¨[]{};:'"\,<>./?@#$%^&*_~'''


### Preprocessig function: PDF-specific cleaning steps

In [5]:
def pdf_specific_clean(text):
    
    # processing German Umlaute correctly
    clean = re.sub(" ¨a", "ä", text)
    clean = re.sub(" ¨u", "ü", clean)
    clean = re.sub(" ¨o", "ö", clean)
    clean = re.sub("¨a", "ä", clean)
    clean = re.sub("¨u", "ü", clean)
    clean = re.sub("¨o", "ö", clean)
    
    # PDF-specific substitution
    clean = re.sub("”", " ", clean)
    #clean = re.sub("-", "", clean)

    # return cleaned-up text
    return clean

### Preprocessing function: removing stopwords and very short/long words

In [6]:
def remove_stopwords(text):
    
    # import German stopword list 
    stops = set(stopwords.words("german"))
    
    # convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long, no deaccentation (by default)
    words = gensim.utils.simple_preprocess(text)
    
    # filter stopwords
    words_filtered = []
    for w in words:
        if w not in stops:
            words_filtered.append(w)
    
    # return list of words that are NOT stopwords
    return words_filtered


### Preprocessing function: (morpho-syntactic) lemmatization
Hint: 'de_core_news_md' model has to be downloaded via pip beforehand

In [7]:
def lemmatization(texts):
    
    # only words tagged as nouns, verbs, adjectives and adverbs should be considered
    allowed_tags = ['NOUN', 'VERB', 'ADJ', 'ADV']

    # disabling parser and ner-tool to accelerate computing 
    nlp = spacy.load('de_core_news_md', disable=['parser', 'ner'])
    texts_out = []
    for text in texts:
        doc = nlp(text)
        new_text = []
        for token in doc:
            if token.pos_ in allowed_tags:
                new_text.append(token.lemma_.lower())
        # delete all empty sets where the pos-tag was not in allowed list
        if new_text != []:        
            final = " ".join(new_text)
            texts_out.append(final)
    
    # return list of lemmatized words
    return (texts_out)
    

### Function: Extracting Keywords from XML-File
- extracts tags \<keywords n="topics" scheme="ConfTool"> and \<keywords n="keywords" scheme="ConfTool"> to get keywords of the texts
- checks validity of keywords

In [9]:
def extract_keywords(xmldata):
    
    # finds all tags <keywords n="keywords"> and <keywords n="topics">, removes all tags within
    keywords = str(soup.find_all("keywords", {"n": ("keywords", "topics")}))
    keywords = re.sub("<(.*?)>", "", keywords)
    keywords = keywords.split("\n")

    # filters keywords shorter than 2 letters
    for item in keywords:
        if len(item) <= 2 and item in keywords:
            keywords.remove(item)
            
    # returns list
    return keywords

### Function: Counting number of extracted keywords
- function creates dictionary from the list of keywords
- counts how often each method is used
- returns the dictionary

In [10]:
def count_methods(keywords, methods_dict):
    
    # for each item in keyword list, check if it is alredy in dictionary
    # if not, add and set count to 1, if yes add +1 to count
    for item in keywords:
        if item not in methods_dict.keys():
            methods_dict[item] = 1
        else:
            methods_dict[item] += 1
    # sort dictionary according to highest count in the values
    sorted_dict = sorted(methods_dict.items(), key=lambda x: x[1], reverse=True)

    # return the sorted dictionary
    return sorted_dict

### Function: Extracting the author names
Extracts the names of the authors and returns a list of lists containing the names of the single texts' authors

In [27]:
def extract_authors(all_authors, title_stmt):
    
    # navigating to the title statement and finding all tags <author>
    authors = title_stmt.find_all("author")
    fore_and_surnames = []
    
    # extracting the <surname> and <forename> tags and cleaning the outcome from the tags and the brackets
    for element in authors:
        names = element.find_all(['surname', 'forename'])
        names =  re.sub("<(.*?)>", "", str(names))
        names = re.sub("</(.*?)>", "", str(names))
        names = re.sub(r'\]', "", names)
        names = re.sub(r'\[', "", names)
        fore_and_surnames.append(names)
    all_authors.append(fore_and_surnames)
    
    return all_authors

### Function: Extracting text from XML-files

In [11]:
def extract_xml_text(soup):
    
    # extract <p> tags from body of xml-document to find the actual text 
    document_body = soup.body
    p_tags = document_body.find_all("p")
    
    # return the text from p-tags
    return p_tags

### Functions: Making bigrams and trigrams

In [8]:
def make_bigrams(texts, bigram):
    return([bigram[doc] for doc in texts])

def make_trigrams(texts, trigram,bigram):
    return ([trigram[bigram[doc]] for doc in texts])

In [12]:
def create_bigrams_trigrams(texts):
    
    """ ADJUST MIN COUNT!! """
    bigram_phrases = gensim.models.Phrases(texts, min_count=2, threshold=100)
    trigram_phrases = gensim.models.Phrases(bigram_phrases[texts], threshold=100)

    bigram = gensim.models.phrases.Phraser(bigram_phrases)
    trigram = gensim.models.phrases.Phraser(trigram_phrases)

    data_bigrams = make_bigrams(texts, bigram)
    data_bigrams_trigrams = make_trigrams(data_bigrams, trigram, bigram)

    return data_bigrams_trigrams

### Function: Creating bag of words

In [13]:
def create_bow(data_words): 
    
    # mapping the documents' words to a dictionary   
    id2word = corpora.Dictionary(data_words)

    # creating a bag of words by using index of dictionary
    bag_of_words_corpus = []
    for text in data_words:
        new = id2word.doc2bow(text)
        bag_of_words_corpus.append(new)

    # returning id2word-reference as well as bag of word itself, both needed for LDA    
    return id2word, bag_of_words_corpus
    

### Function: TF-IDF weighting

In [14]:
def tf_idf(id2word, texts):
    # simple bow for each document, containing tuples with (index, number of appearances of the word in the document)
    corpus = [id2word.doc2bow(text) for text in texts]

    # calculates term frequency (TF) weighted by the inverse document frequency (IDF) for every word/index in the bag of words
    tfidf = TfidfModel(corpus, id2word=id2word)

    # low_value as threshold
    low_value = 0.03
    words  = []
    words_missing_in_tfidf = []

    # for every single bag of words
    for i in range(0, len(corpus)):
        # consider each bow for each document
        bow = corpus[i]
        low_value_words = [] #reinitialize to be safe. You can skip this.
        
        # for each tuple (index, tfidf-value) in the bag of words, extract index (tfidf_ids)
        tfidf_ids = [id for id, value in tfidf[bow]]
        
        # for each tuple (index, bow-value without tfidf), extract index
        bow_ids = [id for id, value in bow]
        
        # if the value in the (index, tfidf-value) tuple is lower than 0.03, put id into list low_value_words
        low_value_words = [id for id, value in tfidf[bow] if value < low_value]
        
        drops = low_value_words+words_missing_in_tfidf
        
        # which words will be deleted from the bow?
        for item in drops:
            words.append(id2word[item])
    
        words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf score 0 will be missing
        
        # add words which indexes are not in low_value_words and not in words_missing_in_tfidf to the new bag of words 
        new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
        
        # new bow is missing certain indexes
        corpus[i] = new_bow
    
    return corpus

## Main Code:

Reading in zip-files of DHd-conferences (where only PDF-files are accessible) 

In [15]:
filenames = ['Testfile-pdf.zip']
#filenames = ['DHd_2014.zip', 'DHd_2015.zip']

all_pdf_texts = []
for conference_file in filenames:
    archive = zipfile.ZipFile(conference_file, 'r')
    for name in archive.namelist():
        if name[-4:] == '.pdf':
            # Reading the PDF-files from the zip-archive            
            PDF_read = PyPDF2.PdfReader(BytesIO(archive.read(name)))
            # using doc_length to iterate over all pages of each abstract, using doc_text to generate one list of words per abstract (not one per page)
            doc_length = len(PDF_read.pages)
            doc_text = " "
            for i in range(doc_length):
                page_text = PDF_read.pages[i]
                page_text = page_text.extract_text()
                doc_text = str(doc_text) + str(page_text)
            doc_text = pdf_specific_clean(doc_text)
            all_pdf_texts.append(doc_text)

                


Reading in the zip-files of the DHd-Conferences (where XML-files were published)

In [16]:
filenames = ['Testfile-xml.zip']
# filenames = ['DHd_2016.zip', 'DHd_2017.zip', 'DHd_2018.zip', 'DHd_2019.zip', 'DHd_2020.zip',
#              'DHd_2022.zip', 'DHd_2023.zip',]

all_xml_files = []
for conference_file in filenames:
    archive = zipfile.ZipFile(conference_file, 'r')
    # read in all files in the zip-file and check that they are xml-files
    for name in archive.namelist():
        if name[-4:] == '.xml' and not name[-9:] == 'final.xml':
            all_xml_files.append(archive.read(name))

XML-Files: Extracting the keywords given in the metadata of the abstracts in order to find the scientific methods used

In [33]:
# Creating dictionary to count how often each method is used
methods_dict = {}
all_xml_texts = []
author_names = []

for item in all_xml_files:
    
    soup = BeautifulSoup(item, 'xml')
    
    """Code for extracting the keywords used in xml-files"""
    
    keywords = extract_keywords(soup)
    methods_used = count_methods(keywords, methods_dict)
    # print("methods used", methods_used)
    
    
    """Code for extracting the actual text from xml-files"""
    
    xml_text = extract_xml_text(soup)
    all_xml_texts.append(xml_text)


    """Code for extracting the author names"""

    # extract author names
    title_stmt = soup.titleStmt
    authors = extract_authors(author_names, title_stmt)
print(authors) 


[['Wissik, Tanja', 'Krek, Simon', 'Jakubicek, Milos', 'Tiberius, Carole', 'Navigli, Roberto', 'McCrae, John', 'Tasovac, Toma', 'Varadi, Tamas', 'Koeva, Svetla', 'Costa, Rute', 'Kernerman, Ilan', 'Monachini, Monica', 'Trap-Jensen, Lars', 'Pedersen, Bolette S.', 'Hildenbrandt, Vera', 'Kallas, Jelena', 'Porta-Zamorano, Jordi'], ['Wissik, Tanja', 'Resch, Claudia'], ['Zirker, Angelika'], ['Zirker, Angelika', 'Bauer, Matthias', 'Kirchhoff, Leonie', 'Lahrsow, Miriam']]


Merging the PDF texts and the XML texts for further processing

In [16]:
whole_texts = all_pdf_texts + all_xml_texts
list_all_texts = []
for text in whole_texts:
    lang = detect_language(str(text))
    if lang == 'de':
        text_item = clean_text(text)
        text_item = remove_stopwords(text_item)
        ''' EVTL Prozess nur einmal, damit nicht so lange '''
        text_item = lemmatization(text_item)
        list_all_texts.append(text_item)
    else:
        next

Finding bigrams and trigrams

In [20]:
# creating bigrams and trigrams from lemmatized words
data_bigrams_trigrams = create_bigrams_trigrams(list_all_texts)


[['jahrestagung', 'digital', 'raum', 'vortrag', 'vortrag', 'diskussiontitel', 'frauenfrag', 'gegenstand', 'ntrovers', 'kommunika', 'tion', 'umkreis', 'erster', 'frauenbewegung', 'ressource', 'tersuchung', 'wolffstiftung', 'archiv', 'deutsch', 'kasseltel', 'kasseldekerstin', 'historikerin', 'arbeitsschw', 'erpunkt', 'liegen', 'bereichen', 'schen', 'frauenforschung', 'erforschung', 'r', 'frauenbewegung', 'leiten', 'stiftung', 'archiv', 'deutsch', 'geykenberlin', 'akademie', 'wissenschaft', 'digital', 'wörterbuch', 'deutsch', 'berlintel', 'geyk', 'arbeitsstellen', 'dwds', 'bbaw', 'liegen', 'bereichen', 'lexikographie', 'puslinguistik', 'gloninginstitut', 'istik', 'jlu', 'gießenotto', 'straße', 'gießentel', 'giessendethomas', 'ermanistik', 'fachlich', 'bereichen', 'textanalyse', 'semantik', 'ganisation', 'geschich', 'hinaus', 'digital', 'text', 'nutzung', 'infrastruktur', 'anwendbar', 'keit', 'reichweiten', 'digital', 'ressource', 'gegenstand', 'kommunikation', 'umkreis', 'sten', 'frauenbe

Creating id2word and bag of words

In [28]:
# id2word as dictionary where every word is referenced with id
id2word = corpora.Dictionary(data_bigrams_trigrams)
# rename variable
texts = data_bigrams_trigrams

# corpus as dictionary that contains a list of tuples for each document, tuples contain (id, no. of appearances of the word
# some index numbers are missing due to the tf-idf weighting 
corpus = tf_idf(id2word, texts)


[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 6), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (17, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (34, 3), (35, 2), (36, 4), (37, 6), (38, 1), (40, 1), (41, 1), (43, 1), (46, 1), (47, 1), (48, 1), (50, 1), (51, 1), (52, 2), (53, 2), (54, 1), (56, 4), (57, 1), (58, 1), (59, 1), (60, 1), (61, 3), (62, 1), (63, 3), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 4), (71, 1), (73, 3), (74, 1), (75, 2), (76, 1), (77, 1), (78, 3), (79, 1), (80, 1), (81, 1), (82, 2), (83, 1), (84, 3), (85, 1), (86, 1), (87, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 2), (101, 1), (102, 1), (103, 1), (105, 1), (106, 1), (108, 1), (109, 3), (110, 3), (111, 1), (112, 1), (113, 1), (114, 1), (115, 1), (116, 7), (118, 1), (119, 1), (120, 1), (122, 4), (124, 1), (125, 1), (127, 1), (128,

Creating the LDA model

In [None]:
''' FIND THE PERFECT PARAMETERS
random state:
update_every
chunksize: how many documents are processed at once? Higher: speed up process
passes: iterations
'''
lda_model = gensim.models.ldamodel.LdaModel(corpus=bow,
                                           id2word=id2word,
                                           num_topics=7,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto')

### Evaluation baseline: Topic coherence to evaluate the model's quality

In [277]:
# Compute baseline coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=list_all_texts, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.3721166451208203


### Tuning (hyper-)parameters: Find optimal settings for topic number, alpha and beta

In [None]:
''' FILL IN '''

### Visualisation of final topic modeling with optimal parameters

In [224]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, bag_of_words_corpus, id2word, mds="mmds", R=30)
# vis

print(lda_model.print_topics())

[(0, '0.008*"kunstgeschicht" + 0.008*"quellen" + 0.007*"system" + 0.006*"text" + 0.006*"keazor" + 0.006*"film" + 0.005*"annotation" + 0.005*"digital" + 0.005*"musikvideo" + 0.005*"möglichkeit"'), (1, '0.001*"digital" + 0.001*"text" + 0.001*"annotation" + 0.001*"analyse" + 0.001*"lexikalisch" + 0.001*"historisch" + 0.001*"system" + 0.001*"social" + 0.001*"ressource" + 0.001*"erster"'), (2, '0.046*"annotation" + 0.018*"text" + 0.017*"ambiguität" + 0.014*"digital" + 0.011*"teasys" + 0.008*"erklärenden" + 0.007*"literarischer" + 0.007*"textes" + 0.006*"vgl" + 0.006*"information"'), (3, '0.001*"annotation" + 0.001*"digital" + 0.001*"text" + 0.001*"ambiguität" + 0.001*"teasys" + 0.001*"social" + 0.001*"erklärenden" + 0.001*"etwa" + 0.001*"vgl" + 0.001*"neu"'), (4, '0.018*"infrastruktur" + 0.015*"daten" + 0.013*"elexis" + 0.010*"fördern" + 0.010*"partner" + 0.008*"bereits" + 0.008*"bereich" + 0.008*"austausch" + 0.008*"wörterbücher" + 0.008*"zugang"'), (5, '0.029*"digital" + 0.013*"text" + 0.