# Code for Master's Thesis: Preprocessing

## Research Questions

1. Which topics can be found in the abstracts from DHd-conferences between 2014 and 2023 with Topic Modeling?

2. Which topics frequently appear together in the same abstract and therefore have a high topic similarity?

3. How have the topics been changing throughout the years - which trends are perceptible?

4. With regard to the use of different scientific methods, which developments are perceptible?

5. Which researchers contribute to the conference particularly frequently with abstracts, in which teams do they contribute and how have the teams been changing?

6. Which clusters of researchers can be found with regard to topics and how have the clusters been changing?

### Imports

In [79]:
#Reading in necessary pdf- and xml-files
import zipfile
from bs4 import BeautifulSoup
import fitz
from io import BytesIO

#(pre)processing the files
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy
from HanTa import HanoverTagger as ht
from langdetect import detect
from gensim.models import TfidfModel
import pickle
import pandas as pd
import gensim
import gensim.corpora as corpora
import os
import numpy as np

### General functions: opening lists, saving and reopening objects

In [80]:
def open_list(doc_name):
    """Read a UTF-8 text file and return its content split on ``", "``.

    Parameters:
        doc_name: path of the text file to read.

    Returns:
        list of str: the pieces of the file content that were separated by
        a comma followed by a space.
    """
    # the context manager closes the file even if reading or decoding fails
    with open(doc_name, "r", encoding='utf-8') as f:
        return f.read().split(", ")

In [81]:
def save_object(dirname, filename, varname):
    """Pickle an object into the file ``dirname + filename``.

    Parameters:
        dirname: directory part of the target path; it is concatenated with
            ``filename`` as-is, so it has to end with a path separator.
        filename: name of the pickle file to write.
        varname: the object to serialize.
    """
    path = dirname + filename
    # the context manager closes (and flushes) the file even if pickling fails
    with open(path, 'wb') as f:
        pickle.dump(varname, f)

In [82]:
def open_variable(dirname, filename):
    """Load and return the object pickled in the file ``dirname + filename``.

    Parameters:
        dirname: directory part of the path; converted with ``str`` and
            concatenated with ``filename`` as-is.
        filename: name of the pickle file to read.

    Returns:
        The unpickled object.
    """
    path = str(dirname) + str(filename)
    # NOTE: unpickling can execute arbitrary code - only open files that were
    # written by this project (e.g. with save_object), never untrusted ones.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [83]:
def check_directory(directory_name):
    """Create ``directory_name`` if it does not exist yet.

    Missing parent directories are created as well, so nested paths such as
    ``'Figures/RQ1'`` work even when ``'Figures'`` itself is still missing.
    A message is printed only when a directory was actually created.

    Parameters:
        directory_name: path of the directory that has to exist afterwards.
    """
    if os.path.isdir(directory_name):
        return

    # makedirs (unlike mkdir) also creates the intermediate directories
    os.makedirs(directory_name, exist_ok=True)
    print('Created new directory: ', directory_name)

### Preprocessing
#### Extracting Text from XML-Files

In [5]:
def extract_xml_text(soup):
    """Return all <p> elements found inside the <body> of a parsed XML document.

    Parameters:
        soup: parsed document that exposes a ``body`` attribute.

    Returns:
        The result of ``find_all("p")`` on the document body, i.e. the
        paragraphs that carry the actual document text.
    """
    document_body = soup.body
    paragraphs = document_body.find_all("p")
    return paragraphs

#### Determining Text Language

In [6]:
def detect_language(text):
    """Return the language that ``langdetect.detect`` reports for ``text``.

    Parameters:
        text: the text whose language should be determined.

    Returns:
        Whatever ``detect`` returns; the rest of this file compares the
        value against the codes ``'de'`` and ``'en'``.
    """
    language_code = detect(text)
    return language_code

#### Cleaning Texts

Removing new lines, weblinks, digits, markup and punctuation. Further using gensim.utils.simple_preprocess which eliminates tokens shorter than 3 or longer than 30 characters, and returns lower-cased items.

In [7]:
def clean_text(text):
    """Strip noise from a raw text and return it as a list of lowercase tokens.

    Steps, in this order:
      1. convert ``text`` to ``str`` and delete all newline characters,
      2. delete web links, digits, markup tags and hyphens,
      3. delete the punctuation characters listed below,
      4. tokenize with ``gensim.utils.simple_preprocess`` (lowercases and
         drops tokens shorter than 3 or longer than 30 characters).

    Parameters:
        text: the raw text; any object is accepted because it is passed
            through ``str`` first.

    Returns:
        list of str: the cleaned tokens.
    """
    cleaned = re.sub(r'\n', "", str(text))

    sequences_to_remove = [r'http(.*?) ', r'\d', r'<(.*?)>', r'https(.*?) ', r'www(.*?) ', '-']
    for pattern in sequences_to_remove:
        cleaned = re.sub(pattern, '', cleaned)

    # filtering punctuation in a single pass instead of one str.replace per character
    punctuation = '''!“()´`¨[]{}\\;:”",<>/.?@#$%^&*_~'''
    cleaned = cleaned.translate(str.maketrans('', '', punctuation))

    # convert a document into a list of lowercase tokens, ignoring tokens that are too short (min_len=3) or too long (max_len=30), no deaccentation (by default)
    return gensim.utils.simple_preprocess(cleaned, min_len=3, max_len=30)

#### Lemmatization with Part-of-Speech Tagging

Lemmatizing words tagged with certain part of speech and according to the detected language. Hanover Tagger is used since it lemmatized better than spaCy tagger, especially on German words.

In [8]:
# Hanover Tagger model file per supported language code
_HANTA_MODEL_FILES = {'de': 'morphmodel_ger.pgz', 'en': 'morphmodel_en.pgz'}

# taggers that have already been created, so each model file is read only
# once instead of on every call of lemmatization()
_HANTA_TAGGERS = {}

# only tokens tagged with one of these part-of-speech tags are kept
_ALLOWED_POS_TAGS = frozenset([
    'NN', 'NE', 'ADJ(A)', 'ADJ(D)', 'VV(INF)', 'VV(FIN)', 'VV(PP)',
    'VA(INF)', 'VA(FIN)', 'VM(FIN)', 'VM(INF)',
])


def _get_hanta_tagger(language):
    """Return the cached Hanover Tagger for a supported language code."""
    if language not in _HANTA_TAGGERS:
        _HANTA_TAGGERS[language] = ht.HanoverTagger(_HANTA_MODEL_FILES[language])
    return _HANTA_TAGGERS[language]


def lemmatization(texts, language):
    """Lemmatize the tokens of one document and keep only the allowed POS tags.

    Parameters:
        texts: iterable of tokens (strings) of a single document.
        language: language code of the document; ``'de'`` and ``'en'`` are
            supported.

    Returns:
        list of str: the lowercased lemmas of all tokens whose tag is in the
        allowed set. For any other language code an empty list is returned
        (previously this raised an UnboundLocalError), which matches what
        remove_stopwords returns for unsupported languages.
    """
    if language not in _HANTA_MODEL_FILES:
        return []

    tagger = _get_hanta_tagger(language)

    lemmatized_text = []
    for token in texts:
        # analyze() yields the lemma at index 0 and the POS tag at index 1
        tagged_token = tagger.analyze(token)
        if tagged_token[1] in _ALLOWED_POS_TAGS:
            lemmatized_text.append(tagged_token[0].lower())

    return lemmatized_text

#### Stopword Removal

Removing stopwords contained in standard lists for German and English, as well as from own list set up especially for the corpus used.

In [9]:
def remove_stopwords(text, language, additional_stops):
    """Remove stop words from a tokenized document.

    Parameters:
        text: iterable of tokens of a single document.
        language: language code of the document; ``'de'`` selects the NLTK
            list ``'german'``, ``'en'`` the list ``'english'``.
        additional_stops: iterable of extra stop words that are added to the
            standard list.

    Returns:
        list of str: the tokens that are not stop words. For any other
        language code the result is an empty list (unchanged behaviour).
    """
    nltk_list_names = {'de': 'german', 'en': 'english'}
    if language not in nltk_list_names:
        return []

    # build only the stop word set that is actually needed, once per call
    stop_words = set(stopwords.words(nltk_list_names[language]))
    stop_words.update(additional_stops)

    return [w for w in text if w not in stop_words]

### Extract List Items from Textfile After Manual OCR-Postprocessing

In [10]:
def post_processing_ocr(manually_corrected_file):
    """Turn the manually corrected text dump back into a list of token lists.

    The input is expected to look like the string representation of a list
    of lists. It is split at every ``]``; list separators, opening brackets,
    the byte order mark and single quotes (straight and curly) are removed
    from each chunk, which is then split on ``", "``.

    Parameters:
        manually_corrected_file: content of the corrected file as one string.

    Returns:
        list of list of str: one token list per chunk, without the last two
        chunks, which are not text.
    """
    # applied one after the other to every chunk
    patterns_to_strip = (r', \[', r'\ufeff', r'\[', r'\'', r'\’', r'\‘')

    def tokens_of(chunk):
        for pattern in patterns_to_strip:
            chunk = re.sub(pattern, '', chunk)
        return chunk.split(', ')

    chunks = manually_corrected_file.split(r']')
    all_token_lists = [tokens_of(chunk) for chunk in chunks]

    # cutting off the last two items as they are not text
    return all_token_lists[:-2]

#### Creating N-Grams

Creating n-grams by using classes *Phrases* and *Phraser* provided by *gensim.models*. Those classes identify phrases within the texts which qualify for n-grams, given the minimum count and the threshold. *create_bigrams* and *create_trigrams* then return the actual n-grams to the text basis. 

In [11]:
def create_bigrams(texts, bigram):
    """Apply the bigram model to every document.

    Parameters:
        texts: iterable of documents.
        bigram: object that is indexed with a document (``bigram[doc]``).

    Returns:
        list: ``bigram[doc]`` for every document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def create_trigrams(texts, trigram, bigram):
    """Apply the bigram model and then the trigram model to every document.

    Parameters:
        texts: iterable of documents.
        trigram: object that is indexed with the bigram output.
        bigram: object that is indexed with a document.

    Returns:
        list: ``trigram[bigram[doc]]`` for every document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

In [12]:
def create_bigrams_trigrams(texts):
    """Train phrase models on the documents and return them with n-grams merged.

    A bigram model (min_count 8, threshold 100) is trained on ``texts``; a
    trigram model (threshold 100) is trained on the bigram output. Both are
    wrapped in ``Phraser`` objects and applied to every document.

    Parameters:
        texts: list of tokenized documents.

    Returns:
        list: the documents after applying the bigram and trigram models.
    """
    bigram_model = gensim.models.Phrases(texts, min_count = 8, threshold = 100)
    trigram_model = gensim.models.Phrases(bigram_model[texts], threshold = 100)

    frozen_bigram = gensim.models.phrases.Phraser(bigram_model)
    frozen_trigram = gensim.models.phrases.Phraser(trigram_model)

    docs_with_bigrams = [frozen_bigram[doc] for doc in texts]

    # as before, the bigram model is applied a second time to the documents
    # that already contain bigrams, before the trigram model is applied
    return [frozen_trigram[frozen_bigram[doc]] for doc in docs_with_bigrams]

#### TF-IDF Weighting

TF-IDF weighting eliminates terms which are very frequent, i.e. ubiquitous, in the documents and therefore might not be very important to the texts' meaning. The threshold determines which TF-IDF value the terms have to overcome in order to not be removed. 

In [13]:
def tf_idf(id2word, texts, threshold=0.03):
    """Build a bag-of-words corpus and drop terms with a low TF-IDF value.

    Parameters:
        id2word: dictionary object providing ``doc2bow`` and id -> term
            lookup via ``id2word[term_id]``.
        texts: iterable of tokenized documents.
        threshold: terms whose TF-IDF value in a document is below this
            value are removed from that document's bag of words
            (default 0.03, the value that was hard-coded before).

    Returns:
        tuple: ``(corpus, deleted_words)`` where ``corpus`` is a list with
        one filtered bag of words (list of ``(id, count)`` tuples) per
        document and ``deleted_words`` lists the removed terms, one entry
        per removal (so a term can appear several times).
    """
    # simple bag of words for each document, containing tuples with (index, number of appearances of the word in the document)
    corpus = [id2word.doc2bow(text) for text in texts]

    # calculates term frequency (TF) weighted by the inverse document frequency (IDF) for every word/index in the bag of words
    tfidf = TfidfModel(corpus, id2word=id2word)

    filtered_corpus = []
    deleted_words = []

    for bow in corpus:
        # ids whose TF-IDF value in this document is below the threshold
        below_threshold_ids = [term_id for term_id, value in tfidf[bow] if value < threshold]

        # which words will be deleted from the bow?
        deleted_words.extend(id2word[term_id] for term_id in below_threshold_ids)

        # NOTE(review): ids that tfidf[bow] does not return at all are kept.
        # gensim's TfidfModel presumably omits (near-)zero weights, i.e. terms
        # occurring in every document - confirm that keeping them is intended.
        ids_to_drop = set(below_threshold_ids)
        filtered_corpus.append([entry for entry in bow if entry[0] not in ids_to_drop])

    return filtered_corpus, deleted_words

==========================================================================================================================

==========================================================================================================================

==========================================================================================================================

### Answering the Research Questions
#### Extracting Conference Names from Zip-Files

*get_conference_names* extracts the conference name from the zip-file name so that the proper names can be used for referencing the conferences, e.g. in visualizations.

In [14]:
def get_conference_names(list):
    """Derive readable conference names from the zip-file paths.

    For every path the part before ``'.zip'`` is taken, the part after
    ``'Corpus/'`` is kept and underscores are replaced with spaces, e.g.
    ``'Corpus/DHd_2014.zip'`` becomes ``'DHd 2014'``.

    Parameters:
        list: iterable of zip-file paths containing ``'Corpus/'``
            (the parameter name shadows the builtin and is kept for
            compatibility with existing callers).

    Returns:
        list of str: one conference name per path.

    Raises:
        IndexError: if a path does not contain ``'Corpus/'``.
    """
    return [
        element.split('.zip')[0].split('Corpus/')[1].replace('_', ' ')
        for element in list
    ]

#### Extracting Author Names
*extract_authors* finds the names of the authors and returns a list of lists containing the names of the single texts' authors

In [15]:
def extract_authors(title_stmt):
    """Return the names of all authors listed in a title statement.

    Parameters:
        title_stmt: element providing ``find_all``; all ``<author>``
            elements below it are inspected.

    Returns:
        list of str: one string per author, built from the string
        representation of the author's <surname>/<forename> elements with
        the markup and the surrounding square brackets removed.
    """
    # applied in this order to the string form of the name elements
    patterns_to_strip = ("<(.*?)>", "</(.*?)>", r'\]', r'\[')

    fore_and_surnames = []
    for author_tag in title_stmt.find_all("author"):
        rendered = str(author_tag.find_all(['surname', 'forename']))
        for pattern in patterns_to_strip:
            rendered = re.sub(pattern, "", rendered)
        fore_and_surnames.append(rendered)

    return fore_and_surnames

#### Extracting Keywords from XML-File

- *extract_keywords*

extracts tags \<keywords n='topics' scheme='ConfTool'> and \<keywords n='keywords' scheme='ConfTool'> to get keywords of the texts

- *remove_markup_and_short_keywords*

cleans keywords from xml-markup and removes those shorter than three characters

- *remove_invalid_keywords*

checks the validity of the keywords from \<keywords n='topics'> by checking whether each keyword can be found in the list of predetermined keywords

In [16]:
def remove_markup_and_short_keywords(keywords):
    """Strip XML markup from a keyword string and drop very short entries.

    Parameters:
        keywords: string containing the keyword markup, one keyword per line.

    Returns:
        list of str: the lines of the markup-free string that are longer
        than two characters.
    """
    without_markup = re.sub("<(.*?)>", "", keywords)

    # build a new list instead of removing items from the list that is being
    # iterated - removing while iterating skips the element after each removal
    return [item for item in without_markup.split("\n") if len(item) > 2]

In [17]:
def remove_invalid_keywords(keywords_conflist, conf_tool_methods):
    """Keep only the keywords that are contained in ``conf_tool_methods``.

    The list is filtered in place (callers may rely on the mutation and
    ignore the return value) and is returned as well.

    Parameters:
        keywords_conflist: list of keywords to validate; modified in place.
        conf_tool_methods: collection of valid keywords.

    Returns:
        list: the same list object, now containing only valid keywords.
    """
    # slice assignment replaces the content in one step; removing items while
    # iterating over the same list would skip the element after each removal
    keywords_conflist[:] = [item for item in keywords_conflist if item in conf_tool_methods]

    return keywords_conflist

In [18]:
def extract_keywords(xmldata, conf_tool_methods):
    """Extract the free and the predetermined keywords of one document.

    Parameters:
        xmldata: parsed XML document providing ``find_all``.
        conf_tool_methods: list of valid entries for the keywords taken from
            ``<keywords n="topics">``.

    Returns:
        tuple: ``(keywords_freely_selectable, keywords_conflist)`` - the
        cleaned keywords from ``<keywords n="keywords">`` and the cleaned,
        validated keywords from ``<keywords n="topics">``.
    """
    # string form of all matching <keywords> elements, markup still included
    free_markup = str(xmldata.find_all('keywords', n='keywords'))
    topics_markup = str(xmldata.find_all('keywords', n='topics'))

    keywords_freely_selectable = remove_markup_and_short_keywords(free_markup)

    # only the keywords from n="topics" are checked against the given list
    keywords_conflist = remove_invalid_keywords(
        remove_markup_and_short_keywords(topics_markup), conf_tool_methods
    )

    return keywords_freely_selectable, keywords_conflist

==========================================================================================================================

==========================================================================================================================

==========================================================================================================================

## Main Code:

Creating folders in which variables, models and figures can be saved later

In [77]:
# folders for the pickled variables and the trained models
for folder in ('Variables/', 'Models/'):
    check_directory(folder)

# one figure folder per research question
rqs = ['RQ1', 'RQ2', 'RQ3', 'RQ4', 'RQ5', 'RQ6']
for section in rqs:
    check_directory('Figures/' + section)

Reading in zip-files of DHd-conferences where only PDF-files are accessible

In [20]:
filenames_pdf = ['Corpus/DHd_2014.zip', 'Corpus/DHd_2015.zip']
# number of archive members per conference (all members, not only the documents used)
document_statistics = []

# extracting text from pdf-files
all_pdf_texts = []
doc_names_pdf = []

# one counter of English texts per conference: 2 PDF-based + 7 XML-based conferences
count_english_texts = [0] * 9
year_index = 0

for conference_file in filenames_pdf:
    doc_names_year = []
    # the context manager closes the zip archive when the conference is done
    with zipfile.ZipFile(conference_file, 'r') as archive:
        member_names = archive.namelist()
        document_statistics.append(len(member_names))
        for name in member_names:
            if not name.endswith('.pdf'):
                continue
            doc_names_year.append(name)
            pdf_data = BytesIO(archive.read(name))
            # reading each pdf-file in the zip-archive, page by page
            with fitz.open(stream=pdf_data, filetype='pdf') as doc:
                text = ''.join(page.get_text() for page in doc)

            all_pdf_texts.append(text)
            # detecting text language here for statistics
            lang = detect_language(text)
            if lang == 'en':
                count_english_texts[year_index] += 1

    doc_names_pdf.append(doc_names_year)
    year_index += 1


# from here on filenames_pdf holds the readable conference names instead of the paths
filenames_pdf = get_conference_names(filenames_pdf)

Reading in the zip-files of the DHd-Conferences where XML-files were published

In [21]:
filenames_xml = ['Corpus/DHd_2016.zip', 'Corpus/DHd_2017.zip', 'Corpus/DHd_2018.zip', 'Corpus/DHd_2019.zip', 'Corpus/DHd_2020.zip',
             'Corpus/DHd_2022.zip', 'Corpus/DHd_2023.zip']

# raw content of the XML files, one list per conference
all_xml_files = []
# names of the XML files, one list per conference
doc_names_xml = []
# read in all zip-folders
for conference_file in filenames_xml:
    doc_names_year = []
    xml_per_year = []
    # the context manager closes the zip archive when the conference is done
    with zipfile.ZipFile(conference_file, 'r') as archive:
        member_names = archive.namelist()
        document_statistics.append(len(member_names))
        # read in all files in the zip-file and check that they are xml-files
        # exclude final.xml since those are not documents about a presentation
        for name in member_names:
            if name.endswith('.xml') and not name.endswith('final.xml'):
                xml_per_year.append(archive.read(name))
                doc_names_year.append(name)
    all_xml_files.append(xml_per_year)
    # creating a list of all documents' names
    doc_names_xml.append(doc_names_year)


docnames = doc_names_pdf + doc_names_xml
# from here on filenames_xml holds the readable conference names instead of the paths
filenames_xml = get_conference_names(filenames_xml)
filenames = filenames_pdf + filenames_xml

XML-Files: 

The XML-files are not only used for text extraction, but since they contain a lot of information due to the extensive markup, some other information will be extracted from the files in the following steps:
- Text 
- Authors of the documents
- Keywords given in the metadata of the abstracts in order to find the scientific methods used

In [22]:
# <p> elements of every XML document, in reading order across all years
all_xml_texts = []

# list of selectable options; extract_keywords uses it to validate the
# keywords taken from <keywords n='topics'>
list_predetermined_keywords = open_list('Misc/predetermined_keywords.txt')

# all distinct freely selectable keywords over all years (order of first appearance)
all_freely_selectable_keywords = []
# one list per year, each holding the keywords of all documents of that year
used_keywords_freely_selectable = []
used_keywords_predetermined = []
# one list per year, each holding one author list per document
authors = []
# NOTE(review): never filled in this cell - confirm whether it is populated elsewhere
authors_full_list = []

for year in all_xml_files:
    keywords_freely_selectable_year = []
    keywords_predetermined_year = []
    authors_year = []

    for doc in year:
        soup = BeautifulSoup(doc, 'xml')

        # the actual text of the document
        xml_text = extract_xml_text(soup)
        all_xml_texts.append(xml_text)

        # language is detected here only for the corpus statistics;
        # year_index continues from the cell that read the PDF archives
        lang = detect_language(str(xml_text))
        if lang == 'en':
            count_english_texts[year_index] += 1

        # author names from the title statement
        authors_in_doc = extract_authors(soup.titleStmt)
        authors_year.append(authors_in_doc)

        # keywords of this document, collected per year
        keywords_freely_selectable, keywords_predetermined = extract_keywords(soup, list_predetermined_keywords)
        keywords_freely_selectable_year.extend(keywords_freely_selectable)
        keywords_predetermined_year.extend(keywords_predetermined)

    # merge this year's free keywords into the overall list, dropping duplicates
    merged_keywords = all_freely_selectable_keywords + keywords_freely_selectable_year
    all_freely_selectable_keywords = list(dict.fromkeys(merged_keywords))
    used_keywords_predetermined.append(keywords_predetermined_year)
    used_keywords_freely_selectable.append(keywords_freely_selectable_year)

    # saves each text's authors in a list, sorted by year of the text
    authors.append(authors_year)
    year_index += 1

Merging the extracted PDF and XML texts for further processing of the textual content:

- Determining text language
- Cleaning the text from unnecessary contents
- Lemmatizing the texts depending on the detected language (English or German) --> time-consuming step
- Removing stopwords depending on the detected language (English or German)

In [23]:
# Time-consuming step: Do not execute if you have the variables stored!
# PDF texts first, then XML texts - the same order as in docnames
whole_texts = all_pdf_texts + all_xml_texts

additional_stopwords = open_list('Misc/additional_stopwords.txt')

list_all_texts = []
for text in whole_texts:
    # the language is detected on the raw text, before cleaning, and decides
    # which tagger and which stop word list are used
    lang = detect_language(str(text))
    # clean -> lemmatize -> remove stop words
    text = remove_stopwords(lemmatization(clean_text(text), lang), lang, additional_stopwords)
    list_all_texts.append(text)

save_object('Variables/', 'list_all_texts.pckl', list_all_texts)

Writing a file *to_correct_ocr.txt* which contains all preprocessed texts (PDF- and XML-based), so that possible OCR mistakes from the PDF extraction can be manually post-processed/cleaned.

In [24]:
# the token lists are written as their Python list representation
serialized_texts = str(list_all_texts)
with open('Misc/to_correct_ocr.txt', 'w', encoding='utf-8') as f:
    f.write(serialized_texts)

Opening the post-processed file and eventually bringing all texts together again in variable *corr_list_of_texts*

In [1]:
# opening the manually post-processed file and reading it as one string
with open('Misc/corrected_text.txt', 'r', encoding='utf-8') as f:
    manually_corrected_file = f.read()

# parsing the string back into a list of token lists (one list per text)
corrected_list_of_texts = post_processing_ocr(manually_corrected_file)

Creating bigrams and trigrams, id2word and corpus, which are necessary for the actual topic modeling algorithm (LDA). For transparency and control purposes, the terms deleted by tf-idf weighting are written into a file and stored.

In [65]:
# creating bigrams and trigrams from lemmatized words
data_bigrams_trigrams = create_bigrams_trigrams(corrected_list_of_texts)

# id2word as dictionary where every word/bi-/trigram is referenced with id
id2word = corpora.Dictionary(data_bigrams_trigrams)

# corpus is a list with one bag of words per document; each bag of words is a list of tuples (word id, no. of appearances of the word)
# some word ids are missing from the bags of words because tf_idf removed the terms below its threshold
corpus, deleted_words = tf_idf(id2word, data_bigrams_trigrams)

# for transparency: store the terms that were removed by the tf-idf weighting
with open('Misc/tfidf_deleted_terms.txt', 'w', encoding='utf-8') as f:
    f.write(str(deleted_words))  

To save time by not having to execute time-consuming steps every time again, important variables are saved and can be reopened in the following two cells. 

In [67]:
# writing the files to save the variables: (file name, object) pairs
objects_to_save = (
    ('corrected_list_of_texts.pckl', corrected_list_of_texts),
    ('data_bigrams_trigrams.pckl', data_bigrams_trigrams),
    ('id2word.pckl', id2word),
    ('corpus.pckl', corpus),
)
for pickle_name, obj in objects_to_save:
    save_object('Variables/', pickle_name, obj)

In [23]:
# opening the saved variables for reuse; the files are read in the listed order
corpus, id2word, corrected_list_of_texts, data_bigrams_trigrams = [
    open_variable('Variables/', pickle_name)
    for pickle_name in (
        'corpus.pckl',
        'id2word.pckl',
        'corrected_list_of_texts.pckl',
        'data_bigrams_trigrams.pckl',
    )
]

For information and transparency purposes: Saving general information on the corpus, creating a DataFrame from it and exporting it as csv-file

In [55]:
# how many texts are in each year's corpus taken into account?
# (one count per conference: PDF-based conferences first, then XML-based ones)
number_pdf_docs = list(map(len, doc_names_pdf))
number_xml_docs = list(map(len, doc_names_xml))
number_docs = number_pdf_docs + number_xml_docs

In [72]:
# corpus statistics: one row per measure, one column per conference year
year_columns = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2022', '2023']
row_labels = ["Total No. of Documents", "No. of Documents After Filtering", "Documents in English"]
statistics = pd.DataFrame(
    [document_statistics, number_docs, count_english_texts],
    index=row_labels,
    columns=year_columns,
)

# sum over all years, then transpose so that years (and 'Total') become rows
statistics['Total'] = statistics.sum(axis=1)
statistics = statistics.T

# share of English texts among the documents that were actually used
english_share = (100/statistics['No. of Documents After Filtering'])*statistics['Documents in English']
statistics['% of English Texts'] = round(english_share, 2)

statistics.to_csv('Figures/Statistics_Corpus.csv')

Saving some variables so that they can be used in the *MA_TopicModeling* file

In [26]:
# IPython %store magic: persists the variables so that another notebook can restore them
%store number_pdf_docs
%store number_xml_docs
%store number_docs
%store docnames
%store filenames_xml
%store filenames_pdf
%store filenames
%store all_freely_selectable_keywords
%store used_keywords_freely_selectable
%store used_keywords_predetermined
%store authors
# NOTE(review): authors_full_list is initialised as an empty list and not filled in the cells shown here - confirm that storing it is intended
%store authors_full_list

Stored 'number_pdf_docs' (list)
Stored 'number_xml_docs' (list)
Stored 'number_docs' (list)
Stored 'docnames' (list)
Stored 'filenames_xml' (list)
Stored 'filenames_pdf' (list)
Stored 'filenames' (list)
Stored 'all_freely_selectable_keywords' (list)
Stored 'used_keywords_freely_selectable' (list)
Stored 'used_keywords_predetermined' (list)
Stored 'authors' (list)
Stored 'authors_full_list' (list)
