# Assessing the Linguistic Complexity of German Abitur Texts from 1963–2013
## 2: Perplexity

Author: Noemi Kapusta

## Structure:

1. Import Libraries and Functions
1. Edit the Abitur Texts
1. POS-Tagging
    - 3.1 Tag the Abitur Texts
    - 3.2 Tag the Newspaper Texts
1. Compute Perplexity
1. References

## 1. Import Libraries and Functions 
- libraries: someweta, nltk.tokenize, nltk.data, pandas, numpy
- imported module: functions.py
    - provides the function `get_filenames`, which returns the file names of the dev- or testset
    - Author: Matilda Schauf

In [1]:
import sys, os, random
import itertools

import pandas as pd
import numpy

from someweta import ASPTagger
# we use the model "german_newspaper_2020-05-28.model"
someweta_model = "/tmp/german_newspaper_2020-05-28.model"

import nltk.data
from nltk.tokenize import word_tokenize

In [2]:
# Make the project-local helper module in "src" importable from this notebook.
sys.path.insert(0, "src")

import functions
from functions import get_filenames

# Get devset data
# (the first, commented-out line uses the full split file; the active line
# uses the demo split file)
#dev_filenames = get_filenames("src/dataSplits.csv", test=False)
dev_filenames = get_filenames("src/demo_dataSplits.csv", test=False)

# Get testset data
# NOTE(review): get_filenames is defined outside this notebook; presumably it
# returns a list of file names for the requested split - confirm in functions.py
#test_filenames = get_filenames("src/dataSplits.csv", test=True)
test_filenames = get_filenames("src/demo_dataSplits.csv", test=True)

## 2. Edit the Abitur Texts
- read files in and keep only the "FORM"-column
- save the changed files as txt-files in the folder "data_tmp/perplex/tokens/"

In [3]:
import os

def save_data(filenames, path_name):
    """Function that reads in the files and stores only their FORM-column
    as txt-files in "data_tmp/perplex/tokens/".

    The first line of every input file must declare the tab-separated columns
    ("# global.columns = ..."). Only rows whose UEBERSCHRIFT value is "0" are
    kept (UEBERSCHRIFT is German for "heading"; presumably "0" marks body
    text - confirm against the data format).

    Input: filenames (list): names of the dev- or test-files
           path_name (str): folder with the data, including the trailing
                            separator (it is concatenated with the file name)
    Output: out_path (str): file path of the folder with the stored txt-files"""
    # Resolve and create the target folder once, so that it exists and can be
    # returned even when `filenames` is empty.
    out_path = os.path.curdir + "/data_tmp/perplex/tokens/"
    os.makedirs(out_path, exist_ok=True)

    for filename in filenames:
        # path to the annotated source file
        source_path = path_name + filename

        # Take the column names from the first line of the file
        with open(source_path, "r", encoding="UTF-8") as file:
            header_line = file.readline()
        column_names = header_line.replace("# global.columns =", "").strip().split()

        # quoting=3 is csv.QUOTE_NONE: quote characters are ordinary token text.
        # NOTE(review): comment="#" also cuts off a data row at any "#" it
        # contains - verify that no token in the corpus includes this character.
        df = pd.read_csv(source_path, comment="#", sep="\t", quoting=3,
                         header=None, names=column_names)

        # keep only the rows with UEBERSCHRIFT == "0" and only the column "FORM"
        df = df.astype({"UEBERSCHRIFT": str}, errors="raise")
        df = df[df.UEBERSCHRIFT == "0"]
        df = df[["FORM"]]

        # to_string() renders an empty frame as an "Empty DataFrame ..." summary,
        # which would be stored (and later tagged) as if it were text.
        # Write an empty file in that case instead.
        tokens_as_text = "" if df.empty else df.to_string(header=False, index=False)

        with open(out_path + filename, mode="w", encoding="UTF-8") as outfile:
            outfile.write(tokens_as_text)

    return out_path

In [4]:
# Folder with the annotated Abitur texts; the trailing "/" is required because
# save_data concatenates it directly with the file names.
path_name = "data/"
# Only the testset files are converted here; path_abitexts is the folder in
# which save_data stored the token files.
path_abitexts = save_data(test_filenames, path_name)

## 3. POS-Tagging
- tag the files with the Someweta POS-Tagger
- we run the POS tagger with the model created by SoMeWeTa from German newspapers called "german_newspaper_2020-05-28" 
- Learn more about this tagger at the following link: https://github.com/tsproisl/SoMeWeTa

### 3.1 Tag the Abitur Texts
- tag the Abitur Text files with their POS
- the POS-Tags of a sentence are stored in one line

In [5]:
import os
someweta_model = "/tmp/german_newspaper_2020-05-28.model"

def tag_abitexts(file):
    """Function that reads all token files of a folder, POS-tags them and saves
    only the tags, one sentence per line, in "data_tmp/perplex/tagged/".

    Lines starting with "<B->" lose that prefix; lines starting with "<E->",
    "<EMPTY>" or "<I->" are dropped; every other line is used as a token.

    Input: file (str) = path of the folder with the token files, including the
                        trailing separator (it is concatenated with the names)
    Output: out_path (str) = path to the folder where the tags are saved"""

    # Run the POS tagger with the model created by SoMeWeTa from German newspapers
    newstagger = ASPTagger()
    newstagger.load(someweta_model)

    # The sentence splitter does not depend on the file, so load it only once.
    tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

    # List the input before the output folder is created, then resolve the
    # output folder once so it can be returned even if there is no input file.
    file_names = os.listdir(file)
    out_path = os.path.curdir + "/data_tmp/perplex/tagged/"
    os.makedirs(out_path, exist_ok=True)

    dropped_markers = ("<E->", "<EMPTY>", "<I->")

    for file_name in file_names:
        with open(file + file_name, mode="r", encoding='UTF8') as infile:
            corpusliste = infile.readlines()

        # delete or change the unused tokens
        tokens = list()
        for element in corpusliste:
            # the token files pad their lines with spaces, so strip them
            element = element.replace("\n", "").strip(" ")
            if element[:4] == "<B->":
                tokens.append(element[4:])
            elif element.startswith(dropped_markers):
                continue
            else:
                tokens.append(element)
        # every token is followed by exactly one space
        corpustext = "".join(token + " " for token in tokens)

        # tokenize corpustext in sentences, the sentences in tokens, and tag
        # these with the POS-tagger
        # NOTE(review): word_tokenize is called with its default language
        # ("english") although the text is German - confirm this is intended.
        newstagged_liste = [
            newstagger.tag_sentence(word_tokenize(sent))
            for sent in tokenizer.tokenize(corpustext)
        ]

        # one line per sentence and only store the tags in a new document
        pos_tags = "".join(
            "".join(tok[1] + " " for tok in sent) + "\n"
            for sent in newstagged_liste
        )

        with open(out_path + file_name, mode="w", encoding='UTF-8') as outfile:
            outfile.write(pos_tags)

    return out_path

In [6]:
# Tag every token file written by save_data; abitexts_tagged is the folder
# that holds the resulting POS-tag files.
abitexts_tagged = tag_abitexts(path_abitexts)

### 3.2 Tag the Newspaper Texts

- tag the newspaper files (ZEIT, Express) with their POS
- the POS-Tags of a sentence are stored in one line
- save results in separate directories

In [7]:
import os
someweta_model = "/tmp/german_newspaper_2020-05-28.model"

def tag_articles(file, filelist, name):
    """Function that reads the selected files of a folder, POS-tags them and
    saves only the tags, one sentence per line.

    Input:  file (str) = path of the folder with the files to be read in,
                         including the trailing separator
            filelist (list) = list with the filenames to be tagged
            name (str) = name of the folder (relative to the current directory)
                         where the POS-Tags of the articles are to be saved
    Output: out_path (str) = path to the folder where the tags are saved"""

    # Run the POS tagger with the model created by SoMeWeTa from German newspapers
    # Learn more about this tagger at the following link: https://github.com/tsproisl/SoMeWeTa
    newstagger = ASPTagger()
    newstagger.load(someweta_model)

    # The sentence splitter does not depend on the file, so load it only once.
    tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')

    file_names = os.listdir(file)

    # Use the `name` argument (not a global variable) for the target folder and
    # resolve it once, so it can be returned even if no file of the list exists.
    out_path = os.path.curdir + "/" + name
    if not out_path.endswith("/"):
        # the file name is appended by plain concatenation below
        out_path += "/"
    os.makedirs(out_path, exist_ok=True)

    # Open and read in articles from the list
    for file_name in file_names:
        if file_name not in filelist:
            continue

        with open(file + file_name, mode="r", encoding='UTF8') as infile:
            corpustext = infile.read()

        # tokenize corpustext in sentences, the sentences in tokens, and tag
        # these with the POS-tagger
        # NOTE(review): word_tokenize is called with its default language
        # ("english") although the text is German - confirm this is intended.
        newstagged_liste = [
            newstagger.tag_sentence(word_tokenize(sent))
            for sent in tokenizer.tokenize(corpustext)
        ]

        # one line per sentence and only store the tags in a new document
        pos_tags = "".join(
            "".join(tok[1] + " " for tok in sent) + "\n"
            for sent in newstagged_liste
        )

        with open(out_path + file_name, mode="w", encoding='UTF-8') as outfile:
            outfile.write(pos_tags)

    return out_path

In [8]:
#for ZEIT articles
data_path = "data/"

files = ["zeit_1.spl", "zeit_2.spl", "zeit_3.spl"]
foldername = "data_tmp/perplex/tagged/zeit/"
os.makedirs(foldername, exist_ok=True)  
zeit_tagged = tag_articles(data_path, files, foldername)

#for Express articles
foldername = "data_tmp/perplex/tagged/express/"
files = ["express_1.spl", "express_2.spl", "express_3.spl"]
os.makedirs(foldername, exist_ok=True)  
express_tagged = tag_articles(data_path, files, foldername)

## 4. Compute Perplexity
- Compute perplexity according to Jurafsky and Martin (2022)
- Training:
    - Train one model with the ZEIT articles
    - Train another model with the Express articles
- Testing:
    - Use the Abitur texts per year
    - Use 5000 Bigrams divided evenly among all texts per year
    - Compute perplexity for each text, and then the mean and the standard deviation per year
- The lower the perplexity value, the better the model predicts the testset

In [9]:
def create_model(traincorpus):
    """Function that reads all files of a folder and returns their tokens,
    with every non-empty line wrapped in the sentence markers <s> and </s>
    Input: traincorpus (str): path of the folder with the training data,
                              including the trailing separator
    Output: padded_tokens (list): list with the padded tokens"""

    # collect the lines (one sentence each) of every file in the folder
    sentences = []
    for entry in os.listdir(traincorpus):
        with open(traincorpus + entry, mode="r", encoding='UTF8') as handle:
            sentences.extend(handle.readlines())

    # Padding: skip empty lines, put <s> before and </s> after each sentence
    padded_tokens = []
    for raw_line in sentences:
        stripped = raw_line.strip()
        if not stripped:
            continue
        padded_tokens.append("<s>")
        padded_tokens.extend(stripped.split())
        padded_tokens.append("</s>")

    return padded_tokens

In [10]:
def count_words(textlist):
    """Function that counts the tokens of a text, leaving out the
    sentence-start marker "<s>" (the end marker "</s>" is counted)
    Input: textlist (list) = List with the tokens to be counted
    Output: (int) = Number of words N of the text"""

    # every token that is not "<s>" contributes one to the total
    return sum(1 for token in textlist if token != "<s>")

In [11]:
def count_types(textlist):
    """Function that counts how often every word type occurs
    Input: textlist (list) = List with the tokens to be counted
    Output: type_counts (dict) = Dictionary: key = word type; value = frequency"""

    type_counts = {}

    for token in textlist:
        if token in type_counts:
            # type already seen: raise its frequency
            type_counts[token] += 1
        else:
            # first occurrence of this type
            type_counts[token] = 1

    return type_counts

In [12]:
def create_bigrams(textlist):
    """Function that builds the bigrams of a text and counts how often every
    bigram type occurs
    Input: textlist (list) = List with the tokens to create the bigrams from
    Output: bigram_counts (dict) = Dictionary: key = bigram (tuple of two
                                   tokens); value = frequency"""

    # the pair that spans a sentence boundary is not counted
    sentence_break = ("</s>", "<s>")

    bigram_counts = {}

    # pair every token with its successor
    for pair in zip(textlist, textlist[1:]):
        if pair == sentence_break:
            continue
        bigram_counts[pair] = bigram_counts.get(pair, 0) + 1

    return bigram_counts

In [13]:
def perplexity(testbigr, trainbigr, trainunigr, numberwords):
    """Function that calculates the perplexity of a bigram model on a test set.
    Test bigrams that do not occur in the training set are skipped; they add
    nothing to the log probability (no smoothing is applied).
    input: testbigr(dict): Dictionary with bigram types + frequencies from the test set
            trainbigr(dict): Dictionary with bigram types + frequencies from the training set
            trainunigr(dict): Dictionary with unigram types + frequencies from the training set
            numberwords(int): number of words of the test set
    output: perplexity: Perplexity of the model"""

    log_prob_sum = 0

    # sum up the log probabilities, weighted by the frequency in the test set
    for pair, test_count in testbigr.items():
        if pair not in trainbigr:
            continue
        # relative frequency: count(w1 w2) / count(w1) in the training set
        relative_freq = trainbigr[pair] / trainunigr[pair[0]]
        log_prob_sum += numpy.log(relative_freq) * test_count

    # exp(-log P / N) equals P to the power of -1/N
    return numpy.exp(-log_prob_sum / numberwords)

In [14]:
# train models
# Each model consists of the padded tag sequence of the training folder, its
# unigram frequencies and its bigram frequencies.

# Model 1 ZEIT:
# read text and padding
model1 = create_model(zeit_tagged)
# count words of the first model (every token except "<s>"):
# NOTE(review): model1_words is not used in the cells of this notebook shown here
model1_words = count_words(model1)
# count wordtypes of the first model (dict: tag -> frequency)
m1_wordtypes = count_types(model1)
# count bigramtypes of the first model (dict: (tag, tag) -> frequency)
m1_bigram_types = create_bigrams(model1)

# Model 2 Express:
# read text and padding
model2 = create_model(express_tagged)
# count words of the second model (every token except "<s>")
# NOTE(review): model2_words is not used in the cells of this notebook shown here
model2_words = count_words(model2)
# count wordtypes of the second model (dict: tag -> frequency)
m2_wordtypes = count_types(model2)
# count bigramtypes of the second model (dict: (tag, tag) -> frequency)
m2_bigram_types = create_bigrams(model2)

In [15]:
# Testing with tagged abitexts
# For every year: compute the perplexity of both models for each tagged Abitur
# text, then store all values, their mean and their standard deviation.

# Testset per year:
#yearlist = ["1963", "1968", "1974", "1978", "1983", "1988", "1993", "1998", "2003", "2008", "2013"]
yearlist = ["1963", "2013"]
allppl_zeit_list = list()
allppl_expr_list = list()
meanppl_zeit = list()
meanppl_expr = list()
stdppl_zeit = list()
stdppl_expr = list()

# The folder content does not change while testing, so list it only once.
tagged_files = os.listdir(abitexts_tagged)

for year in yearlist:
    # all tagged Abitur texts whose file name contains the year
    year_files = [file_name for file_name in tagged_files if year in file_name]
    artcounter = len(year_files)
    if artcounter == 0:
        # fail with a clear message instead of a division by zero below
        raise ValueError("no tagged Abitur texts found for year " + year)

    # calculate how many bigrams per text must be used as a test set, so that
    # 5000 bigrams are used per year
    bigr_n = round(5000 / artcounter)
    print("bigr_n:", bigr_n)
    print("articlenumber:", artcounter)

    # perplexity values of this year for the ZEIT and the Express model
    ppl_zeit_list = list()
    ppl_expr_list = list()

    for file_name in year_files:
        with open(abitexts_tagged + file_name, mode="r", encoding='UTF8') as infile:
            corplist = infile.readlines()

        # Padding: skip empty lines and add <s> and </s> at the beginning and
        # ending of a sentence
        corpuslist = list()
        for sentence in corplist:
            sentence = sentence.strip()
            if not sentence:
                continue
            corpuslist.append("<s>")
            corpuslist.extend(sentence.split())
            corpuslist.append("</s>")

        # count words
        test_words = count_words(corpuslist)
        # create bigrams
        test_bigrams = create_bigrams(corpuslist)

        # islice stops silently at the end of a short input, so the shortage
        # has to be checked explicitly to be reported.
        if len(test_bigrams) < bigr_n:
            print("nicht genügend bigrams:", len(test_bigrams))

        # take the first bigr_n bigram types (in order of first occurrence)
        # NOTE(review): this selects bigram *types*, while test_words counts
        # all tokens of the text - confirm that this normalisation is intended.
        testbigr = dict(itertools.islice(test_bigrams.items(), bigr_n))

        # Perplexity for model 1
        model1_ppl = perplexity(testbigr, m1_bigram_types, m1_wordtypes, test_words)
        ppl_zeit_list.append(model1_ppl)

        # Perplexity for model 2
        model2_ppl = perplexity(testbigr, m2_bigram_types, m2_wordtypes, test_words)
        ppl_expr_list.append(model2_ppl)

    allppl_zeit_list.append(ppl_zeit_list)
    allppl_expr_list.append(ppl_expr_list)
    meanppl_zeit.append(numpy.mean(ppl_zeit_list))
    meanppl_expr.append(numpy.mean(ppl_expr_list))
    stdppl_zeit.append(numpy.std(ppl_zeit_list))
    stdppl_expr.append(numpy.std(ppl_expr_list))

bigr_n: 2500
articlenumber: 2
bigr_n: 2500
articlenumber: 2


In [16]:
# create dataframes 
# dictionary for every category: one entry per year in each list
ppl = {'YEAR': yearlist, 
       'all_ppl_ZEIT': allppl_zeit_list, 
       'all_ppl_Express': allppl_expr_list, 
       'mean_ppl_ZEIT': meanppl_zeit, 
       'mean_ppl_Express': meanppl_expr, 
       'std_ppl_ZEIT': stdppl_zeit, 
       'std_ppl_Express': stdppl_expr}
df_ppl = pd.DataFrame(ppl)
# widen the columns so that the lists with all perplexity values are not cut off
pd.set_option("max_colwidth", 500)
# display is provided by the notebook environment (IPython), not imported here
display(df_ppl)

# dataframe to csv-file
# The path must end with "/" because the file name is appended by plain
# concatenation below; the alternative paths are written accordingly.
#out_path = "results/2_perplex/dev_results/"
#out_path = "results/2_perplex/test_results/"
out_path = "results/2_perplex_demo/"
import os
os.makedirs(out_path, exist_ok=True)  

df_ppl.to_csv(out_path + "perplexity.csv", sep=",")

Unnamed: 0,YEAR,all_ppl_ZEIT,all_ppl_Express,mean_ppl_ZEIT,mean_ppl_Express,std_ppl_ZEIT,std_ppl_Express
0,1963,"[5.933789693637949, 5.523221692800902]","[6.199482361595224, 5.558098097603365]",5.728506,5.87879,0.205284,0.320692
1,2013,"[5.957410118790915, 5.806513552218744]","[6.063355038057216, 5.676248957629498]",5.881962,5.869802,0.075448,0.193553


## 5. References

- Daniel Jurafsky and James H. Martin. 2022. Speech and Language Processing.

