In [None]:
'''
Compound splitter
'''

# https://github.com/dtuggener/CharSplit
# Download the CharSplit sources and copy them next to this notebook so that
# `char_split` and `ngram_probs` can be imported as top-level modules.
# Import the submodule explicitly: a bare "import urllib" does not guarantee
# that urllib.request has been loaded.
import urllib.request
url = "https://github.com/dtuggener/CharSplit/archive/master.zip"
filename = "CharSplit.zip"
urllib.request.urlretrieve(url, filename)

import zipfile
# The context manager closes the archive even if extraction fails.
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()

import os, sys
from shutil import copyfile
# A dedicated loop variable keeps `filename` (the zip path) from being overwritten.
for extracted_name in os.listdir("CharSplit-master"):
    extracted_path = os.path.join("CharSplit-master", extracted_name)
    # copyfile only handles regular files; skip sub-directories instead of crashing.
    if extracted_name != "README.md" and os.path.isfile(extracted_path):
        copyfile(extracted_path, extracted_name)
    
import char_split, ngram_probs
def compoundSplit(string):
    """Recursively split ``string`` into its compound parts.

    Only the first candidate returned by ``char_split.split_compound`` is
    used. It is indexed as ``[value, left_part, right_part]``; the value is
    presumably the split score (confirm against CharSplit). When that value
    is 0 or less the word is returned unsplit as a one-element list,
    otherwise both parts are split again and the results are concatenated
    left to right.
    """
    best = char_split.split_compound(string)[0]
    if best[0] <= 0:
        return [string]
    left_part, right_part = best[1], best[2]
    return compoundSplit(left_part) + compoundSplit(right_part)
    

In [None]:
'''
Functions for obtaining the morphological features.  Goals:

• Derivational
- hand-compiled list of derivational suffixes by Hancke et al. (2012)

• Inflectional
- mood, person, tense, type of verbs
- case of nouns

• Compound words

'''

from nltk import word_tokenize
from collections import defaultdict, Counter, deque
import random
import numpy as np
import pandas as pd
import urllib.request
import io
import ast

url = "https://raw.githubusercontent.com/soerenetler/TextDifficultyAssessmentGerman/master/preprocessed_text_df.csv"
df = pd.read_csv(url, sep="|", lineterminator='\n')

# Keep only rows whose Level is one of the six known labels. The raw data also
# contains "B1 " (trailing space); it is let through here and normalised to
# "B1" on the next line.
df = df[df["Level"].isin(["A1", "A2", "B1", "B1 ", "B2", "C1", "C2"])]
df = df.replace("B1 ", "B1")

def get_all_features(df):
    """Return every distinct tag found in ``df['RFTagger']`` as a feature name.

    Each cell is parsed as the string form of a nested list
    (sentences -> words -> ``[word, tag, tag, ...]``). The first element of
    every word entry is dropped and each remaining tag is prefixed with
    ``"MO-"``. The returned names are used as the columns of the feature
    vector.

    Args:
        df: DataFrame with an 'RFTagger' column of stringified nested lists.

    Returns:
        List of unique ``"MO-<tag>"`` names in order of first appearance.
    """
    all_features = []
    seen = set()  # O(1) duplicate check; the list keeps first-seen order
    for all_sentences in df.loc[:, 'RFTagger']:
        # Missing cells arrive from pandas as NaN (a float), not None, so test
        # for the type we can actually parse.
        if not isinstance(all_sentences, str):
            continue
        # Drop the trailing 5 characters (the empty last sentence) and re-close
        # the outer list so literal_eval sees a well-formed list.
        all_sentences = ast.literal_eval(all_sentences[:-5] + "]")
        for sentence in all_sentences:
            for word_features in sentence:
                # Skip the first element (the word itself); the rest are tags.
                for token in word_features[1:]:
                    feature = "MO-" + token
                    if feature not in seen:
                        seen.add(feature)
                        all_features.append(feature)
    return all_features

def get_classifications(df):
    """Return the 'Level' value of every row of ``df`` as a plain list, in row order."""
    return list(df.loc[:, 'Level'])

def add_morph_columns(df_document):
    """Add the ratio feature columns to ``df_document`` in place, filled with zeros.

    Columns are created in the order listed below, one zero per existing row.
    "MO-derived2nouns" is intentionally not created (feature disabled).
    """
    ratio_columns = (
        "MO-inf2verbs",
        "MO-part2verbs",
        "MO-imp2verbs",
        "MO-1st2finverbs",
        "MO-2nd2finverbs",
        "MO-3rd2finverbs",
        "MO-subj2finverbs",
        "MO-finverbs2verbs",
        "MO-modverbs2verbs",
        "MO-auxverbs2verbs",
        "MO-verbspersent",
        "MO-nom2nouns",
        "MO-gen2nouns",
        "MO-dat2nouns",
        "MO-acc2nouns",
        "MO-keit2nouns",
        "MO-ung2nouns",
        "MO-werk2nouns",
        "MO-compounds2nouns",
    )
    for column in ratio_columns:
        df_document[column] = np.zeros(len(df_document))
    
# Derivational noun suffixes (hand-compiled list). Only "keit", "ung" and "werk"
# currently feed features; the full list is kept for the disabled
# "MO-derived2nouns" feature.
_DERIVATIONAL_SUFFIXES = (
    "ant", "anten", "antin", "antinnen", "arium", "arien", "ast", "asten", "astin", "astinnen",
    "at", "ate", "ator", "atoren", "atorin", "atorinnen", "atur", "aturen", "ei", "eien", "er",
    "erin", "erinnen", "ent", "ents", "enz", "enzen", 'eur', 'eure', 'eurin', 'eurinnen', 'heit',
    'heiten', 'ist', 'isten', 'istin', 'istinnen', 'ion', 'ionen', 'ismus', 'ismen', 'ität',
    'itäten', 'keit', 'keiten', 'ling', 'lingen', 'nis', 'nisse', 'schaft', 'schaften', 'tum',
    'tümer', 'ung', 'ungen', 'ur', 'werk', 'werke', 'wesen',
)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


def fill_morph_features(df_document, word_list, morph_dict, index):
    """Write the morphological ratio features of one document into ``df_document``.

    Args:
        df_document: DataFrame that already has the "MO-..." ratio columns;
            row ``index`` is updated in place.
        word_list: Word forms of the document, in order. Sentences are counted
            as ``1 + number of "." entries``.
        morph_dict: Mapping from tag (e.g. "N", "VFIN", "Nom") to its count in
            the document. Missing tags count as 0.
        index: Row label of the document in ``df_document``.

    Returns:
        None. Every ratio whose denominator is 0 is written as 0 instead of
        raising ZeroDivisionError.
    """
    total_nouns = morph_dict.get("N", 0)
    total_finite = morph_dict.get("VFIN", 0)
    total_infinitives = morph_dict.get("VINF", 0)
    total_imperatives = morph_dict.get("VIMP", 0)
    total_verbs = total_finite + total_infinitives + total_imperatives + morph_dict.get("VPP", 0)
    # NOTE(review): participles are read from "PART" while the verb total uses
    # "VPP" - confirm which tag the tagger emits for participles.
    total_participles = morph_dict.get("PART", 0)
    num_sentences = 1 + word_list.count(".")

    total_keit = 0
    total_ung = 0
    total_werk = 0
    total_compounds = 0
    for word in word_list:
        # The three suffixes are mutually exclusive endings, so one chain suffices.
        if word.endswith("keit"):
            total_keit += 1
        elif word.endswith("ung"):
            total_ung += 1
        elif word.endswith("werk"):
            total_werk += 1

        # Split once per word. As before, a word the splitter fails on is
        # counted as a compound; only ordinary errors are swallowed so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            parts = compoundSplit(word)
        except Exception:
            total_compounds += 1
            continue
        if len(parts) > 1:
            total_compounds += 1

    # The suffix features are normalised by the word count, but (as before)
    # are reported as 0 for documents without any noun.
    suffix_base = len(word_list) if total_nouns else 0

    features = {
        "MO-inf2verbs": _safe_ratio(total_infinitives, total_verbs),
        "MO-part2verbs": _safe_ratio(total_participles, total_verbs),
        "MO-imp2verbs": _safe_ratio(total_imperatives, total_verbs),
        "MO-1st2finverbs": _safe_ratio(morph_dict.get("1", 0), total_finite),
        "MO-2nd2finverbs": _safe_ratio(morph_dict.get("2", 0), total_finite),
        "MO-3rd2finverbs": _safe_ratio(morph_dict.get("3", 0), total_finite),
        "MO-subj2finverbs": _safe_ratio(morph_dict.get("Subj", 0), total_finite),
        "MO-finverbs2verbs": _safe_ratio(total_finite, total_verbs),
        "MO-modverbs2verbs": _safe_ratio(morph_dict.get("MOD", 0), total_verbs),
        "MO-auxverbs2verbs": _safe_ratio(morph_dict.get("AUX", 0), total_verbs),
        # Written for every document, including those without nouns.
        "MO-verbspersent": total_verbs / num_sentences,
        "MO-nom2nouns": _safe_ratio(morph_dict.get("Nom", 0), total_nouns),
        "MO-gen2nouns": _safe_ratio(morph_dict.get("Gen", 0), total_nouns),
        "MO-dat2nouns": _safe_ratio(morph_dict.get("Dat", 0), total_nouns),
        "MO-acc2nouns": _safe_ratio(morph_dict.get("Acc", 0), total_nouns),
        "MO-keit2nouns": _safe_ratio(total_keit, suffix_base),
        "MO-ung2nouns": _safe_ratio(total_ung, suffix_base),
        "MO-werk2nouns": _safe_ratio(total_werk, suffix_base),
        "MO-compounds2nouns": _safe_ratio(total_compounds, total_nouns),
    }
    for column, value in features.items():
        df_document.loc[index, column] = value
    
def fill_document_vector(df,df_document):
    """Fill ``df_document`` with per-document morphological features.

    Row ``i`` of ``df_document`` describes the ``i``-th row of ``df``. For each
    document the occurrences of every tag are counted into the matching
    "MO-<tag>" column, the whole row is divided by the number of words in
    the document, and the ratio features are then written by
    ``fill_morph_features``.

    Args:
        df: DataFrame with an 'RFTagger' column of stringified nested lists
            (sentences -> words -> ``[word, tag, tag, ...]``).
        df_document: Zero-initialised DataFrame with one row per row of ``df``
            and one "MO-<tag>" column per tag; modified in place.
    """
    add_morph_columns(df_document)
    case_tags = ("Nom", "Gen", "Dat", "Acc")
    # enumerate keeps the row position tied to the document even when a
    # document is skipped, so later rows stay aligned with `df`.
    for index, all_sentences in enumerate(df.loc[:, 'RFTagger']):
        # Missing cells arrive from pandas as NaN (a float), not None; such
        # documents keep their all-zero row.
        if not isinstance(all_sentences, str):
            continue
        # Drop the trailing 5 characters (the empty last sentence) and re-close
        # the outer list so literal_eval sees a well-formed list.
        all_sentences = ast.literal_eval(all_sentences[:-5] + "]")
        text_length = 0
        word_list = []
        morph_dict = defaultdict(int)
        tag_counts = Counter()
        for sentence in all_sentences:
            text_length += len(sentence)
            for word_features in sentence:
                word_list.append(word_features[0])
                tags = word_features[1:]  # everything after the word itself
                is_noun = "N" in tags
                for token in tags:
                    tag_counts["MO-" + token] += 1
                    # Case tags only count towards the ratios when carried by a noun.
                    if token in case_tags and not is_noun:
                        continue
                    morph_dict[token] += 1
        # One write per distinct tag instead of one per token.
        for column, occurrences in tag_counts.items():
            df_document.loc[index, column] += occurrences
        # Normalise each value by the total text length; an empty document
        # keeps its zeros rather than turning into NaN through 0/0.
        if text_length:
            df_document.loc[index] = df_document.loc[index] / text_length
        fill_morph_features(df_document, word_list, morph_dict, index)
            
def save_df(path,df):
    """Write ``df`` to ``path`` as "|"-separated CSV without the index column."""
    df.to_csv(path_or_buf=path, sep="|", index=False)

# Collect every "MO-" feature name present in the dataset; these become the
# columns of the feature vector.
all_features = get_all_features(df)

# Document-level feature matrix: one zero-initialised row per document, with
# the feature names as alphabetically sorted columns.
zero_data = np.zeros((len(df), len(all_features)))
df_document = pd.DataFrame(data=zero_data, columns=sorted(all_features))
y = get_classifications(df)

# Fill the matrix in place, then show a preview and the final column names.
fill_document_vector(df, df_document)
display(df_document.head())
print(df_document.columns.tolist())

filename = "06_MorphologicalFeatures_df.csv"
save_df(filename, df_document)
