In [4]:
'''
Compound splitter
'''

# https://github.com/dtuggener/CharSplit
import urllib
url = "https://github.com/dtuggener/CharSplit/archive/master.zip"
filename = "CharSplit.zip"
urllib.request.urlretrieve(url, filename)

import zipfile
zip_ref = zipfile.ZipFile(filename, 'r')
zip_ref.extractall()
zip_ref.close()

import os, sys
#os.rename("CharSplit-master", "CharSplit")
from shutil import copyfile
for filename in os.listdir("CharSplit-master"):
    if filename != "README.md": 
        copyfile("CharSplit-master/" + filename, filename)
    
import char_split, ngram_probs
def compoundSplit(string):
    request = char_split.split_compound(string)[0]
    if request[0] <= 0:
        return [string]
    else:
        return compoundSplit(request[1]) + compoundSplit(request[2])
    

In [7]:
'''
Functions for obtaining the morphological features.  Goals:

• Derivational
- hand-compiled list by Hancke et al (2012) 

• Inflectional
- mood, person, tense, type of verbs
- case of nouns

• Compound words

word embeddings?

save features and output to df

'''

from nltk import word_tokenize
from collections import defaultdict, Counter, deque
import random
import numpy as np
import pandas as pd
import urllib.request
import io
import ast

df= pd.read_csv("05_FrequencyFeatures_df.csv", sep="|", lineterminator='\n')

# preprocessing Level classifications to get rid of unwanted data and "B1 " vs. "B1"
df = df[(df["Level"] == "A1")|(df["Level"] == "A2")|(df["Level"] == "B1")|(df["Level"] == "B1 ")|(df["Level"] == "B2")|(df["Level"] == "C1")|(df["Level"] == "C2")]
df = df.replace("B1 ","B1")

# returns a list of all the features included in the dataset, to be used as attributes in feature vector
def get_all_features(df):
    all_features = [] 
    sentence_count = 0
    index = 0
    for all_sentences in df.loc[:,'RFTagger']:
        if all_sentences is not None:
            all_sentences = all_sentences[:-5] # remove the empty last sentence [] as following line won't work
            all_sentences += "]" # nasty preprocessing necessary to keep list structure for literal_eval
            all_sentences = ast.literal_eval(all_sentences)
            for sentence in all_sentences:
                sentence_count +=1
                for word_features in sentence:
                    word_features = word_features[1:] #exclude the word itself and lemma at end (leads to too many features)                    
                    for token in word_features:
                        token = "MO-"+token
                        if token not in all_features:
                            all_features.append(token)
            index += 1
                    
    return sentence_count, all_features

def get_classifications(df):
    y = []
    
    for classification in df.loc[:,'Level']:
        y.append(classification)
        
    return y

def get_classifications_sentences(df):
    return

def fill_sentence_vector(df,df_sentence):
    index = 0
    for all_sentences in df.loc[:,'RFTagger']:
        if all_sentences is not None:
            all_sentences = all_sentences[:-5] # remove the empty last sentence [] as following line won't work
            all_sentences += "]" # nasty preprocessing necessary to keep list structure for literal_eval
            all_sentences = ast.literal_eval(all_sentences)
            for sentence in all_sentences:
                for word_features in sentence:
                    word_features = word_features[1:] #exclude the word itself and lemma at end (leads to too many features)
                    for token in word_features:
                        df_sentence.loc[index,token] +=1
                index +=1

def add_morph_columns(df_document):
    df_document["MO-keit"] = np.zeros(len(df_document))
    df_document["MO-ung"] = np.zeros(len(df_document))
    df_document["MO-werk"] = np.zeros(len(df_document))
    df_document["MO-derived to nouns"] = np.zeros(len(df_document))
    df_document["MO-nom. to nouns"] = np.zeros(len(df_document))
    df_document["MO-gen. to nouns"] = np.zeros(len(df_document))
    df_document["MO-subj. to verbs"] = np.zeros(len(df_document))
    df_document["MO-2p to verbs"] = np.zeros(len(df_document))
    df_document["MO-3p to verbs"] = np.zeros(len(df_document))
    df_document["MO-compounds to nouns"] = np.zeros(len(df_document))
    
def fill_morph_features(df_document, all_sentences, index):
    derived_list = ["ant", "anten", "antin", "antinnen", "arium", "arien", "ast", "asten", "astin", "astinnen", 
                    "at", "ate", "ator", "atoren", "atorin", "atorinnen", "atur", "aturen", "ei", "eien", "er", 
                    "erin", "erinnen", "ent", "ents", "enz", "enzen", 'eur', 'eure', 'eurin', 'eurinnen', 'heit', 
                    'heiten', 'ist', 'isten', 'istin', 'istinnen', 'ion', 'ionen', 'ismus', 'ismen', 'ität', 
                    'itäten', 'keit', 'keiten', 'ling', 'lingen', 'nis', 'nisse', 'schaft', 'schaften', 'tum', 
                    'tümer', 'ung', 'ungen', 'ur', 'werk', 'werke', 'wesen']
    total_derived = 0
    total_nouns = 0
    total_verbs = 0
    total_keit = 0
    total_ung = 0
    total_werk = 0
    total_nom = 0
    total_gen = 0
    total_subj = 0
    total_second = 0
    total_third = 0
    total_compounds = 0
    
    for sentence in all_sentences:
        for word_features in sentence:
            word = word_features[0]
            
            print(len(compoundSplit(word)))
            
            for derivation in derived_list:
                if word[(len(word)-len(derivation)):len(word)] == derivation:
                    total_derived += 1
                    
                    if derivation == "keit":
                        total_keit += 1
                    elif derivation == "ung":
                        total_ung += 1
                    elif derivation == "werk":
                        total_werk += 1
                    
            if "N" in word_features or "Noun" in word_features:
                total_nouns += 1
            if "VFIN" in word_features or "VINF" in word_features or "VIMP" in word_features or "VPP" in word_features or "Verb" in word_features:
                total_verbs += 1
            if "Nom" in word_features:
                total_nom += 1
            if "Gen" in word_features:
                total_gen += 1
            if "Subj" in word_features:
                total_subj += 1
            if "2" in word_features:
                total_second += 1
            if "3" in word_features:
                total_third += 1
                
    if total_nouns == 0:
        df_document.loc[index,"MO-keit"] = 0
        df_document.loc[index,"MO-ung"] = 0
        df_document.loc[index,"MO-werk"] = 0
        df_document.loc[index,"MO-derived to nouns"] = 0
        df_document.loc[index,"MO-nom. to nouns"] = 0
        df_document.loc[index,"MO-gen. to nouns"] = 0
        df_document.loc[index,"MO-compounds to nouns"] = 0
        df_document.loc[index,"MO-subj. to verbs"] = total_subj/total_verbs
        df_document.loc[index,"MO-2p to verbs"] = total_second/total_verbs
        df_document.loc[index,"MO-3p to verbs"] = total_third/total_verbs
        return
    elif total_verbs == 0: 
        df_document.loc[index,"MO-subj. to verbs"] = 0
        df_document.loc[index,"MO-2p to verbs"] = 0
        df_document.loc[index,"MO-3p to verbs"] = 0
        df_document.loc[index,"MO-keit"] = total_keit/total_nouns
        df_document.loc[index,"MO-ung"] = total_ung/total_nouns
        df_document.loc[index,"MO-werk"] = total_werk/total_nouns
        df_document.loc[index,"MO-derived to nouns"] = total_derived/total_nouns
        df_document.loc[index,"MO-nom. to nouns"] = total_nom/total_nouns
        df_document.loc[index,"MO-gen. to nouns"] = total_gen/total_nouns
        df_document.loc[index,"MO-compounds to nouns"] = total_compounds/total_nouns
        return
    
    df_document.loc[index,"MO-keit"] = total_keit/total_nouns
    df_document.loc[index,"MO-ung"] = total_ung/total_nouns
    df_document.loc[index,"MO-werk"] = total_werk/total_nouns
    df_document.loc[index,"MO-derived to nouns"] = total_derived/total_nouns
    df_document.loc[index,"MO-nom. to nouns"] = total_nom/total_nouns
    df_document.loc[index,"MO-gen. to nouns"] = total_gen/total_nouns
    df_document.loc[index,"MO-subj. to verbs"] = total_subj/total_verbs
    df_document.loc[index,"MO-2p to verbs"] = total_second/total_verbs
    df_document.loc[index,"MO-3p to verbs"] = total_third/total_verbs
    df_document.loc[index,"MO-compounds to nouns"] = total_compounds/total_nouns

def fill_document_vector(df,df_document):
    add_morph_columns(df_document)
    index = 0
    for all_sentences in df.loc[:,'RFTagger']:
        if all_sentences is not None:
            all_sentences = all_sentences[:-5] # remove the empty last sentence [] as following line won't work
            all_sentences += "]" # nasty preprocessing necessary to keep list structure for literal_eval
            all_sentences = ast.literal_eval(all_sentences)
            text_length = 0
            for sentence in all_sentences:
                text_length += len(sentence)
                for word_features in sentence:
                    word_features = word_features[1:] #exclude the word itself and lemma at end (leads to too many features)
                    for token in word_features:
                        df_document.loc[index,"MO-"+token] +=1
            df_document.loc[index] = df_document.loc[index] / text_length # normalize each value with the total text length       
            fill_morph_features(df_document, all_sentences, index)
            index +=1

# get total number of sentences and all features contained in dataset in order to create vector columns
num_sentences, all_features = get_all_features(df)  
print(len(all_features))
print(all_features)

'''
# sentence level vector, i.e. each row in the feature vector represents one sentence
zero_data = np.zeros(shape=(num_sentences,len(all_features)))
df_sentence = pd.DataFrame(zero_data, columns=sorted(all_features))
y = get_classifications(df)

fill_sentence_vector(df,df_sentence)
print(df_sentence.shape)
'''

# document level vector, i.e. each row in the feature vector represents one document
zero_data = np.zeros(shape=(len(df),len(all_features))) # create 1 row in the feature vector per document
df_document = pd.DataFrame(zero_data,columns=sorted(all_features))
y = get_classifications(df)

fill_document_vector(df,df_document)
display(df_document.head())

def save_df(path,df):
    df.to_csv(path,sep="|",index=False)

filename = "06_MorphologicalFeatures_df.csv"
save_df(filename,df)

103
['MO-APPR', 'MO-Dat', 'MO-PRO', 'MO-Poss', 'MO-Attr', 'MO--', 'MO-Sg', 'MO-Fem', 'MO-N', 'MO-Reg', 'MO-VFIN', 'MO-Full', 'MO-3', 'MO-Pl', 'MO-Pres', 'MO-Ind', 'MO-CARD', 'MO-Nom', 'MO-SYM', 'MO-Pun', 'MO-Sent', 'MO-ART', 'MO-Def', 'MO-Sein', 'MO-1', 'MO-Pers', 'MO-Subst', 'MO-*', 'MO-CONJ', 'MO-Coord', 'MO-ADV', 'MO-ADJD', 'MO-Pos', 'MO-Masc', 'MO-PART', 'MO-Verb', 'MO-Haben', 'MO-Acc', 'MO-Neut', 'MO-Comma', 'MO-Indef', 'MO-Rel', 'MO-ADJA', 'MO-In', 'MO-Mod', 'MO-APPRART', 'MO-Neg', 'MO-VINF', 'MO-Name', 'MO-Außer', 'MO-Comp', 'MO-VPP', 'MO-Psp', 'MO-Aber', 'MO-Gen', 'MO-Auf', 'MO-ITJ', 'MO-Past', 'MO-Subj', 'MO-Aux', 'MO-Denn', 'MO-An', 'MO-Sup', 'MO-Refl', 'MO-PROADV', 'MO-Dem', 'MO-Deg', 'MO-Colon', 'MO-Quot', 'MO-Left', 'MO-Ans', 'MO-2', 'MO-Inter', 'MO-Right', 'MO-SubFin', 'MO-Vor', 'MO-Wie', 'MO-Other', 'MO-XY', 'MO-Als', 'MO-Unter', 'MO-Zu', 'MO-SubInf', 'MO-VIMP', 'MO-Über', 'MO-Per', 'MO-Bis', 'MO-zu', 'MO-FM', 'MO-Slash', 'MO-Paren', 'MO-Hinter', 'MO-TRUNC', 'MO-Noun', '

RecursionError: maximum recursion depth exceeded while calling a Python object

In [None]:
'''
Tensorflow implementation
'''

import tensorflow as tf
import math

# get a list of all words in the dataframe for word2vec processing            
def vocabulary_by_level(df):
    levels = ["A1","A2","B1","B2","C1","C2"]
    levels_dict = defaultdict(list)
    
    for level in levels:
        for text in df[df["Level"] == level].loc[:,"Text"]:
            text = word_tokenize(text)
            for token in text:
                levels_dict[level].append(token)
    return levels_dict

# uses build_dataset to extract n most common words and assign unique integer to each word.  Returns
# dicts which can look up integer-word and word-integer pairs.
def get_embeddings_dicts(levels_dict, vocabulary_size):
    embeddings_dict = dict()
    
    for key in levels_dict.keys():
        data, count, dictionary, reversed_dictionary = build_dataset(levels_dict[key],vocabulary_size)
        embeddings_dict[key] = [data,count,dictionary,reversed_dictionary]
    
    return embeddings_dict

# Extract the n most common words from each level's texts.  
# Credit to http://adventuresinmachinelearning.com/word2vec-tutorial-tensorflow/
def build_dataset(words, n_words):
    count = [['UNK', -1]]
    count.extend(Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary

data_index = 0
# generate batch data
def generate_batch(data, batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    context = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window input_word skip_window ]
    buffer = deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # input word at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # this is the input word
            context[i * num_skips + j, 0] = buffer[target]  # these are the context words
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, context

vocabulary_size = 428
levels_dict = vocabulary_by_level(df)
embeddings_dict = get_embeddings_dicts(levels_dict, vocabulary_size

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a context

import datetime as dt

graph = tf.Graph()

def run(graph, num_steps, class_tuple):
    data = class_tuple[0]
    count = class_tuple[1]
    dictionary = class_tuple[2]
    reverse_dictionary = class_tuple[3]
    
    with tf.Session(graph=graph) as session:
        # We must initialize all variables before we use them.
        init.run()
        print('Initialized')

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_context = generate_batch(data,
                batch_size, num_skips, skip_window)
            feed_dict = {train_inputs: batch_inputs, train_context: batch_context}

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            _, loss_val = session.run([optimizer, cross_entropy], feed_dict=feed_dict)
            average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
    
                # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8  # number of nearest neighbors
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for k in range(top_k):
                        close_word = reverse_dictionary[nearest[k]]
                        log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)
        
        final_embeddings = normalized_embeddings.eval()
            
with graph.as_default():

    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_context = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    
    # Construct the variables for the softmax
    weights = tf.Variable(
    tf.truncated_normal([embedding_size, vocabulary_size],
                          stddev=1.0 / math.sqrt(embedding_size)))
    biases = tf.Variable(tf.zeros([vocabulary_size]))
    hidden_out = tf.transpose(tf.matmul(tf.transpose(weights), tf.transpose(embed))) + biases

    # convert train_context to a one-hot format
    train_one_hot = tf.one_hot(train_context, vocabulary_size)

    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, labels=train_one_hot))
    
    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    nce_loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_context,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(nce_loss)

    # Add variable initializer.
    init = tf.global_variables_initializer()

num_steps = 50000

for class_level in embeddings_dict:
    print("Starting!  class level is ",class_level)
    nce_start_time = dt.datetime.now()
    class_tuple = embeddings_dict[class_level]
    run(graph,num_steps,class_tuple)
    nce_end_time = dt.datetime.now()
    print("NCE method took {} seconds to run 50000 iterations".format((nce_end_time-nce_start_time).total_seconds()))

In [None]:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(df_document, y, test_size=0.2)

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
level_pred = clf.predict(X_test)
cnf_matrix = confusion_matrix(y_test, level_pred)
class_names = ["A1", "A2", "B1", "B2", "C1", "C2"]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')