In [1]:
import sys
import re
from collections import defaultdict
import json
import os
import re
import pickle


import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedGroupKFold,RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Noun lexicon, parsed from JSON.
with open('data/nouns.json', 'rt', encoding='UTF-8') as json_file:
    nouns = json.load(json_file)

# Word-frequency table, parsed as two space-separated columns (word, freq).
# keep_default_na=False keeps tokens such as "null", "NA" or "nan" as ordinary
# words; with the pandas defaults they would be read as missing values and the
# resulting dictionary would get a NaN key instead of the word itself.
wordfreq = (
    pd.read_csv(
        'data/third-party/FrequencyWords/content/2018/de/de_full.txt',
        sep=' ',
        names=['word', 'freq'],
        keep_default_na=False,
    )
    .set_index('word')
    .freq
    .sort_values(ascending=False)  # sort by frequency
)
wordfreq = wordfreq.to_dict()

In [3]:
# Build one (wordform, lemma, genus, connection, rule) row per lexicon entry.
# The rule says how the word form relates to its lemma:
#   ''         the word form is the lemma itself
#   '+<tail>'  the word form is the lemma followed by <tail>
#   '-'        the word form does not start with the lemma
rules = []

for wordform, lemmas in nouns.items():
    for lemma_dict in lemmas:
        lemma = lemma_dict['lemma']
        if wordform == lemma:
            rule = ''
        elif wordform.startswith(lemma):
            # Slice off the leading lemma. str.replace would substitute every
            # occurrence of the lemma inside the word form, not only the
            # prefix, and could yield malformed rules such as '++'.
            rule = '+' + wordform[len(lemma):]
        else:
            rule = '-'
        rules.append((wordform, lemma, lemma_dict['genus'], lemma_dict['connection'], rule))

rules = pd.DataFrame(rules, columns=['wordform', 'lemma', 'genus', 'connection', 'rule'])

In [4]:
#rules_blind = rules.copy()
#rules_blind[['connection','genus']] = '-'

#rules = pd.concat([rules,rules_blind])

In [5]:
# Number of rows per rule label, most frequent first.
class_counts = rules['rule'].value_counts()
class_counts.head()

rule
       392682
+en     85962
+n      82390
+s      72581
-       71555
Name: count, dtype: int64

In [6]:
# Keep only the rule labels that occur more than 20 times ...
class_counts = class_counts.loc[class_counts > 20]

# ... and relabel every row carrying any other rule as '-'.
rules.loc[~rules['rule'].isin(class_counts.index), 'rule'] = '-'

In [7]:
N_LAST = 6

# One feature column per suffix length, longest first: last_6, ..., last_1.
# Each column holds the trailing `length` characters of the word form (the
# whole word when it is shorter than that).
word_endings = pd.DataFrame({
    f'last_{length}': rules.wordform.apply(lambda form, n=length: form[-n:])
    for length in range(N_LAST, 0, -1)
})

# Suffix features go in front of the existing columns.
rules = pd.concat([word_endings, rules], axis=1)

In [13]:
# Categories not seen during fit are encoded as -1 instead of raising.
features_encoder = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')

# Integer label for every distinct rule string.
rules_enc = dict((rule, idx) for idx, rule in enumerate(rules.rule.unique()))

In [9]:
# Feature matrix: every column except the identifiers and the target.
non_feature_columns = ['lemma', 'wordform', 'rule']
X = rules.drop(columns=non_feature_columns).to_numpy()

# Target vector: the rule strings mapped to their integer labels.
y = rules['rule'].map(rules_enc).to_numpy()

X.shape

(820231, 8)

In [10]:
# 5-fold stratified splitter; the groups are passed in when split() is called.
sgkf = StratifiedGroupKFold(n_splits=5)

In [11]:
# Only the first fold is used as the train/test split; rows are grouped by
# lemma when splitting.
i, (train_index, test_index) = next(enumerate(sgkf.split(X, y, rules.lemma)))

In [12]:
# Materialise the train and test partitions from the fold indices.
X_train = X[train_index]
y_train = y[train_index]
X_test = X[test_index]
y_test = y[test_index]

In [14]:
# Learn the category codes from the training rows only, then encode both
# partitions with that same fitted encoder.
features_encoder.fit(X_train)
X_train = features_encoder.transform(X_train)
X_test = features_encoder.transform(X_test)

In [15]:
#distributions = dict(max_depth=[5,10,15,20,30],min_samples_split=[2,5,10,20,50])
#dt = DecisionTreeClassifier()
#clf = RandomizedSearchCV(dt,distributions,random_state=0)
#search = clf.fit(X_train,y_train)

In [26]:
# random_state is fixed so that repeated runs build the same tree: scikit-learn
# permutes the candidate features at every split, so equally good splits can
# otherwise be resolved differently from run to run.
clf = DecisionTreeClassifier(max_depth=20, min_samples_split=50, random_state=0)

#clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)

In [18]:
# Score of the fitted tree on the held-out fold.
clf.score(X_test,y_test)

0.9535920021945198

In [24]:
# Feature column names, in the same order as the columns of X.
rules.drop(columns=['lemma','wordform','rule']).columns

Index(['last_6', 'last_5', 'last_4', 'last_3', 'last_2', 'last_1', 'genus',
       'connection'],
      dtype='object')

In [28]:
# Build the masked copy here so that this cell does not rely on a cell further
# down having been executed first (on a fresh kernel X_test_ would be undefined).
X_test_ = X_test.copy()
X_test_[:, -2:] = -1  # overwrite the last two feature columns with the unknown code

X_test_[-2:]

array([[-1., -1., -1., -1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1., -1., -1., -1.]])

In [29]:
# Ablation: score the tree with the last two feature columns (genus and
# connection, per the feature column listing) replaced by -1, the value the
# encoder assigns to unknown categories.
X_test_ = X_test.copy()
X_test_[:,-2:] = -1

clf.score(X_test_,y_test)

0.5866743881252096

In [20]:
# Refit the tree on every row. The encoder is NOT refit here: it still carries
# the categories learned from the training fold, so values that occur only in
# the held-out rows are encoded as -1.
# NOTE(review): presumably intentional, since it exposes the final tree to the
# "unknown" code - confirm before changing.
clf.fit(features_encoder.transform(X),y)

In [21]:
# Everything needed at inference time: the fitted tree, the fitted feature
# encoder, the feature column order, the rule strings (position = class label)
# and the number of suffix features.
model_bundle = {
    'clf': clf,
    'features_encoder': features_encoder,
    'features_list': rules.columns.drop(['lemma', 'wordform', 'rule']).tolist(),
    'rules_list': list(rules_enc.keys()),
    'n_last': N_LAST,
}

with open('data/nouns-dt.pickle', 'wb') as f:
    pickle.dump(model_bundle, f)

In [27]:
class NounsDTClassifier():
    """Reduce a noun word form to its lemma using a pickled decision tree.

    The pickle file must contain a dictionary with the keys ``clf`` (fitted
    classifier), ``features_encoder`` (fitted feature encoder),
    ``features_list`` (feature names in column order), ``rules_list`` (rule
    strings, indexed by class label) and ``n_last`` (number of suffix
    features).
    """

    def __init__(self, path):
        """Load the model bundle stored at ``path``.

        NOTE(security): ``pickle.load`` can execute arbitrary code contained
        in the file. Only load bundles from a trusted source.
        """
        with open(path, 'rb') as f:
            data = pickle.load(f)

        self.dt_clf = data['clf']
        self.features_encoder = data['features_encoder']
        self.features_list = data['features_list']
        self.rules_list = data['rules_list']
        self.n_last = data['n_last']

    def __call__(self, word, constraints=None):
        """Predict the lemma of ``word``.

        Parameters
        ----------
        word : str
            The word form to reduce.
        constraints : iterable of (genus, connection) pairs, optional
            Candidate values for the last two features. When omitted, every
            combination of the genus and connection categories known to the
            encoder is tried. Class probabilities are averaged over all
            candidates.

        Returns
        -------
        str or None
            ``word`` itself when the predicted rule is ``''``; ``word`` with
            the predicted suffix removed when the rule is ``'+<suffix>'``
            (unchanged if it does not end with that suffix); ``None`` for any
            other rule or when the features cannot be encoded.
        """
        if constraints is None:
            encoding_dict = dict(zip(self.features_list, self.features_encoder.categories_))
            constraints = [[genus, connection]
                           for connection in encoding_dict['connection']
                           for genus in encoding_dict['genus']]

        # Suffix features, longest first, matching the training column order.
        word_parts = [word[idx:] for idx in range(-self.n_last, 0)]
        data = [word_parts + list(constraint) for constraint in constraints]

        try:
            word_enc = self.features_encoder.transform(data)
        except Exception:
            # Best effort: input that cannot be encoded (e.g. no constraints
            # at all) yields "no answer" rather than an error.
            return None

        pred = self.dt_clf.predict_proba(word_enc).mean(0)

        # predict_proba columns follow clf.classes_, which need not be the
        # contiguous range 0..K-1; translate the column index to the label.
        label = self.dt_clf.classes_[np.argmax(pred)]
        rule = self.rules_list[int(label)]

        if rule == '':
            return word

        if rule.startswith('+'):
            suffix = rule[1:]
            # Plain string comparison: the suffix is literal text, not a
            # regular expression, so characters such as '.' or '+' are safe.
            if suffix and word.endswith(suffix):
                return word[:len(word) - len(suffix)]
            return word

        return None