In [1]:
import re
import pickle
import json

from collections import Counter, defaultdict

import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedGroupKFold

In [11]:
with open('data/nouns.json', 'rt', encoding='UTF-8') as json_file:
    nouns = json.load(json_file)

In [12]:
#wordfreq = pd.read_csv('data/third-party/FrequencyWords/content/2018/de/de_full.txt', sep=' ', names=['word','freq']).set_index('word').freq.sort_values(ascending=False)#sort by frequency
#n_words_vocab = wordfreq.sum()
#wordfreq = wordfreq.to_dict()

In [25]:
N_CLASSES = 100
STATRULES_ACCURACY = 1

In [26]:
def infer_rule(wordform,lemma):

        
    for idx in range(min(len(wordform),len(lemma))):
        if wordform[:idx+1]!=lemma[:idx+1]:
            idx -= 1
            break 

    seq_to_remove = wordform[idx+1:]
    seq_to_add = lemma[idx+1:]
    
    rule = (seq_to_remove,seq_to_add)

    assert re.sub(f'{seq_to_remove}$',seq_to_add,wordform)==lemma
    
    return rule

In [27]:
rules = []

for wordform,lemmas in nouns.items():
    for lemma_dict in lemmas:
        lemma = lemma_dict['lemma']
        rules.append((wordform,lemma,lemma_dict['genus'],lemma_dict['declination'], infer_rule(wordform,lemma)))

rules = pd.DataFrame(rules,columns=['wordform', 'lemma', 'genus', 'declination', 'rule'])

In [28]:
N_LAST = 6

word_endings = []
for idx in range(-N_LAST,0):
    word_endings.append(rules.wordform.apply(lambda x:x[idx:]).rename(f'last_{abs(idx)}'))
    #word_endings.append(rules.wordform.apply(lambda x:x[idx] if len(x)>=abs(idx) else '').rename(f'last_{abs(idx)}'))
    
word_endings = pd.concat(word_endings,axis=1)

rules = pd.concat([word_endings,rules],axis=1)

In [29]:
nouns_stat_rules = {}

for idx in range(-N_LAST,0):
    
    feature = f'last_{abs(idx)}'
    
    feature_df = rules.groupby(feature).rule.value_counts(normalize=True).reset_index()
    feature_counts = rules[feature].value_counts().rename('n_wordforms').reset_index()
    
    feature_df = feature_df.merge(feature_counts)
    feature_df = feature_df[feature_df.n_wordforms>100]
    
    feature_df = feature_df[feature_df.proportion>STATRULES_ACCURACY].sort_values(by='n_wordforms',ascending=False)
    
    nouns_stat_rules[feature] = feature_df.set_index(feature).rule.to_dict()

In [30]:
with open(f'data/nouns_stat_rules-{int(STATRULES_ACCURACY*100)}.pickle','wb') as f:
    pickle.dump({'rules_dict':nouns_stat_rules, 'n_last':N_LAST}, f)

In [506]:
class_counts = rules.rule.value_counts()

class_counts = class_counts.iloc[:N_CLASSES]

rules.loc[~rules.rule.isin(class_counts.index),'rule'] = '-'

class_counts

rule
(, )             392682
(en, )            85962
(n, )             82390
(s, )             72581
(e, )             63237
                  ...  
(örner, orn)        165
(öcher, och)        165
(ännern, ann)       162
(äste, ast)         162
(ina, en)           160
Name: count, Length: 100, dtype: int64

In [None]:
rules_enc = {rule:idx for idx,rule in enumerate(rules.rule.unique())}

In [431]:
features_encoder = OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-2,encoded_missing_value=-1)

features_list = rules.columns.drop(['lemma','wordform','rule']).tolist()

features_list

['last_6',
 'last_5',
 'last_4',
 'last_3',
 'last_2',
 'last_1',
 'genus',
 'connection']

In [432]:
X = rules[features_list].values
y = rules.rule.map(rules_enc).values

X.shape

(820231, 8)

In [470]:
sgkf = StratifiedGroupKFold(n_splits=5)

In [471]:
for i, (train_index, test_index) in enumerate(sgkf.split(X, y, rules.lemma)):
    break

In [472]:
X_train, y_train  = X[train_index], y[train_index] 
X_test, y_test = X[test_index], y[test_index]

In [473]:
X_train = features_encoder.fit_transform(X_train).astype(int)
X_test = features_encoder.transform(X_test).astype(int)

In [474]:
class CategoricalNaiveBayes():

    def __init__(self, kappa=2, epsilon=1e-20):
        
        self.kappa = kappa
        self.epsilon = epsilon

    def _compute_priors_logprobs(self, y):

        priors_probs = [class_counts/len(y) for class_counts in self.class_counts]

        self.priors_logprobs = np.log(priors_probs)
        
    def _compute_loglikelihood(self, X, y):
        
        feature_counts = {feature_idx:np.zeros((self.n_categories[feature_idx]+2,self.n_classes)) for feature_idx in range(self.n_features)}
        
        for features, class_idx in zip(X, y):
            
            for feature_idx,feature_value in enumerate(features):
                
                feature_counts[feature_idx][feature_value,class_idx] += 1

        loglikelihood = {feature_idx:np.zeros((self.n_categories[feature_idx]+2,self.n_classes)) for feature_idx in range(self.n_features)}

        for feature_idx in range(self.n_features):
            loglikelihood[feature_idx] = np.log((feature_counts[feature_idx]+self.epsilon)
                                                          / (np.repeat(self.class_counts[None,...], self.n_categories[feature_idx]+2, axis=0)
                                                            + self.kappa*self.epsilon))

            loglikelihood[feature_idx][-1,:] = 0

        self.loglikelihood = loglikelihood

        
    def fit(self, X_train, y_train, priors_logprobs=None):

        counter = Counter(y_train)
        
        class_ids, class_counts = zip(*sorted(counter.items()))
        
        self.class_counts = np.array(class_counts)
        self.n_classes = np.max(class_ids)+1

        self.n_features = X_train.shape[1]
        self.n_categories = X_train.max(axis=0)

        if priors_logprobs is None:
            self._compute_priors_logprobs(y_train)
        else:
            self.priors_logprobs = priors_logprobs

        self._compute_loglikelihood(X_train, y_train)

    def _get_bayes_numerator(self, X):

        n_samples = X.shape[0]

        sample_loglikelihood = np.zeros((n_samples,self.n_features,self.n_classes))

        for feature_idx in range(self.n_features):
            
            sample_loglikelihood[:,feature_idx,:] = self.loglikelihood[feature_idx][X[:,feature_idx]] #N_samplesxN_classes

        numerator = sample_loglikelihood.sum(axis=1)  + self.priors_logprobs[None,...]

        return numerator
            
    def predict_proba(self, X):

        numerator = np.exp(self._get_bayes_numerator(X))
        
        probs = numerator/numerator.sum(axis=1,keepdims=True)
                            
        return probs

    def predict(self, X):

        predicted_class_ids = self._get_bayes_numerator(X).argmax(1)

        return predicted_class_ids
        
    def score(self, X, y):

        y_pred = self.predict(X)

        return (y_pred==np.array(y)).mean()

In [475]:
#pseudocount = 1

#priors_df = rules.groupby('wordform').rule.value_counts(normalize=True).reset_index()

#priors_df['wordfreq'] = priors_df.wordform.map(wordfreq)
#priors_df.wordfreq = priors_df.wordfreq.fillna(0) + pseudocount
#priors_df.wordfreq = priors_df.wordfreq/n_words_vocab
#
#rule_priors = priors_df.groupby('rule').apply(lambda x:(x.wordfreq*x.proportion).sum())
#
#rule_priors = rule_priors/rule_priors.sum()
#
#prior_logprobs = {rules_enc[rule]:np.log(rule_prob) for rule,rule_prob in rule_priors.items()}

In [476]:
nbc = CategoricalNaiveBayes()

nbc.fit(X_train,y_train)

In [477]:
nbc.score(X_test,y_test)

0.9212313319110027

In [478]:
X_ = features_encoder.fit_transform(X).astype(int)

nbc.fit(X_,y)

In [479]:
with open(f'data/nouns-nbc-top{N_CLASSES}.pickle','wb') as f:
    pickle.dump({'clf':nbc,
                   'features_encoder':features_encoder,
                   'features_list':features_list,
                   'rules_list':list(rules_enc.keys()),
                   'n_last':N_LAST}, f)

In [449]:
class NounsNBC():

    def __init__(self, path):

        with open(path,'rb') as f:

            data = pickle.load(f)
            
            self.nbc_clf = data['clf']
            self.features_encoder = data['features_encoder']
            self.features_list = data['features_list']
            self.rules_list = data['rules_list']
            self.n_last = data['n_last']
        
    def __call__(self, word, constraints=None):
        
        word_parts = [word[idx:] for idx in range(-self.n_last,0)]

        if not constraints:
            constraints = ((-1,-1),)

        data = [word_parts+list(constraint) for constraint in constraints]

        word_enc = self.features_encoder.transform(data).astype(int)
        
        if len(constraints)==1:

            pred = self.nbc_clf.predict(word_enc)[0]
            
        else:
                        
            pred = self.nbc_clf.predict_proba(word_enc).mean(0).argmax()
        
        rule = self.rules_list[pred]
    
        if rule=='-':
            return None
        else:
            seq_to_remove,seq_to_add = rule
            return re.sub(f'{seq_to_remove}$',seq_to_add,word)
            
        return None

In [450]:
nbc = NounsNBC('data/nouns-nbc-top100.pickle')

In [451]:
nbc('hhhwkeiten',(('f','Nominativ Plural'),))

'hhhwkeit'