# Imports

In [None]:
from sklearn.model_selection import train_test_split as split
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
from spacy.lang.de import German
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
import json
from collections import defaultdict
import matplotlib.pyplot as plt
from sklearn import metrics
import eli5
from sklearn.svm import LinearSVC
import faiss
import numpy as np

### Helper functions & classes

In [None]:
def listToString(s):
    """Concatenate the strings in ``s`` into one string, separated by single spaces."""
    return " ".join(s)

In [None]:
def confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, labels=None):
    """Draw a confusion matrix as a colour-coded image and show it.

    Note: despite its name this function only *plots*; it does not compute
    the matrix.

    Args:
        cm: 2-D array-like of values to display (passed to ``plt.imshow``).
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cells.
        labels: Tick labels used on both axes, in matrix order. ``None``
            (the default) behaves like an empty list, i.e. no ticks.
    """
    # Avoid a shared mutable default; None stands for "no tick labels".
    labels = [] if labels is None else labels

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called after the axis labels are set so the layout accounts for them.
    plt.tight_layout()
    plt.show()

In [None]:
class FaissKNeighbors:
    """k-nearest-neighbour classifier built on a ``faiss.IndexFlatL2`` index.

    Each query is assigned the label that occurs most often among its ``k``
    nearest training vectors; ties go to the smallest label in sorted order.

    Attributes:
        index: The FAISS index, or ``None`` until ``fit`` has been called.
        y: The training labels exactly as passed to ``fit``.
        k: Number of neighbours that vote for each prediction.
    """

    def __init__(self, k=5):
        self.index = None
        self.y = None
        self.k = k

    def fit(self, X, y):
        """Index the training vectors and remember their labels.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: Sequence of n_samples labels.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``X`` is not 2-D or ``y`` has a different length.
        """
        # FAISS expects C-contiguous float32 input; astype alone does not
        # guarantee the memory layout.
        X = np.ascontiguousarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(y) != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples")

        self.index = faiss.IndexFlatL2(X.shape[1])
        self.index.add(X)
        self.y = y
        return self

    def predict(self, X):
        """Return the majority label among the nearest neighbours of each row.

        Args:
            X: 2-D array-like of query vectors with the same number of
                features as the training data.

        Returns:
            1-D numpy array with one predicted label per query row.

        Raises:
            RuntimeError: If called before ``fit`` or on an empty index.
        """
        if self.index is None or self.index.ntotal == 0:
            raise RuntimeError("FaissKNeighbors must be fitted before calling predict")

        queries = np.ascontiguousarray(X, dtype=np.float32)
        # Never ask for more neighbours than exist: FAISS pads missing results
        # with index -1, which would silently vote with the last label.
        k = min(self.k, self.index.ntotal)
        distances, indices = self.index.search(queries, k=k)

        # Encode labels as 0..n_classes-1 so bincount also works for labels
        # that are not non-negative integers (e.g. strings).
        classes, encoded = np.unique(np.asarray(self.y).ravel(), return_inverse=True)
        votes = encoded[indices]
        winners = [np.argmax(np.bincount(row, minlength=len(classes))) for row in votes]
        return classes[np.array(winners, dtype=np.intp)]

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        return float(np.mean(self.predict(X) == np.asarray(y).ravel()))

## Reading Data

In [None]:
# Fields removed from a record's "text" mapping before its remaining values
# are collected, keyed by the record's "Label". Labels that are not listed
# here keep all of their fields.
FIELDS_TO_DROP = {
    'Justiz': (
        "comments", "Gericht", "Entscheidungsdatum", "Geschäftszahl", "Norm",
        "Entscheidungstexte", "Rechtssatznummer", "European Case Law Identifier",
    ),
    'Vfgh': (
        "comments", "Gericht", "Verfassungsgerichtshof", "Entscheidungsdatum",
        "Geschäftszahl", "Sammlungsnummer", "Spruch", "Begründung",
        "European Case Law Identifier",
    ),
    'RegV': ("comments",),
    'Gemeinderecht': ("comments",),
    'Bundesnormen': (
        "comments", "Kurztitel", "Kundmachungsorgan", "§/Artikel/Anlage",
        "Außerkrafttretensdatum", "Inkrafttretensdatum", "Beachte", "Typ", "Index",
    ),
    'Lgbl': ("comments", "Datum der Kundmachung", "Fundstelle", "Bundesland"),
    'Normenliste': (
        "comments", "Dokumentnummer", "Typ", "Abkürzung des Verwaltungsgerichtshofes",
        "Fundstelle", "Abkürzung", "Index", "Anmerkung",
    ),
    'Begut': ("comments",),
    'Vwgh': (
        "comments", "Gericht", "Entscheidungsdatum", "Geschäftszahl",
        "European Case Law Identifier", "Beachte", "Hinweis auf Stammrechtssatz",
    ),
    'Landesnormen': (
        "comments", "Bundesland", "Kurztitel", "Kundmachungsorgan", "Typ",
        "§/Artikel/Anlage", "Außerkrafttretensdatum", "Index", "Im RIS seit",
        "Inkrafttretensdatum", "Gesetzesnummer", "Dokumentnummer", "Änderung",
    ),
}

# The input file holds one JSON document per line; the context manager closes
# the file even if reading fails.
with open("tr_data_with_category.txt", "r", encoding="utf-8") as f:
    data_laws = f.readlines()

all_category_n = []  # category label of every record, in file order
final_laws = []      # remaining field values of every record, in file order
# Iterating the list directly lets tqdm know the total and show real progress.
for var in tqdm(data_laws, desc='Token and Lemmatization'):
    if not var.strip():
        # A blank (e.g. trailing) line is not a JSON document; skip it
        # instead of crashing in json.loads.
        continue
    temp = json.loads(var)
    law_category = temp["Label"]
    law_content = temp["Data"]["text"]

    for field in FIELDS_TO_DROP.get(law_category, ()):
        law_content.pop(field, None)

    final_laws.append(list(law_content.values()))
    all_category_n.append(law_category)

### Modify data

In [None]:
# Flatten one nesting level per record: the elements of every field value are
# concatenated into a single list.
# NOTE(review): presumably every field value is a list of strings; a plain
# string value would be split into single characters here - confirm.
prep_laws = [
    [item for field_value in record for item in field_value]
    for record in final_laws
]

In [None]:
# Collapse each flattened record into one space-separated string.
final_laws = list(map(listToString, prep_laws))

In [None]:
# Integer id assigned to every known category label.
CATEGORY_IDS = {
    'Justiz': 1,
    'Vfgh': 2,
    'RegV': 3,
    'Gemeinderecht': 4,
    'Bundesnormen': 5,
    'Lgbl': 6,
    'Normenliste': 7,
    'Begut': 8,
    'Vwgh': 9,
    'Landesnormen': 10,
}

# One entry per record, so the list stays index-aligned with final_laws.
# Unknown labels are reported and stored as None; appending nothing for them
# would shift every later label onto the wrong document.
all_category = []
for word in all_category_n:
    category_id = CATEGORY_IDS.get(word)
    if category_id is None:
        print(word)
    all_category.append(category_id)

# Keep only documents with more than one character and a known category.
fullnew_law = []
fullnew_category = []
for asd, category_id in zip(final_laws, all_category):
    if category_id is not None and len(asd) > 1:
        fullnew_law.append(asd)
        fullnew_category.append(category_id)

## Vectorizing

In [None]:
# Hold out 30% of the documents for testing. The seed is fixed so the split,
# and therefore every metric reported below, is reproducible across runs.
tr_laws, tst_laws, tr_labels, tst_labels = split(
    fullnew_law,
    fullnew_category,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Containers for the vectorized train / test matrices; filled by later cells.
tr_vecs, tst_vecs = [], []

In [None]:
# Learn the TF-IDF vocabulary (capped at 2000 features) on the training texts
# only. fit_transform gives the same matrix as fit followed by transform but
# tokenizes the corpus once instead of twice.
vectorizer = TfidfVectorizer(max_features=2000)
vectors = vectorizer.fit_transform(tr_laws).toarray()
tr_vecs.append(vectors)

In [None]:
# Project the held-out texts with the vectorizer fitted on the training data
# and store the dense result.
vectors = vectorizer.transform(tst_laws).toarray()
tst_vecs += [vectors]

## Classifiers and fitting

In [None]:
# Logistic regression with an L2 penalty, optimized by the 'sag' solver.
params = dict(penalty='l2', solver='sag')
classifier = LogisticRegression
model = classifier(**params)
model.fit(tr_vecs[0], tr_labels)

In [None]:
# Random forest built from 15 trees.
params = dict(n_estimators=15)
classifier = RandomForestClassifier
model = classifier(**params)
model.fit(tr_vecs[0], tr_labels)

In [None]:
# Multinomial naive Bayes with smoothing alpha=5 and class priors learned
# from the training data.
params = dict(alpha=5, fit_prior=True)
classifier = MultinomialNB
model = classifier(**params)
model.fit(tr_vecs[0], tr_labels)

In [None]:
# scikit-learn k-nearest-neighbours classifier (5 neighbours, leaf size 15).
params = dict(n_neighbors=5, leaf_size=15)
classifier = KNeighborsClassifier
model = classifier(**params)
model.fit(tr_vecs[0], tr_labels)

In [None]:
# Linear support vector classifier with regularization parameter C=10.
params = dict(C=10)
classifier = LinearSVC
model = classifier(**params)
model.fit(tr_vecs[0], tr_labels)

In [None]:
# FAISS-backed k-NN with its default k. Unlike the cells above, the fitted
# object lives in `classifier`; `model` is left untouched.
classifier = FaissKNeighbors()
classifier.fit(X=tr_vecs[0], y=tr_labels)

## Predict and results

In [None]:
# Evaluate the most recently fitted `model` on the held-out set.
test_data = np.array(tst_vecs[0])
test_labels = np.array(tst_labels).reshape(-1,1)
predicted_tst_labels = model.predict(test_data) #change model to classifier with FaissKNeighbors

# Category ids run from 1 to 10, so an 11x11 matrix lets the ids be used
# directly as indices; row and column 0 stay empty.
conf = np.zeros([11,11])
# Count every (true, predicted) pair. np.add.at accumulates repeated index
# pairs correctly, which a plain fancy-indexed `+=` would not.
np.add.at(conf, (test_labels.ravel(), np.asarray(predicted_tst_labels)), 1)

# Normalize each row to fractions of its true class. Rows without samples
# (always row 0) stay at zero instead of turning into NaN through 0/0.
row_totals = conf.sum(axis=1, keepdims=True)
confnorm = np.divide(conf, row_totals, out=np.zeros_like(conf), where=row_totals > 0)

confusion_matrix(confnorm, labels=list(range(11)))
acc = model.score(test_data,test_labels)

In [None]:
# Accuracy as reported by the model's own score() ...
print(acc)
# ... cross-checked against scikit-learn's metric. accuracy_score is never
# imported on its own in this file, so it is reached through the imported
# `metrics` module.
print(metrics.accuracy_score(test_labels.ravel(), predicted_tst_labels))

## Explain results (regression)

In [None]:
# Category names listed in ascending order of their integer ids (1..10).
category_names = ['Justiz','Vfgh','RegV','Gemeinderecht','Bundesnormen','Lgbl','Normenliste','Begut','Vwgh','Landesnormen']
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
eli5.explain_weights(model, feature_names=feature_names, target_names=category_names)