### Importing libraries

In [None]:
from spacy.lang.de import German
from tqdm import tqdm
from spacy.lang.de.stop_words import STOP_WORDS
import ftfy as ff
import re
import pandas as pd
import json
from collections import defaultdict
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split as split

### Reading

In [None]:
# German spaCy language object; lemmatize() calls it to tokenize each text.
# NOTE(review): German() is instantiated directly, without loading a trained
# model — confirm that token.lemma_ is populated in the installed spaCy version.
de_nlp = German()

In [None]:
# Start with two independent empty containers: the raw records and the
# (later) preprocessed training records.
all_laws, prep_tr_laws = [], []

In [None]:
# Load the raw records: one entry per line of the input file, with leading
# and trailing whitespace (including the newline) removed.
with open("-f_new.txt", encoding='utf-8') as law_file:
    all_laws = [raw_line.strip() for raw_line in law_file.readlines()]

In [None]:
# The second space-separated field of every record is treated as its web address.
web_addresses = [line.split(' ')[1] for line in all_laws]

In [None]:
# Take the fifth "/"-separated segment of each address; it is used as the
# record's category name further down.
words = [address.split('/')[4] for address in web_addresses]

In [None]:
# Distinct category names found across all records.
category = set(words)

In [None]:
# Show the distinct category names held in `category`.
print(category)

In [None]:
# Sanity check: parse the record at index 1 as JSON and print its "text" entry.
lawJson = json.loads(all_laws[1])
print(lawJson["text"])

In [None]:
# Shuffle the records with a fixed seed (random_state=7) and print one of them.
# NOTE(review): shuffled_data is not read again in the visible part of this file.
shuffled_data = shuffle(all_laws, random_state=7)
print(shuffled_data[1])

### Lemmatization

In [None]:
# Extra tokens (punctuation, brackets, a bare space) that lemmatize() drops
# in addition to the tokens spaCy flags as stop words.
other_stopwords = [
    ']',
    '[',
    '{',
    '}',
    '.',
    ',',
    ':',
    '#',
    '-',
    '"',
    '!',
    '?',
    '*',
    '&',
    '@',
    '˝',
    ')',
    '(',
    ';',
    '´',
    ' ',
    '/',
]

In [None]:
def listToString(s):
    """Return the items of ``s`` joined into one string, separated by single spaces.

    ``s`` may be any iterable of strings. Iterating a plain string yields its
    characters, and iterating a dict yields its keys, so those inputs are
    joined character-by-character or key-by-key respectively.
    """
    return " ".join(s)

In [None]:
def lemmatize(sentence):
    """Return the lemmas of the tokens in ``sentence`` that survive filtering.

    The text is processed with the module-level ``de_nlp`` pipeline. A token
    is dropped when spaCy flags it as a stop word or when its text appears in
    ``other_stopwords``; for every remaining token its ``lemma_`` is collected,
    in document order.
    """
    doc = de_nlp(sentence)
    return [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.text in other_stopwords)
    ]

In [None]:
# Build one row per record:
# [Category, Kurztitel, Kundmachungsorgan, Beachte, All_data].
# Each metadata column holds a list of lemmas, or the string "Missing" when
# the field (or the whole "text" entry) is absent.
all_data_law = []
# tqdm wraps the list itself (not enumerate(...)) so it can read len() and
# show a total; the loop index was never used.
for var in tqdm(all_laws, desc='Token and Lemmatization'):
    # Skip oversized records (more than 1,000,000 characters).
    if len(var) > 1000000:
        continue
    temp = json.loads(var)
    # The category is the fifth "/"-separated segment of the "file" value.
    final = [temp["file"].split("/")[4]]
    if "text" in temp:
        text = temp["text"]
        for field in ("Kurztitel", "Kundmachungsorgan", "Beachte"):
            if field in text:
                final.append(lemmatize(listToString(text[field])))
            else:
                final.append("Missing")
        # NOTE(review): `text` is indexed by string keys above, so it looks
        # like a mapping; joining a mapping joins its KEYS, not its values —
        # confirm that this is what All_data is meant to contain.
        final.append(lemmatize(listToString(text)))
    else:
        # No structured "text" entry: mark the three metadata columns as
        # missing and lemmatize the raw record instead.
        final.extend(["Missing"] * 3)
        final.append(lemmatize(var))
    all_data_law.append(final)
df = pd.DataFrame(all_data_law,columns=['Category','Kurztitel','Kundmachungsorgan','Beachte','All_data'])

In [None]:
# Display the assembled DataFrame with its five columns in a fixed order.
df[["Category","Kurztitel","Kundmachungsorgan","Beachte","All_data"]]

In [None]:
# Print how many rows of category_df belong to each category.
# NOTE(review): category_df is only assigned in a later cell of this file, so
# this cell works only after that cell has been executed.
for element in category:
    row_count = len(category_df[category_df.Category == element])
    print(f"{element} :{row_count}")

In [None]:
def word_freq(s):
    """Count case-insensitive occurrences of the words in ``s``.

    Every word is lower-cased before counting. The result is a
    ``defaultdict(int)`` mapping each lower-cased word to its count, so
    looking up an unseen word yields 0.
    """
    counts = defaultdict(int)
    for lowered in (token.lower() for token in s):
        counts[lowered] = counts[lowered] + 1
    return counts

In [None]:
def iterOnDatas(df):
    """Flatten the ``All_data`` column of ``df`` into a single list.

    The rows are visited in order and every element of each row's
    ``All_data`` value is appended to the result. A string value is iterated
    character by character.
    """
    flattened = []
    for _, record in df.iterrows():
        flattened.extend(record["All_data"])
    return flattened

In [None]:
# Count every lemma in df and list the (word, count) pairs from most to least
# frequent. The ascending sort is reversed afterwards, which keeps the
# tie order of the original two-step approach.
wordsfreq = word_freq(iterOnDatas(df))
sorted_words = sorted(wordsfreq.items(), key=lambda pair: pair[1])[::-1]
sorted_words

In [None]:
# Drop the rows of the "Erlaesse" and "LgblNO" categories, then display the rest.
new_df = df[~((df.Category == "Erlaesse") | (df.Category == "LgblNO"))]
new_df[["Category","Kurztitel","Kundmachungsorgan","Beachte","All_data"]]

In [None]:
# One row per raw record: its category, the raw text, the character count
# and the number of space-separated pieces.
data_list = []
for i, var in tqdm(enumerate(all_laws), 'Token and Lemmatization'):
    # Records longer than 1,000,000 characters are left out.
    if len(var) <= 1000000:
        data_list.append([
            var.split('/')[4],    # fifth "/"-separated segment = category
            var,
            len(var),
            len(var.split(" ")),
        ])
category_df = pd.DataFrame(data_list,columns=['Category','All_data','Length','Words'])

### Filter laws

In [None]:
# Rebuild category_df from the raw records, one row per record.
# The Length and Words columns are included because the statistics and the
# word-count filter further down read category_df.Length / category_df.Words;
# without them a top-to-bottom run of this file fails with AttributeError.
data_list = []
# tqdm wraps the list itself so it can show a total; the index was unused.
for var in tqdm(all_laws, desc='Token and Lemmatization'):
    # Skip oversized records (more than 1,000,000 characters).
    if len(var) > 1000000:
        continue
    data_list.append([
        var.split('/')[4],      # fifth "/"-separated segment = category
        var,                    # raw record text
        len(var),               # length in characters
        len(var.split(" ")),    # number of space-separated pieces
    ])
category_df = pd.DataFrame(data_list,columns=['Category','All_data','Length','Words'])

In [None]:
# Keep every row whose category is neither "Erlaesse" nor "LgblNO".
keep_mask = (category_df.Category != "Erlaesse") & (category_df.Category != "LgblNO")
category_df = category_df[keep_mask]

In [None]:
# Summary statistics of record size; requires the Length and Words columns
# on category_df.
print("Max length:", category_df.Length.max())
print("Most words:", category_df.Words.max())
print("Min length:", category_df.Length.min())
print("Least words:", category_df.Words.min())
print("Length mean:", category_df.Length.mean())
print("Words mean:", category_df.Words.mean())

In [None]:
# Display the rows whose Words value is exactly 22.
category_df[category_df.Words == 22]

In [None]:
# Display the raw text of the row labelled 123895.
# NOTE(review): the label is hard-coded; .loc raises KeyError if that row is
# not present in category_df (e.g. for a different input file).
category_df.loc[123895]["All_data"]

In [None]:
# Keep only the rows whose Words value is greater than 21.
new_category_df = category_df[(category_df.Words > 21)]

In [None]:
# Same summary statistics as before, now for the word-count-filtered frame.
print("Max length:", new_category_df.Length.max())
print("Most words:", new_category_df.Words.max())
print("Min length:", new_category_df.Length.min())
print("Least words:", new_category_df.Words.min())
print("Length mean:", new_category_df.Length.mean())
print("Words mean:", new_category_df.Words.mean())

In [None]:
# Pull the labels and the raw texts out of the frame as two parallel lists,
# in row order.
all_category = list(new_category_df["Category"])
all_data = list(new_category_df["All_data"])

In [None]:
# Hold out 20% of the records as the test set; stratify on the category so
# both parts keep the same label proportions. The seed is fixed (20).
tr_data,tst_data,tr_labels,tst_labels = split(all_data,all_category,test_size=0.2,random_state=20,stratify=all_category)

In [None]:
# Split a further 20% off the remaining training data as the dev set
# (stratified, same seed); tr_data / tr_labels are overwritten with the
# reduced training portion.
tr_data,dev_data,tr_labels,dev_labels = split(tr_data,tr_labels,test_size=0.2,random_state=20,stratify=tr_labels)

In [None]:
# Write each split to its own UTF-8 file, one object per line:
#   {"ID": <position in the split>, "Data": <record>, "Label": "<category>"}
# The record text is inserted verbatim (not quoted or escaped), exactly as
# before, so each output line is valid JSON only if the record itself is.
# `with` guarantees that every file is closed even if a write raises.
for path, rows, labels in (
    ("tr_data_with_category.txt", tr_data, tr_labels),
    ("tst_data_with_category.txt", tst_data, tst_labels),
    ("dev_data_with_category.txt", dev_data, dev_labels),
):
    with open(path, "w", encoding="utf-8") as out_file:
        for i, (row, label) in enumerate(zip(rows, labels)):
            out_file.write('{"ID": ' + str(i) + ', "Data": ' + row + ', "Label": "' + label + '"}' + "\n")