## DATEV Thesaurus Stats

From the original thesaurus file `german_relat.json`, the pretty-printed and valid-JSON file `german_relat_pretty-20180605.json` was generated as follows:

```
# Pretty-Printing File
cat ./data/german_relat.json | jq . -M > ./output/german_relat/german_relat_pretty-20180605.json

# The file consisted of many separate JSON objects - wrapped them in an array to obtain one valid JSON document
sed -i.bak 's/}/}\,/g' german_relat_pretty-20180605.json
# `brew install gnu-sed` to install a sed that behaves like the one on Linux (the macOS sed behaves slightly differently)
gsed -i.bak '1i[' german_relat_pretty-20180605.json
echo "]" >> german_relat_pretty-20180605.json
# then MANUALLY: removed the last comma in the file
# `brew install moreutils` to install the sponge util
cat german_relat_pretty-20180605.json | jq . | sponge german_relat_pretty-20180605.json
```

In [None]:
import os
from os.path import dirname
import json

# Locate the project root three directory levels above the current working
# directory and load the pretty-printed thesaurus from its data folder.
# NOTE(review): the preparation commands at the top of this notebook write the
# file below ./output/german_relat/ - confirm it has been copied to data/.
projectRootDir = dirname(dirname(dirname(os.getcwd())))
thesaurusFilename = os.path.join(
    projectRootDir, "data", "german_relat_pretty-20180605.json"
)
# Decode explicitly as UTF-8: without it, open() falls back to the platform's
# default encoding, which can garble or reject non-ASCII (e.g. German) text.
with open(thesaurusFilename, encoding="utf-8") as f:
    thesaurusConcepts = json.load(f)
# Show the first entry as a quick sanity check of the record layout.
print(thesaurusConcepts[0])

In [None]:
# Report the total number of entries loaded from the thesaurus file.
print(f"Number of concepts: {len(thesaurusConcepts)}")

In [None]:
from collections import defaultdict
import pprint
# Tally how many concepts carry each value of the "Class" field.
conceptClasses = defaultdict(int)
for concept in thesaurusConcepts:
    conceptClasses[concept["Class"]] += 1

# Summarise the tally; the printer is kept in `pp` for reuse.
pp = pprint.PrettyPrinter(indent=4)
print("There are", len(conceptClasses), "concept classes:")
pp.pprint(dict(conceptClasses))

In [None]:
import pandas as pd
# Build a table of the thesaurus entries, one row per concept.
# `thesaurusConcepts` is a list of records, which the DataFrame constructor
# accepts directly (`from_dict` is documented for dict input).
dfThesarus = pd.DataFrame(thesaurusConcepts)
dfThesarus = dfThesarus.set_index("Concept")
# Restrict the analysis to entries whose class is "synonym".
dfThesarus = dfThesarus[dfThesarus["Class"] == "synonym"]
dfThesarus = dfThesarus.drop(columns=["ExpansionKeys", "Class"])
# Number of keys per concept. Mapping `len` over the single column avoids a
# row-wise `apply(axis=1)`, which is slower and does not return a Series when
# the filtered frame is empty.
dfThesarus["KeysCount"] = dfThesarus["Keys"].map(len)
dfThesarus.head()

In [None]:
from collections import Counter

# Distribution summary of the per-concept key counts.
print(dfThesarus["KeysCount"].describe())
print()
print("Number of keys:\t\t", dfThesarus["KeysCount"].sum())

# Flatten every concept's key list into one list. A plain comprehension is
# used instead of `DataFrame.apply` with a side-effecting lambda: older pandas
# versions may call the applied function twice on the first row, which would
# duplicate that row's keys and distort the counts below.
keysList = [key for keys in dfThesarus["Keys"] for key in keys]

# One Counter serves both statistics: its size is the number of distinct keys.
cnt = Counter(keysList)
print("Number of unique keys:\t", len(cnt))

# Keys occurring more than once overall.
# NOTE(review): a key repeated inside a single concept would also be counted
# here - presumably keys are unique within a concept; verify against the data.
multipleAppearKeys = [k for k, v in cnt.items() if v > 1]
print("Number of keys that appear in multiple concepts:", len(multipleAppearKeys))

In [None]:
# Display how many rows of dfThesarus share each KeysCount value.
dfThesarus["KeysCount"].value_counts()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Histogram of synset sizes (number of keys per synonym concept).
# The KeysCount Series is plotted directly. The former
# `DataFrame.plot.hist(by="KeysCount")` only worked because older pandas
# ignored `by`; since pandas 1.4 it groups by that column and no longer yields
# the single Axes this code relies on. A Series plot draws no legend, so no
# legend removal is needed.
# NOTE(review): the bins cover sizes 1-33 and the ticks 1-32; larger synsets
# would fall outside the bins - confirm against the data.
ax = dfThesarus["KeysCount"].plot.hist(
    edgecolor="black",
    figsize=(8, 6),
    align="left",
    bins=range(1, 35),
    rwidth=1,
    legend=False,
)

# Label every non-empty bar with its count, slightly above the bar top.
for patch in ax.patches:
    barHeight = patch.get_height()
    if barHeight == 0:
        continue
    ax.annotate(
        str(int(barHeight)),
        xy=(patch.get_x() + patch.get_width() / 2., barHeight),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )
ax.xaxis.set_ticks(np.arange(1, 33, 1))

plt.xlabel("Synset Size")
plt.tight_layout()
plt.savefig('thesaurus_synset_keycount_hist.png', dpi = 300)
plt.show()