In [1]:
import json
import requests
import time
import spacy
import itertools
import nltk
import copy
import re
from firebase import firebase

from collections import defaultdict

In [2]:
# Client for the project's Firebase database (second argument is None).
# NOTE(review): this rebinds the imported `firebase` module name to the client
# instance, so re-running this cell without re-running the import cell would
# presumably fail with an AttributeError — confirm / consider a distinct name.
firebase = firebase.FirebaseApplication('https://mtverena.firebaseio.com/', None)
# spaCy pipeline loaded through the 'de' shortcut; used further down for POS tags.
nlp = spacy.load('de')
# NOTE(review): the Porter algorithm was designed for English — confirm it is
# the intended stemmer for the German terms processed in this notebook.
stemmer = nltk.PorterStemmer()
# OpenThesaurus search endpoint; the '{}' placeholder receives the query word.
url = 'https://www.openthesaurus.de/synonyme/search?q={}&format=application/json'

In [3]:
def clean_word(word):
    """Replace punctuation in ``word`` with spaces and normalise the spacing.

    Each of the characters ``' ( ) " ? . ! : ; -`` is replaced by a space,
    runs of spaces are collapsed to a single space, and surrounding whitespace
    is removed so that inputs such as ``"Haus (ugs.)"`` do not come back with
    a dangling space.

    Args:
        word: text to clean.

    Returns:
        The cleaned string.
    """
    # Raw string + character class: no backslash escapes are needed, which
    # avoids the invalid-escape-sequence warnings of a plain string literal.
    # The trailing '-' is literal because it is the last character in the class.
    without_punctuation = re.sub(r'''['()"?.!:;-]''', ' ', word)
    return re.sub(' +', ' ', without_punctuation).strip()

def stemm_word(word):
    """Return the stem of ``word`` as computed by the module-level ``stemmer``."""
    stem = stemmer.stem(word)
    return stem

In [4]:
def create_synonyms(synonyms_list):
    """Build every phrase obtainable by choosing one entry per position.

    Args:
        synonyms_list: list of lists of strings; the i-th inner list holds the
            alternatives for the i-th word of the phrase.

    Returns:
        List of the distinct space-joined combinations, in arbitrary order.
    """
    unique_phrases = {
        ' '.join(combination)
        for combination in itertools.product(*synonyms_list)
    }
    return list(unique_phrases)

In [5]:
def request_synonyms(orig_word, cleaned_stemmed_word, address, timeout=10):
    """
    Request synonyms for a word from OpenThesaurus.

    Args:
        orig_word: the word as it appears in the topic; always part of the result.
        cleaned_stemmed_word: the form of the word that is sent as the query.
        address: URL template with one ``{}`` placeholder for the query.
        timeout: seconds to wait for the server before giving up.

    Returns:
        De-duplicated list (arbitrary order) containing ``orig_word`` plus the
        cleaned ``term`` of every entry of every synset in the response. If the
        request or the handling of its response fails, the failure is printed
        and the terms collected up to that point are returned.
    """
    # Function-scope import keeps this cell runnable on its own.
    from urllib.parse import quote

    synonyms = {orig_word}

    try:
        # Percent-encode the query so characters such as '&', '#' or spaces
        # cannot change the structure of the request URL.
        query = quote(cleaned_stemmed_word, safe='')
        # Without a timeout a stalled connection would block forever.
        response = requests.get(address.format(query), timeout=timeout)
        response.raise_for_status()
        payload = response.json()

        # for every synset, extract the cleaned terms
        for synset in payload.get('synsets', []):
            for entry in synset.get('terms', []):
                synonyms.add(clean_word(entry['term']))

    except (requests.exceptions.RequestException,  # HTTP status, connection, timeout
            ValueError,                            # body is not valid JSON
            KeyError, TypeError, AttributeError    # JSON with an unexpected shape
            ) as err:
        print('error requesting synonyms for {!r}: {!r}'.format(cleaned_stemmed_word, err))

    return list(synonyms)

In [None]:
# Ad-hoc lookup for a single word. 'Kleinunternehmer' is one of the keys
# reported with "error at key" in the output of the Firebase push loop.
request_synonyms('Kleinunternehmer', 'Kleinunternehmer', url)

In [None]:
# Scratch cell: a key reported with "error at key" in the output of the
# Firebase push loop, kept here as a bare string expression.
'Bescheinigung der Gestattung zur Erbringung vorübergehender grenzüberschreitender Dienstleistungen nach § 9 Absatz 1 Nummer 2 Handwerksordnung (HwO) im Bereich des zulassungspflichtigen Handwerks'

In [None]:
# Scratch cell: a key reported with "error at key" in the output of the
# Firebase push loop, kept here as a bare string expression.
'Restschuldbefreiung'

In [6]:
# load json (exported from firebase)
# Read explicitly as UTF-8: the topic keys contain German umlauts, and without
# an explicit encoding open() falls back to the platform default.
with open('../data/objectives_topics.json', encoding='utf-8') as data_file:
    data_full = json.load(data_file)


In [7]:
# number of top-level entries in the loaded export
counter = len(data_full)
counter

450

In [8]:
# push synonyms directly to firebase

In [9]:
# Seconds to wait after every thesaurus request (presumably to go easy on the
# public API — confirm the required rate limit).
REQUEST_DELAY_SECONDS = 3
# A progress line is printed after every this many processed keys.
PROGRESS_EVERY = 20


def _collect_synonyms(key):
    """Return the synonym phrases for topic ``key``, or None if it has no tokens.

    For a multi-token topic every token tagged NOUN is replaced by its
    thesaurus results while all other tokens stay fixed, and the cross product
    of these alternatives forms the phrases. A single-token topic is looked up
    directly. Entries equal to ``key`` itself are dropped from the result.
    """
    cleaned_key = clean_word(key)

    # get pos tags of key
    topic_words = [(word.text, word.pos_) for word in nlp(cleaned_key)]
    if not topic_words:
        # Empty or punctuation-only key: indexing topic_words[0] would raise.
        return None

    if len(topic_words) > 1:
        # composite topic: vary the nouns, keep every other token as-is
        synonyms_list = []
        for word, pos in topic_words:
            if pos == 'NOUN':
                synonyms_list.append(request_synonyms(word, stemm_word(word), url))
                time.sleep(REQUEST_DELAY_SECONDS)
            else:
                # push word in list for cross product
                synonyms_list.append([word])

        # get cross product with noun variations
        synonyms = create_synonyms(synonyms_list)
    else:
        # single word: request synonyms directly
        term = topic_words[0][0]
        synonyms = request_synonyms(term, stemm_word(term), url)
        time.sleep(REQUEST_DELAY_SECONDS)

    # the topic itself is not stored as one of its own synonyms
    return [x for x in synonyms if x != key]


n = 0  # stays defined (as 0) even when data_full is empty
failed_keys = []  # keys that were skipped or whose push raised; kept for inspection/retry

# iterate over the topic keys of the json
for n, key in enumerate(data_full, start=1):

    synonyms = _collect_synonyms(key)

    if synonyms is None:
        failed_keys.append(key)
        print('---')
        print('skipped key without tokens:', repr(key))
        print('---')
    else:
        # push to firebase
        f_url = '/synonyms/' + key
        try:
            firebase.put(f_url, 'synonyms', synonyms)
        except Exception as err:
            # `Exception` instead of a bare except: Ctrl-C still interrupts the
            # run, and the cause of the failure is reported instead of hidden.
            failed_keys.append(key)
            print('---')
            print('error at key:', key)
            print(repr(err))
            print('---')

    if n % PROGRESS_EVERY == 0:
        print(n)

20
40
60
80
100
120
140
160
180
200
220
240
260
280
---
error at key: Restschuldbefreiung
---
300
320
340
360
380
400
---
error at key: Bescheinigung der Gestattung zur Erbringung vorübergehender grenzüberschreitender Dienstleistungen nach § 9 Absatz 1 Nummer 2 Handwerksordnung (HwO) im Bereich des zulassungspflichtigen Handwerks
---
---
error at key: Kleinunternehmer
---
420
440
