In [None]:
import logging
import os
import os.path as path
from gensim import corpora, models, similarities
from textblob_de import PatternParser, PatternParserLemmatizer, Word, TextBlobDE as TextBlob
from nltk.tag import StanfordNERTagger, StanfordPOSTagger

# Machine-specific location of the Java binaries. Presumably read by NLTK's
# Stanford wrappers when they launch the JVM -- TODO confirm against NLTK docs.
os.environ['JAVAHOME'] = 'C:/Development/Java/bin'
# Emit timestamp, level and message for every log record at INFO or above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Paths (relative to the working directory) of the German Stanford models
# and of the jar files that run them.
pos_model = path.join('Models', 'Stanford', 'german-hgc.tagger')
pos_jar = path.join('Dependencies', 'stanford-postagger.jar')
ner_model = path.join('Models', 'Stanford', 'german.hgc_175m_600.crf.ser.gz')
ner_jar = path.join('Dependencies', 'stanford-ner.jar')

# Module-level tagger instances used by StreamCorpus.__iter__; both are
# configured with the 'UTF-8' encoding.
pos = StanfordPOSTagger(pos_model, pos_jar, encoding = 'UTF-8')
ner = StanfordNERTagger(ner_model, ner_jar, encoding = 'UTF-8')

class StreamCorpus(object):
    """Stream a text file as bag-of-words vectors, one document per line.

    Each line is tokenised with TextBlob, tagged with the module-level
    ``ner`` and ``pos`` taggers, reduced to lower-cased keywords and
    converted with the module-level ``dictionary`` (which must exist by the
    time the corpus is iterated, not when it is constructed).
    """

    # NER tags that do NOT mark a token as an entity to be excluded.
    _NON_ENTITY_TAGS = frozenset(['O', 'I-MISC'])
    # POS tags of the tokens that are kept as keywords.
    _KEYWORD_POS_TAGS = frozenset(['NN', 'NE', 'VVPP'])

    def __init__(self, filename):
        """Remember the path of the corpus file; nothing is read yet."""
        self.filename = filename

    def __iter__(self):
        """Yield ``dictionary.doc2bow(keywords)`` for every line of the file."""
        # The context manager closes the file even if the consumer stops
        # iterating early or a tagger raises.
        with open(self.filename, encoding='UTF-8') as lines:
            for line in lines:
                # Tokenise the current line (the original referenced an
                # undefined name here instead of the loop variable).
                tokens = TextBlob(line).words
                if not tokens:
                    # Blank line: keep one (empty) document per line without
                    # invoking the external taggers on empty input.
                    yield dictionary.doc2bow([])
                    continue
                entity_tag = ner.tag(tokens)
                pos_tag = pos.tag(tokens)
                # Tokens recognised as named entities (original casing).
                entities = {word for word, tag in entity_tag
                            if tag not in self._NON_ENTITY_TAGS}
                keywords = [word.lower() for word, tag in pos_tag
                            if tag in self._KEYWORD_POS_TAGS and word not in entities]
                yield dictionary.doc2bow(keywords)

def load_stopwords(filename):
    """Return the lines of a UTF-8 text file as a set of strings.

    The content is split on the newline character only, so a file ending in
    a newline (or an empty file) contributes an empty-string entry.
    """
    with open(filename, mode='r', encoding='UTF-8') as handle:
        contents = handle.read()
    entries = contents.split('\n')
    return set(entries)

# Input files, relative to the working directory.
corpus_file = path.join('..', 'Crawler', 'Data', 'Corpus.txt')
stopwords_file = path.join('..', 'Crawler', 'Data', 'Stopwords.txt')

# Constructing StreamCorpus only stores the path; the file is read when the
# corpus is iterated by MmCorpus.serialize() at the bottom of this block.
corpus = StreamCorpus(corpus_file)
stoplist = load_stopwords(stopwords_file)

# Build the vocabulary from the lower-cased tokens of every corpus line.
# Dictionary consumes the generator inside its constructor, so the file can
# be closed as soon as the `with` block ends (the handle used to be leaked).
with open(corpus_file, encoding='UTF-8') as corpus_lines:
    dictionary = corpora.Dictionary(TextBlob(line.lower()).words for line in corpus_lines)

# Ids of stopwords that actually occur in the vocabulary.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
# Ids of tokens that appear in exactly one document.
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# Further prune by document frequency (see gensim's filter_extremes for the
# meaning of no_below and its other defaults), then close gaps in the ids.
dictionary.filter_extremes(no_below=10)
dictionary.compactify()

# Persist the vocabulary and the streamed bag-of-words corpus.
dictionary.save('corpus.dict')
corpora.MmCorpus.serialize('corpus.mm', corpus)