In [85]:
# Imports
from luigi.contrib.spark import PySparkTask
from luigi.parameter import IntParameter, DateSecondParameter
from luigi import LocalTarget, Task, WrapperTask
import datetime
import pandas as pd
import re
from nltk.stem.cistem import Cistem
%run Importer.ipynb

class Preprocessor(Task):
    """Luigi task that cleans, stems and stop-word-filters imported documents.

    The task reads the CSV produced by ``Importer``, runs every value of the
    ``text`` column through the preprocessing chain (special characters ->
    sentence punctuation -> single letters -> whitespace -> stemming ->
    stop words) and writes the frame, with ``text`` replaced by the
    space-joined remaining tokens, to a timestamped CSV below ``data/``.
    """

    # Timestamp used as prefix of the output file name. The default is
    # evaluated once, when this class body is executed, so every instance
    # created from the same class definition targets the same output file.
    date = DateSecondParameter(default=datetime.datetime.now())

    # Lazily created helpers; building them once per task instead of once
    # per document avoids re-reading the stop-word file for every row.
    _stemmer = None
    _stopwordSet = None

    def output(self):
        """Return the target of the preprocessed CSV file."""
        prefix = self.date.strftime("%Y-%m-%dT%H%M%S")
        # NOTE(review): UTF8 is not imported in this cell; presumably it is
        # brought into scope by `%run Importer.ipynb` - confirm.
        return LocalTarget("data/%s_Preprocessor_out.csv" % prefix, format=UTF8)

    def requires(self):
        """Declare the Importer task as the only upstream dependency."""
        return Importer()

    def run(self):
        """Preprocess every document and write the result as CSV.

        The ``text`` column of the input is overwritten with the
        preprocessed tokens joined by single spaces; all other columns are
        passed through unchanged.
        """
        df = pd.read_csv(self.input().path)

        processed = []
        for raw in df["text"]:
            text = self.removeSpecialCharacters(raw)
            text = self.removeSentenceCharacters(text)
            text = self.removeSingleCharacters(text)
            text = self.removeMultiSpaces(text)
            # NOTE(review): stop words are filtered after stemming, so the
            # stop-word list has to contain the stemmed (umlaut-free) forms.
            tokens = self.removeStopWords(self.stemText(text))
            print(tokens)
            processed.append(" ".join(tokens))

        # Store the result; previously the tokens were only printed and the
        # unmodified input frame was written to the output file.
        df["text"] = processed

        with self.output().open("w") as out:
            df.to_csv(out, encoding="utf-8")

    # Helper methods for the individual preprocessing steps

    def removeSpecialCharacters(self, text):
        """Replace every run of characters outside the allowed set by a space.

        Allowed are ASCII letters, digits, the dot and the accented letters
        and umlauts listed in the pattern. ``text`` is converted with
        ``str`` first, so non-string cell values (e.g. NaN) are accepted.
        """
        return re.sub(r'[^éàèÉÀÈäöüÄÖÜa-zA-Z0-9\.]+', ' ', str(text))

    def removeSentenceCharacters(self, text):
        """Replace each non-word character followed by whitespace by a space.

        After ``removeSpecialCharacters`` this drops dots that end a token
        while keeping dots inside tokens such as ``16.00``.
        """
        return re.sub(r'\W\s', ' ', text)

    def removeSingleCharacters(self, text):
        """Remove ASCII letters that stand alone between whitespace.

        A whole run of isolated letters is matched at once; matching them
        one by one would consume the whitespace the next letter needs as
        its left delimiter and leave every second letter in place.
        Letters at the very start or end of ``text`` are not removed.
        """
        return re.sub(r'(?:\s+[a-zA-Z])+\s+', ' ', text)

    def removeMultiSpaces(self, text):
        """Collapse every run of whitespace into a single space."""
        return re.sub(r'\s+', ' ', text)

    def stemText(self, text):
        """Split ``text`` on whitespace and return the Cistem stem of each word."""
        if self._stemmer is None:
            self._stemmer = Cistem()
        stemmer = self._stemmer
        return [stemmer.stem(word) for word in text.split()]

    def removeStopWords(self, words):
        """Return the lower-cased ``words`` that are not in the stop-word list.

        The list is read from ``stopwords_no_umlaute.txt`` (one word per
        row, no header) on first use and cached on the instance.
        """
        if self._stopwordSet is None:
            stop = pd.read_csv('stopwords_no_umlaute.txt', header=None)
            stop.columns = ['word']
            # a set gives constant-time membership tests
            self._stopwordSet = set(stop.word)
        stopwordSet = self._stopwordSet
        lowered = (w.lower() for w in words)
        return [w for w in lowered if w not in stopwordSet]

# Instantiate the task with its default parameters and execute it directly.
# NOTE(review): calling run() by hand bypasses the luigi scheduler, so the
# task returned by requires() is not executed here - presumably the Importer
# output must already exist on disk; confirm.
pre = Preprocessor()
pre.run()


['bestell', 'hotli', '41', '71', '228', '66', '77', 'navigatio', 'uberspring', 'hom', 'schnupp', 'job', 'kontakt', 'navigatio', 'uberspring', 'hom', 'aktuell', 'standort', 'gschw', 'st.gall', 'gschw', 'rotmont', 'gschw', 'winkel', 'gschw', 'abtwil', 'gschw', 'morschwil', 'prospekt', 'schenkkart', 'menu', 'unternehm', 'caf', 'restaurant', 'gartenterrass', 'backerei', 'confiserie', 'kuch', 'team', 'sortiment', 'apero', 'mehr..', 'tort', 'spezialitat', 'giv', 'away', 'saisonalartikel', 'sandwich', 'salat', 'partyservic', 'navigatio', 'uberspring', 'hom', 'aktuell', 'aktuell', 'detail', 'standort', 'gschw', 'st.gall', 'gschw', 'rotmont', 'gschw', 'winkel', 'gschw', 'abtwil', 'gschw', 'morschwil', 'prospekt', 'schenkkart', 'dankesseit', 'menu', 'fondu', 'chinoi', 'unternehm', 'caf', 'restaurant', 'gartenterrass', 'backerei', 'confiserie', 'kuch', 'team', 'schaftsleitung', 'administratio', 'produktio', 'nachtschicht', 'produktio', 'tagschicht', 'produktio', 'apero', 'traiteur', 'partyservic'

['hom', 'speisekart', 'weinkart', 'angebot', 'schicht', 'kontakt', 'entdeck', 'raumlichkeit', 'reichhaltig', 'angebot', 'ritterstub', 'niess', 'mutlich', 'abend', 'ritterstub', 'ambient', 'werd', 'nich', 'vergess', 'gaststub', 'mindest', 'mutlich', 'ritterstub', 'gaststub', 'gastfreundschaft', 'schrieb', 'kegelbah', 'reservier', 'kegelbah', 'freu', 'ideal', 'anlass', 'egal', 'firmenanlass', 'vereinsess', 'zaub', 'perfek', 'abend', 'unvergesslich', 'erlebni', 'stimmungsvoll', 'ritterstub', 'gasthof', 'kro', 'nossiko', 'schaftsfuhr', 'gokba', 'haci', 'burgstrass', '81', '8610', 'uster', '044', '940', '16', '49', 'kro', 'nossikon.ch', 'impressum', 'datenschutzrichtlinie', 'webdesig', 'by', 'schneid', 'servic', 'impressum', 'betreib', 'inhab', 'webseit', 'hdb', 'gastro', 'gasthof', 'kro', 'nossiko', 'haci', 'gokba', 'lindauerstrass', '4', '8309', 'nurensdorf', 'webmast', 'schneid', 'servic', 'werbeschulstrass', '9', '8620', 'wetziko', 'datenschutzerklarung', '01.01.2019', 'verantwortungsvo

['tagesmenu', '08.03.2019', 'menu', 'offerier', 'tagessalat', 'tomatensupp', 'menu', '1', 'ravioli', 'di', 'manzo', 'teigtasch', 'full', 'rindfleisch', 'ricotta', 'spargel', 'pesto', 'rahmsauc', '16.00', 'menu', '2', 'vegi', 'risotto', 'alla', 'verdur', 'parmesa', 'risotto', 'saiso', 'mus', 'buffalo', 'mozzarella', '16.00', 'menu', '3', 'pizza', 'mexico', 'tomat', 'mozzarella', 'schweinegeschnetzelt', 'mai', 'peperoni', 'scharf', '17.50', 'menu', '4', 'portafoglio', 'di', 'maial', 'schwei', 'cordo', 'blu', 'krokett', 'mus', '20.50', 'hit', '1', 'fisch', 'filetto', 'di', 'nasello', 'seehechtfilet', 'safransauc', 'bratkartoffel', 'mus', '19.50', 'business', 'lunch', 'bistecca', 'di', 'manzo', 'rindsentrecot', 'pfeffersauc', 'krokett', 'mus', '27.50', 'dessert', 'hausgemacht', 'tiramisu', '5.50', 'wunsch', 'appetit', 'buo', 'appetito', 'commercio', 'team', 'milch', 'produkt', 'salami', 'parma', 'schink', 'caporaso', 'frisch', 'eier', 'besorg', 'familie', 'beyel', '5108', 'oberflach', 'par

['fago', 'restaurant', 'café', 'bar', 'restaurant', 'kuch', 'tagesmenu', 'business', 'lunch', 'à', 'la', 'cart', 'aktuell', 'faschingstreib', 'russisch', 'abend', 'fischwoch', 'fago', 'bar', 'kontakt', 'offnungszeit', 'fago', 'nuss', 'motivatio', 'fago', 'verschmelz', 'gaumenfreud', 'augenschmau', 'behaglich', 'ambient', 'inhaberi', 'alexandra', 'goop', 'team', 'verwoh', 'raffinier', 'zuberei', 'saisonal', 'spei', 'erl', 'wein', 'somm', 'lad', 'stilvoll', 'gastgart', 'verweil', 'niess', 'moch', 'lass', 'besuch', 'hauseig', 'cocktail', 'erl', 'zigarr', 'bar', 'auskling', 'lokal', 'herz', 'esche', 'biet', 'perfek', 'rahm', 'priva', 'feierlichkeit', 'firmenanlass', 'montag', 'freitag', 'off', 'samstag', 'sonntag', 'verwoh', 'sellschaft', 'voranmeldung', 'zusatzlich', 'off', 'sonntag', 'find', 'aktuell', 'tag', 'menu', 'inklusiv', 'tagessupp', 'tagessalat', 'spaghetti', 'alla', 'vongol', '25.00', 'flammkuch', 'hawaii', '24.00', 'bauernwurst', 'frisch', 'meerrettich', 'salzkartoffel', 'saue