In [5]:
# Imports
from luigi.contrib.spark import PySparkTask
from luigi.parameter import IntParameter, DateSecondParameter
from luigi import LocalTarget, Task, WrapperTask
import datetime
import pandas as pd
import numpy as np
import re
from nltk.stem.cistem import Cistem
from sklearn.feature_extraction.text import CountVectorizer
%run Importer.ipynb


class Preprocessor(Task):
    """Luigi task that cleans the imported documents and lists frequent words per class.

    For every row of the Importer output the ``text`` column is lower-cased,
    price expressions are replaced by the token ``priceentity``, non-letter
    characters and isolated single letters are removed, and the remaining
    words are stemmed and filtered against a stop word list. The ``title``
    column gets a lighter clean-up (no stemming, no stop word removal).

    The cleaned rows are written to a CSV file. In addition, the vocabulary of
    a 50-feature binary ``CountVectorizer`` is printed separately for documents
    with ``Class == 1`` and for all other documents.
    """

    # Stop word list; read as a single-column CSV without header row.
    STOPWORD_FILE = '../stopwords_no_umlaute.txt'

    # ------------------------------------------------------------------
    # Helper methods for preprocessing
    # ------------------------------------------------------------------
    def toLowerCase(self, text):
        """Return ``text`` converted to lower case."""
        return text.lower()

    def priceTagger(self, text):
        """Replace price expressions in (lower-cased) ``text`` by ``' priceentity '``.

        Notation used in the rule descriptions below:
        ``x`` is a digit, ``a`` a letter, ``( )`` an optional blank and
        ``[..]`` a character that must NOT appear at that position.

        NOTE: every rule consumes one delimiter character before and one after
        the price, so a price at the very start or end of ``text`` is not
        tagged, and the delimiters themselves are replaced as well.
        """
        # Amount with decimal point/comma, second decimal digit 0 or 5,
        # followed by a currency marker (chf, sfr, fr or ".-"):
        #   [x a , .]xxx.xx( )chf[x a]
        text = re.sub(r'[^0-9a-z\.\,][0-9]{1,3}(\.|\,)[0-9](5|0) {0,1}(chf|sfr|fr|\.\-)[^0-9a-z]', ' priceentity ', text)
        # Integer amount followed by a currency marker:
        #   [x a , .]xxx( )chf[x a]
        text = re.sub(r'[^0-9a-z\.\,][0-9]{1,3} {0,1}(chf|sfr|fr|\.\-)[^0-9a-z]', ' priceentity ', text)
        # Currency marker in front of an amount with decimal point/comma:
        #   [x a , .]chf(.)( )xxx.xx[x a]
        text = re.sub(r'[^0-9a-z\.\,](chf|sfr|fr)\.{0,1} {0,1}[0-9]{1,3}(\.|\,)[0-9](5|0)[^0-9a-z]', ' priceentity ', text)
        # Disabled rule: bare amounts without currency marker. To avoid
        # matching times of day or dates it only accepted decimals above 59
        # (e.g. 12.60 or 1.65):
        #   [x a , .]xxx.xx[x a]
        #text = re.sub(r'[^0-9a-z\.\,][0-9]{1,3}(\.|\,)[6-9](0|5)[^0-9\.a-z]', ' priceentity ', text)
        return text

    def removeSpecialCharacters(self, text):
        """Replace every run of characters other than (accented) letters by one blank.

        Digits are removed as well. ``text`` is converted with ``str()`` first.
        """
        return re.sub(r'[^éàèÉÀÈäöüÄÖÜa-zA-Z]+', ' ', str(text))

    def removeSingleCharacters(self, text):
        """Replace a single ASCII letter surrounded by whitespace by one blank."""
        return re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    def removeMultiSpaces(self, text):
        """Collapse every run of whitespace in ``text`` into a single blank."""
        # The original passed flags=re.I, which has no effect on '\s+'.
        return re.sub(r'\s+', ' ', text)

    def stemText(self, text):
        """Split ``text`` on whitespace and return the list of Cistem stems."""
        stemmer = Cistem()
        return [stemmer.stem(word) for word in text.split()]

    def _stopWordSet(self):
        """Return the stop words as a set, reading the file only on first use."""
        cached = getattr(self, '_stopwords', None)
        if cached is None:
            stop = pd.read_csv(self.STOPWORD_FILE, header=None)
            stop.columns = ['word']
            # A set gives O(1) membership tests during filtering.
            cached = set(stop.word)
            self._stopwords = cached
        return cached

    def removeStopWords(self, words):
        """Return the entries of ``words`` that are not in the stop word list.

        The order of the remaining words is preserved.
        """
        stopwordSet = self._stopWordSet()
        return [w for w in words if w not in stopwordSet]

    def replaceUmlaut(self, text):
        """Replace the lower-case umlauts ä, ö, ü by a, o, u."""
        text = re.sub(r'ä', 'a', text)
        text = re.sub(r'ö', 'o', text)
        text = re.sub(r'ü', 'u', text)
        return text

    @staticmethod
    def _featureNames(vectorizer):
        """Return the fitted vocabulary of ``vectorizer`` as a plain list.

        ``get_feature_names`` was replaced by ``get_feature_names_out`` in
        newer scikit-learn releases; use whichever the installed version has.
        """
        if hasattr(vectorizer, 'get_feature_names_out'):
            return vectorizer.get_feature_names_out().tolist()
        return vectorizer.get_feature_names()

    # Timestamp used as prefix of the output file name. The default is
    # evaluated once, when the class body is executed.
    date = DateSecondParameter(default=datetime.datetime.now())

    def output(self):
        """Return the target of the CSV file written by ``run``."""
        # Imported here so the task does not depend on UTF8 having been put
        # into the namespace by another notebook.
        from luigi.format import UTF8

        prefix = self.date.strftime("%Y-%m-%dT%H%M%S")
        return LocalTarget("../data/%s_FrequentWordFinder_out.csv" % prefix, format=UTF8)

    def requires(self):
        """Declare the Importer task as the only upstream dependency."""
        return Importer()

    def run(self):
        """Preprocess the imported data, print frequent words and write the CSV."""
        df = pd.read_csv(self.input().path)

        # Collect the cleaned rows in a list and build the frame once;
        # growing a DataFrame row by row copies it on every insert.
        rows = []
        for _, document in df.iterrows():
            # Text preprocessing
            text = self.toLowerCase(str(document.text))
            text = self.priceTagger(text)
            text = self.removeSpecialCharacters(text)
            text = self.removeSingleCharacters(text)
            text = self.removeMultiSpaces(text)
            text = self.stemText(text)
            #text = text.split(' ')  # only needed if stemming is skipped
            text = self.removeStopWords(text)

            # Title preprocessing
            title = self.toLowerCase(str(document.title))
            title = self.replaceUmlaut(title)
            title = self.removeSpecialCharacters(title)
            title = self.removeSingleCharacters(title)
            title = self.removeMultiSpaces(title)

            rows.append([text, document.url, title, document.Class])

        output_df = pd.DataFrame(
            rows, columns=['text', 'url', 'title', 'Class'], index=df.index
        )

        # Bag of words
        # max_features: size of the vocabulary that is kept
        # binary=True: how often a word occurs inside one document is ignored
        pos_vectorizer = CountVectorizer(max_features=50, binary=True) #, min_df=5, max_df=0.7
        neg_vectorizer = CountVectorizer(max_features=50, binary=True) #, min_df=5, max_df=0.7

        # CountVectorizer expects strings, so join the token lists again and
        # split the documents into Class == 1 and everything else.
        joined = output_df['text'].apply(' '.join)
        is_pos = output_df['Class'] == 1
        pos_texts = joined[is_pos]
        neg_texts = joined[~is_pos]

        np.set_printoptions(threshold=np.inf) # print arrays without truncation
        pos_vectorizer.fit(pos_texts)
        print('---------POS--------')
        print(self._featureNames(pos_vectorizer))
        neg_vectorizer.fit(neg_texts)
        print('---------NEG--------')
        print(self._featureNames(neg_vectorizer))

        # Write the .csv file
        with self.output().open("w") as out:
            output_df.to_csv(out, encoding="utf-8")
    
 

# Instantiate the task and call run() directly instead of handing it to the
# luigi scheduler.
# NOTE(review): requires() is presumably not resolved this way, so the
# Importer output file must already exist — confirm.
pre = Preprocessor()
pre.run()


---------POS--------
['aktuell', 'all', 'angebo', 'back', 'bitt', 'bra', 'ch', 'dess', 'dienstag', 'donnerstag', 'fisch', 'fleisch', 'freitag', 'fri', 'frisch', 'gru', 'hausgemach', 'hom', 'impressum', 'kas', 'klei', 'knoblauch', 'kontak', 'menu', 'misch', 'mittagsmenu', 'montag', 'mus', 'offnungszei', 'pomm', 'poul', 'prei', 'priceentity', 'reis', 'reservatio', 'restaura', 'rich', 'rindfleisch', 'sala', 'samstag', 'sauc', 'schloss', 'servier', 'sonntag', 'supp', 'toma', 'vegetarisch', 'vorspei', 'zurich', 'zwiebel']
---------NEG--------
['aktuell', 'all', 'anfahr', 'anfrag', 'angebo', 'anlass', 'bankett', 'bitt', 'ch', 'datenschutz', 'de', 'ess', 'find', 'fr', 'freitag', 'freu', 'gas', 'ger', 'hom', 'hotel', 'impressum', 'job', 'konn', 'kontak', 'kuch', 'link', 'mail', 'menu', 'montag', 'nich', 'niess', 'off', 'offnungszei', 'perso', 'priceentity', 'reservatio', 'reservier', 'restaura', 'samstag', 'schloss', 'schweiz', 'sonntag', 'speisekar', 'stell', 'tag', 'team', 'telefo', 'wein', 'weinkar', 'werd']

In [84]:
import collections
# Most frequent stems of the positive (Class == 1) and negative documents,
# copied from the output of the Preprocessor run.
poswords = ['aktuell', 'all', 'angebo', 'back', 'bitt', 'bra', 'ch', 'dess', 'dienstag', 'donnerstag', 'fisch', 'fleisch', 'freitag', 'fri', 'frisch', 'gru', 'hausgemach', 'hom', 'impressum', 'kas', 'klei', 'knoblauch', 'kontak', 'menu', 'misch', 'mittagsmenu', 'montag', 'mus', 'offnungszei', 'pomm', 'poul', 'prei', 'priceentity', 'reis', 'reservatio', 'restaura', 'rich', 'rindfleisch', 'sala', 'samstag', 'sauc', 'schloss', 'servier', 'sonntag', 'supp', 'toma', 'vegetarisch', 'vorspei', 'zurich', 'zwiebel']
negwords = ['aktuell', 'all', 'anfahr', 'anfrag', 'angebo', 'anlass', 'bankett', 'bitt', 'ch', 'datenschutz', 'de', 'ess', 'find', 'fr', 'freitag', 'freu', 'gas', 'ger', 'hom', 'hotel', 'impressum', 'job', 'konn', 'kontak', 'kuch', 'link', 'mail', 'menu', 'montag', 'nich', 'niess', 'off', 'offnungszei', 'perso', 'priceentity', 'reservatio', 'reservier', 'restaura', 'samstag', 'schloss', 'schweiz', 'sonntag', 'speisekar', 'stell', 'tag', 'team', 'telefo', 'wein', 'weinkar', 'werd']

# Words that occur in both vocabularies do not separate the classes, so drop
# them from both lists. Using a set avoids the in-band 'remove' marker, which
# would also have deleted a genuine feature of that name.
shared = set(poswords) & set(negwords)
poswords = [word for word in poswords if word not in shared]
negwords = [word for word in negwords if word not in shared]

print(poswords)
print(negwords)

['back', 'bra', 'dess', 'dienstag', 'donnerstag', 'fisch', 'fleisch', 'fri', 'frisch', 'gru', 'hausgemach', 'kas', 'klei', 'knoblauch', 'misch', 'mittagsmenu', 'mus', 'pomm', 'poul', 'prei', 'reis', 'rich', 'rindfleisch', 'sala', 'sauc', 'servier', 'supp', 'toma', 'vegetarisch', 'vorspei', 'zurich', 'zwiebel']
['anfahr', 'anfrag', 'anlass', 'bankett', 'datenschutz', 'de', 'ess', 'find', 'fr', 'freu', 'gas', 'ger', 'hotel', 'job', 'konn', 'kuch', 'link', 'mail', 'nich', 'niess', 'off', 'perso', 'reservier', 'schweiz', 'speisekar', 'stell', 'tag', 'team', 'telefo', 'wein', 'weinkar', 'werd']
