... imports

In [None]:
import pandas as pd
import numpy as np
import os
import spacy
from spacy.lang.de import German
from spacy.lang.en import English
from spacy import displacy

from spacy.lang.de.stop_words import STOP_WORDS
nlp = spacy.load('de_core_news_sm')

***
## Text Preprocessing mit Spacy

### Online-Artikel aus auto.motorsport einlesen und tokenisieren

In [None]:
# Read the article text; the context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("Vierer.txt", encoding="utf-8") as article_file:
    txt = article_file.read()

# Run the spaCy pipeline bound to `nlp` over the whole article.
doc = nlp(txt)

# Print every detected sentence together with its running index.
for i, sent in enumerate(doc.sents):
    print(i, sent)

Beim Einlesen, dem sog. "Parsen", wird ein vortrainiertes NLP-Modell auf die Daten angewandt.  
Dadurch werden die einzelnen Elemente des Textes automatisch bereits klassifiziert:  

In [None]:
# Parse a short example sentence and draw its dependency tree inline.
bspDisp = nlp("Die große vertikale Niere ist zurück.")
displacy.render(bspDisp, style="dep")

Natürlich liegen diese Informationen für jedes "Token" (hier jedes Wort) vor:

In [None]:
# Collect one record per token with the attributes spaCy attached to it.
# Building the frame once from a list of records avoids the quadratic cost of
# growing it row by row, and lets pandas infer proper dtypes (an empty frame
# created from empty lists starts out with float columns).
token_columns = ["TOKEN", "LEMMA", "POS", "TAG", "DEP", "SHAPE", "ALPHA", "STOP"]
token_records = [
    (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    for token in doc
]

# Keep only the first occurrence of each distinct token text.
bmwToken = pd.DataFrame(token_records, columns=token_columns).drop_duplicates(subset="TOKEN")
print(len(bmwToken))

# Cap the sample size so that short documents do not raise a ValueError.
bmwToken.sample(min(20, len(bmwToken)))

In [None]:
# Seeded draw so the displayed rows are the same on every run.
bmwToken.sample(n=20, random_state=111)

### Stoppwörter rausfiltern

Stoppwortliste erstellen

In [None]:
# STOP_WORDS is turned into a list here; sorting makes the order (and thus the
# printed preview) identical across kernel restarts, which plain list(...) of a
# set of strings does not guarantee.
stopwordsDE = sorted(STOP_WORDS)
print("Anzahl: ", len(stopwordsDE))
print(stopwordsDE[:50])

In [None]:
# Parse a demo sentence and list only the tokens flagged as stop words.
bsp = nlp("Dieser Satz strotzt nur so vor lauter Stoppwörtern und es macht nicht nur Spaß sie alle zu finden ")
for stop_tok in (tok for tok in bsp if tok.is_stop):
    print(stop_tok.text, stop_tok.pos_, stop_tok.is_stop)

In [None]:
# Keep every token of the article that is not flagged as a stop word.
NoStopWordsDoc = [tok for tok in doc if not tok.is_stop]

# Preview the first 100 remaining tokens.
print(NoStopWordsDoc[:100])

### Lemmatisieren

In [None]:
# Show the lemma the pipeline assigns to several inflected forms of one verb.
bspLemma = nlp('sehen gesehen sah sieht saht sahen seht siehste')
for word in bspLemma:
    print(word.text, "LEMMA ==> ", word.lemma_)

### Entity Detection

In [None]:
# One record per named entity found in the article: surface text, character
# offsets and entity label. Building the frame in one call avoids row-by-row
# growth and no longer shadows the builtin `vars`.
entity_columns = ["text", "start_char", "end_char", "label"]
entity_records = [
    (ent.text, ent.start_char, ent.end_char, ent.label_)
    for ent in doc.ents
]

# Keep only the first occurrence of each distinct entity text.
entities = pd.DataFrame(entity_records, columns=entity_columns).drop_duplicates(subset="text")
print(len(entities))
print(entities["label"].unique())
print(entities["text"].unique())

In [None]:
# Highlight the named entities of the whole article as a full HTML page.
displacy.render(doc, page=True, style="ent")

***
## Text Classification auf Basis von YELP, amazon & IMDB Reviews

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

Daten laden

In [None]:
# Show complete cell contents instead of truncating long review texts.
# None is the supported "no limit" value; the former -1 sentinel has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

# Load the combined review data set and preview ten random rows.
data = pd.read_csv("amazonYelpImdb.csv")
data.sample(10)

In [None]:
# Class balance of the sentiment labels.
data["Sentiment"].value_counts()

Preprocessing-Funktion, die jeden einzelnen Review bearbeitet

In [None]:
# Import under an alias so the German STOP_WORDS imported at the top of the
# notebook is not silently replaced when this cell runs.
from spacy.lang.en.stop_words import STOP_WORDS as STOP_WORDS_EN
import string

punct = string.punctuation
stopwords = list(STOP_WORDS_EN)

# Set versions for constant-time membership tests inside the tokenizer.
_PUNCT_CHARS = frozenset(punct)
_STOPWORD_SET = frozenset(stopwords)


def text_data_cleaning(sentence):
    """Tokenize one review and return its cleaned, lower-cased lemmas.

    The text is run through the notebook-level ``nlp`` pipeline. Each token is
    replaced by its lower-cased, stripped lemma, except when the lemma is the
    "-PRON-" placeholder, in which case the lower-cased token text is kept.
    Empty strings, English stop words and tokens made up solely of
    punctuation characters are dropped.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    list of str
        The remaining normalized tokens, in document order.
    """
    # NOTE(review): `nlp` is the pipeline loaded earlier in the notebook; the
    # visible load is 'de_core_news_sm' while the reviews are English —
    # confirm whether an English pipeline is intended here.
    parsed = nlp(sentence)

    # "-PRON-" is the placeholder lemma spaCy v2 uses for pronouns; fall back
    # to the surface form in that case.
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in parsed
    ]

    # The previous `token not in punct` was a substring test against the
    # punctuation string; test the characters explicitly instead.
    return [
        lemma
        for lemma in lemmas
        if lemma
        and lemma not in _STOPWORD_SET
        and not all(ch in _PUNCT_CHARS for ch in lemma)
    ]

In [None]:
# Try the cleaning function on a single example sentence.
sample_review = "Martin, Herbert & Susanne are going to enjoy this afternoon at BMW after their heavy lunch break"
text_data_cleaning(sample_review)

#### Vectorization Feature Engineering (TF-IDF)

In [None]:
from sklearn.svm import LinearSVC

# Features are the raw review strings, targets the sentiment labels.
X = data["Review"]
y = data["Sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features built with the custom tokenizer feed a linear SVM.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
classifier = LinearSVC()
clf = Pipeline(steps=[('tfidf', tfidf), ('clf', classifier)])
clf.fit(X_train, y_train)

# Evaluate on the held-out 20 % of the reviews.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, target_names=["Negativ", "Positiv"])
print(report)


In [None]:
# Classify two unseen example sentences with the fitted pipeline.
newText_1 = "Will not buy a Mercedes again"
newText_2 = "The new BMW is an absolutely wonderful experience"
new_texts = [newText_1, newText_2]
clf.predict(new_texts)

# Parkplatz

In [None]:
# Scratch cell: rebuild the combined review frame from the three raw,
# tab-separated source files hosted on GitHub.
url = "https://raw.githubusercontent.com/strategiepilot/NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset/master/datasets/"
columns_name = ['Review', 'Sentiment']

# Load each source in turn and report its shape right after reading it.
loaded_frames = []
for file_name in ("yelp_labelled.txt", "amazon_cells_labelled.txt", "imdb_labelled.txt"):
    frame = pd.read_csv(url + file_name, sep='\t', header=None, names=columns_name)
    print(frame.shape)
    loaded_frames.append(frame)

data_yelp, data_amazon, data_imdb = loaded_frames

# Stack the three sources into one frame with a fresh running index.
data = pd.concat(loaded_frames, axis=0, ignore_index=True)
print(data.shape)
data.head()