... imports

In [1]:
import pandas as pd
import numpy as np
import os
import spacy
import warnings
warnings.filterwarnings('ignore')

from spacy.lang.de import German
from spacy.lang.en import English
from spacy import displacy

from spacy.lang.de.stop_words import STOP_WORDS
nlp = spacy.load('de_core_news_sm')

***
## 1. Text Preprocessing mit spaCy

Beim Einlesen, dem sog. "Parsen", wird ein vortrainiertes NLP-Modell auf die Daten angewandt.  
Dadurch werden die einzelnen Elemente des Textes automatisch bereits klassifiziert:  

In [14]:
# Three short German sample messages used by the spaCy preprocessing cells below.
e1 = (
    "Sehr geehrte Damen und Herren, "
    "wir haben im Juli für unseren Sohn Markus Huber ein Festgeldkonto eröffnet"
)
e2 = (
    "Hallo Team BMW Bank, "
    "bitte ändern Sie meine Adresse in Langemarckstraße 2, 86609 Donauwörth"
)
e3 = (
    "Wir haben den Wagen am 13.3.2020 an die Schmidt Stahl GmbH, "
    "Frau Maria Bock verkauft"
)


In [4]:
# Parse the first sample message and draw its dependency tree.
bspDisp = nlp(e1)
displacy.render(bspDisp, style="dep")

In [13]:
# Highlight the named entities of every sample message, one rendering per message.
# After the loop `doc` stays bound to the parse of the last message (e3);
# the token and entity cells further down read that variable.
for email in (e1, e2, e3):
    doc = nlp(email)
    displacy.render(doc, style="ent", page=True)


Natürlich liegen diese Informationen für jedes "Token" (hier jedes Wort bzw. Satzzeichen) vor:

In [7]:
# Tabulate the linguistic attributes spaCy assigned to every token of `doc`.
# NOTE(review): `doc` is not created in this cell. On a top-to-bottom run it is the
# parse of the last sample message (e3) left behind by the entity-rendering loop above.
token_columns = ["TOKEN", "LEMMA", "POS", "TAG", "DEP", "SHAPE", "ALPHA", "STOP"]

# Build all records first and create the frame once; growing a DataFrame row by row
# via `.loc` re-allocates on every step and leaves every column as object dtype.
token_records = [
    (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    for token in doc
]
Token = pd.DataFrame(token_records, columns=token_columns)

# Keep only the first occurrence of each surface form.
Token = Token.drop_duplicates(subset="TOKEN")
print(len(Token))
Token

18


Unnamed: 0,TOKEN,LEMMA,POS,TAG,DEP,SHAPE,ALPHA,STOP
0,Sehr,Sehr,ADV,ADV,mo,Xxxx,True,True
1,geehrte,geehrt,ADJ,ADJA,nk,xxxx,True,False
2,Damen,Dame,NOUN,NN,sb,Xxxxx,True,False
3,und,und,CCONJ,KON,cd,xxx,True,True
4,Herren,Herr,NOUN,NN,cj,Xxxxx,True,False
5,",",",",PUNCT,"$,",punct,",",False,False
6,wir,ich,PRON,PPER,sb,xxx,True,True
7,haben,haben,AUX,VAFIN,ROOT,xxxx,True,True
8,im,im,ADP,APPRART,mo,xx,True,True
9,Juli,Juli,NOUN,NN,nk,Xxxx,True,False


### Entity Detection

In [15]:
# Collect the named entities spaCy found in `doc`: surface text, character span and label.
# The records are built in one pass and turned into a frame once. The previous
# version used a module-level variable called `vars`, which shadowed the builtin
# `vars()` for the rest of the kernel session, and grew the frame row by row.
entity_records = [
    (ent.text, ent.start_char, ent.end_char, ent.label_)
    for ent in doc.ents
]
entities = pd.DataFrame(
    entity_records,
    columns=["text", "start_char", "end_char", "label"],
)

# Keep only the first occurrence of each entity text.
entities = entities.drop_duplicates(subset="text")
print(len(entities))
print(entities.label.unique())
print(entities.text.unique())

2
['ORG' 'PER']
['Schmidt Stahl GmbH' 'Maria Bock']


In [None]:
# Show the entity highlighting for the document currently bound to `doc`.
displacy.render(doc, style="ent", page=True)

***
### 1b. Text Preprocessing: Beschwerde einer Vodafone-Kundin (Facebook-Post)

In [8]:
# Load the complaint text, parse it and highlight its named entities.
# The context manager closes the file handle as soon as the text is read;
# the previous `open(...).read()` left it open until garbage collection.
# "vodafone.txt" is resolved relative to the working directory and a missing
# file still raises FileNotFoundError.
with open("vodafone.txt", encoding="utf-8") as post_file:
    txt = post_file.read()

doc = nlp(txt)
displacy.render(doc, style="ent", page=True)

FileNotFoundError: [Errno 2] No such file or directory: 'vodafone.txt'

***
## 2. Text Classification auf Basis von Yelp-, Amazon- & IMDb-Reviews

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

Daten laden

In [None]:
# Show full review texts instead of truncating them. `None` means "no limit";
# the former value -1 is deprecated since pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)

data = pd.read_csv("amazonYelpImdb.csv")

# Human-readable label derived from the numeric column: rows with Sentiment == 1
# are marked as positive, every other row keeps the default "negativ".
# (The positive label was previously misspelled as "posititv".)
data["Meaning"] = "negativ"
data.loc[data.Sentiment == 1, "Meaning"] = "positiv"
data.sample(10)

In [None]:
# Class balance: number of reviews per sentiment value.
data["Sentiment"].value_counts()

Preprocessing Function, die jeden einzelnen Review bearbeitet

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS
import string
punct = string.punctuation
stopwords = list(STOP_WORDS)

def text_data_cleaning(sentence):
    """Tokenize one review and return its normalized, filtered tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Every token
    is replaced by its lower-cased, stripped lemma; stop words, empty strings and
    tokens made up solely of punctuation characters are discarded.

    NOTE(review): in this notebook ``nlp`` is loaded from the German model
    ``de_core_news_sm`` while the stop-word list is the English one and the
    reviews are English. Confirm whether an English pipeline should be used here.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    list of str
        The remaining tokens in their original order.
    """
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        # spaCy 2.x emits the placeholder lemma "-PRON-" for pronouns; fall back
        # to the lower-cased surface form in that case.
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()
        else:
            temp = token.lower_
        tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
        if token in stopwords:
            continue
        # Character-wise check: `token in punct` would be a substring test against
        # the punctuation string and let runs such as "..." or "!!" through.
        # all() over an empty string is True, so empty tokens are dropped as well.
        if all(char in punct for char in token):
            continue
        cleaned_tokens.append(token)
    return cleaned_tokens

In [None]:
text_data_cleaning("Martin, Herbert & Susanne are going to enjoy this afternoon at BMW after their heavy lunch break")

#### Vectorization Feature Engineering (TF-IDF)

In [None]:
from sklearn.svm import LinearSVC
# Features: TF-IDF weights over the tokens produced by text_data_cleaning.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning)
# Fixed seed so the fitted model is reproducible across runs, matching the
# seeded train/test split below.
classifier = LinearSVC(random_state=42)

# Hold out 20 % of the reviews for evaluation.
X, y = data["Review"], data["Sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizer and classifier are chained so that raw text can be passed to fit/predict.
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=["Negativ", "Positiv"]))


In [None]:
# Score two unseen sentences with the fitted pipeline.
newText_1 = "Will not buy a Mercedes again"
newText_2 = "The new BMW is an absolutely wonderful experience"

new_texts = [newText_1, newText_2]
clf.predict(new_texts)

# Parkplatz

In [None]:
# Scratch cell: fetch the three labelled review files and stack them into one frame.
# Running it rebinds `data` to the freshly concatenated frame.
url = "https://raw.githubusercontent.com/strategiepilot/NLP-Tutorial-8---Sentiment-Classification-using-SpaCy-for-IMDB-and-Amazon-Review-Dataset/master/datasets/"
columns_name = ['Review', 'Sentiment']

# Each file is read as tab-separated text without a header row.
data_yelp = pd.read_csv(url + "yelp_labelled.txt", sep='\t', header=None, names=columns_name)
print(data_yelp.shape)

data_amazon = pd.read_csv(url + "amazon_cells_labelled.txt", sep='\t', header=None, names=columns_name)
print(data_amazon.shape)

data_imdb = pd.read_csv(url + "imdb_labelled.txt", sep='\t', header=None, names=columns_name)
print(data_imdb.shape)

# Row-wise concatenation with a fresh 0..n-1 index.
data = pd.concat([data_yelp, data_amazon, data_imdb], axis=0, ignore_index=True)
print(data.shape)

data.head()