In [None]:
!git clone https://github.com/sauarampfa1/nlp-ws19.git
!pip install pyLDAvis
!python -m spacy download de_core_news_sm
!python -m spacy download en
import os
import sys
os.chdir('nlp-ws19')
sys.path.insert(0, os.path.abspath('.'))

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np

import spacy

import itertools

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

from plot_cm import plot_confusion_matrix

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

import pyLDAvis.gensim
pyLDAvis.enable_notebook()

## Dataset

#### Read the "articles.csv" file into the variable "data" via pandas

#### Print the first 10 rows and have a look at it to get to know the dataset

#### Investigate how many instances of each class are present in the data. Is the dataset balanced? Unbalanced? Plot the distribution

#### Load the German spaCy language model

In [None]:
# Load the German spaCy pipeline "de_core_news_sm" (downloaded in the setup cell at the top).
nlp = spacy.load("de_core_news_sm")

#### Apply spacy pre-processing to the 'content' column and store the result in a variable called 'content'

In [None]:
# Stream every article text of the 'content' column through the spaCy pipeline.
# NOTE(review): per the spaCy docs Language.pipe yields Doc objects lazily, so `content`
# can presumably be iterated only once - wrap it in list(...) if it must be reused; confirm.
content = nlp.pipe(data['content'])

#### Read the stopwords file "stopwords.txt" into the variable 'stopwords'

In [None]:
# Read the stop-word list, one entry per line. readlines() keeps the trailing
# newline of every line; cleaning that up is a separate step of the notebook.
# The encoding is passed explicitly so that German umlauts decode the same way on
# every platform instead of depending on the locale default.
# NOTE(review): assumes stopwords.txt is UTF-8 encoded - confirm.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = f.readlines()

In [None]:
# Peek at the first ten entries exactly as they were read from the file.
stopwords[:10]

#### Remove surrounding whitespace and newlines from the stopwords

#### Set stopwords

In [None]:
def set_stop_word(sw, vocab=None):
    """Flag a stop word and its capitalised variant in a spaCy-style vocabulary.

    Surrounding whitespace (including the trailing newline left by
    ``readlines()``) is stripped first. Blank lines and comment lines
    starting with ``#`` or ``//`` are ignored.

    Args:
        sw: One raw line of the stop-word list.
        vocab: Mapping from a string to an object with a writable ``is_stop``
            attribute. Defaults to the vocabulary of the global ``nlp`` pipeline.

    Returns:
        None. The matching vocabulary entries are modified in place.
    """
    sw = sw.strip()
    # An empty entry would raise IndexError on sw[0] below; comments are not words.
    if not sw or sw.startswith("#") or sw.startswith("//"):
        return
    if vocab is None:
        vocab = nlp.vocab
    vocab[sw].is_stop = True
    # Also flag the variant with an upper-case first letter.
    vocab[sw[0].upper() + sw[1:]].is_stop = True
    
# Register every entry of the stop-word list with the spaCy vocabulary.
for stop_word in stopwords:
    set_stop_word(stop_word)

#### Remove stopwords from content

In [None]:
# The loop below takes a very long time to run: it drops stop words and replaces each remaining token by its lemma.
# Its result is already stored in data['content_preprocessed'] (used in the next cell), so the loop is left commented out.

#content_without_stopwords=[]
#for t in content:
#    content_without_stopwords.append("".join(list("".join(tok.lemma_ + tok.whitespace_) for tok in t if not tok.is_stop)))

In [None]:
# Use the already preprocessed column instead of re-running the slow stop-word/lemma loop.
content_without_stopwords = data['content_preprocessed']

#### Split the data into two sets: one for training the classifier and one for testing its performance, e.g. 80% for training and 20% for testing

#### Encode the texts with the TfidfVectorizer. Have a look at its hyperparameters. Transform your train and test data to vectors.

In [None]:
# Cap the vocabulary at 1000 terms; ignore terms that occur in fewer than
# 2 documents or in more than 80% of the documents.
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.8, max_features=1000)

# Fit on the training texts only; the test texts are encoded with that fitted vocabulary.
train_sparse = tfidf_vectorizer.fit_transform(train['content_preprocessed'])
test_sparse = tfidf_vectorizer.transform(test['content_preprocessed'])

# Convert both sparse matrices to dense arrays.
train_x, test_x = train_sparse.toarray(), test_sparse.toarray()

#### Encode the labels with the LabelEncoder

#### Random Forest Classifier (RandomForestClassifier)

In [None]:
# Train a small random forest (10 trees, fixed seed) on the TF-IDF features.
clf = RandomForestClassifier(n_estimators=10, random_state=2).fit(train_x, train_y)

# Predict the held-out set and tabulate true vs. predicted labels.
predictions = clf.predict(test_x)
cnf_matrix = confusion_matrix(y_true=test_y, y_pred=predictions)

In [None]:
# scikit-learn metrics take (y_true, y_pred); use the same argument order as the
# confusion_matrix call in the previous cell. Accuracy itself is symmetric, so the
# printed value does not change.
print(accuracy_score(test_y, predictions))

# Row-normalised confusion matrix, labelled with the original class names.
plot_confusion_matrix(cnf_matrix, classes=le.classes_, title='', normalize=True)

In [None]:
# Tokenise the first 1000 preprocessed articles, keeping alphabetic tokens only.
lda_texts = [
    [tok.text for tok in nlp(text) if tok.is_alpha]
    for text in data['content_preprocessed'][:1000]
]

In [None]:
# Map every token to an integer id and encode each document as a bag of words.
common_dictionary = Dictionary(lda_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in lda_texts]

# id2word lets the model report topics as words instead of integer ids;
# random_state makes the (stochastic) topic assignment reproducible across runs.
lda = LdaModel(common_corpus, num_topics=10, id2word=common_dictionary, random_state=2)

In [None]:
# Prepare the pyLDAvis view of the 10-topic model; the bare `vis` on the last line
# makes the notebook display it.
vis = pyLDAvis.gensim.prepare(lda, common_corpus, dictionary=common_dictionary)
vis

In [None]:
# Take the first 1000 rows by position, then keep only the 'politik' articles.
# Selecting with iloc and a boolean mask avoids mixing a positional counter with
# label-based lookup, which picks the wrong rows (or raises KeyError) whenever
# `data` does not have the default 0..n-1 index.
first_rows = data.iloc[:1000]
is_politics = first_rows['category'] == 'politik'

# Tokenise each selected article, keeping alphabetic tokens only.
politic_texts = [
    [tok.text for tok in nlp(text) if tok.is_alpha]
    for text in first_rows.loc[is_politics, 'content_preprocessed']
]

In [None]:
# Map every token of the politics articles to an integer id and encode each
# document as a bag of words.
politic_dictionary = Dictionary(politic_texts)
politic_corpus = [politic_dictionary.doc2bow(text) for text in politic_texts]

# id2word lets the model report topics as words instead of integer ids;
# random_state makes the (stochastic) topic assignment reproducible across runs.
politic_lda = LdaModel(politic_corpus, num_topics=5, id2word=politic_dictionary, random_state=2)

In [None]:
# Prepare the pyLDAvis view of the 5-topic politics model; the bare `politic_vis`
# on the last line makes the notebook display it.
politic_vis = pyLDAvis.gensim.prepare(politic_lda, politic_corpus, dictionary=politic_dictionary)
politic_vis