In [None]:
import numpy as np 
import pandas as pd 
import re

import numpy as np 
import requests
from PIL import Image
from io import BytesIO 

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from collections import Counter
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(rc={'figure.figsize':(12,8)})

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# Importing the dataset

In [None]:
# Load the corpus CSV from the Kaggle input directory and preview the first rows.
corpus_csv = '/kaggle/input/nietzsches-bibliography/Nietzsche_works_corpus.csv'
df = pd.read_csv(corpus_csv)
df.head()

In [None]:
# Isolate the book wanted: Beyond Good and Evil
is_bge = df['book_title'] == 'Beyond Good and Evil'
# Take the first matching row by *position*. Indexing the filtered Series with
# `[0]` is a label lookup and raises KeyError unless the matching row happens
# to carry index label 0.
bge = df.loc[is_bge, 'text_clean'].iloc[0]
tokens = word_tokenize(bge)
# items to be removed
removed = {'project', 'gutenberg', 'ebook', 'it', 's', 'the', 'and'}
tokens = [ele for ele in tokens if ele not in removed]

# Word Frequency

In [None]:
# Count every token, then keep the counts ordered from most to least frequent.
freq = Counter(tokens)
sorted_freq = dict(freq.most_common())

# The 25 most frequent words and their counts, in matching order.
top_25_words = list(sorted_freq)[:25]
top_25_freq = [sorted_freq[word] for word in top_25_words]
sns.barplot(x=top_25_freq, y=top_25_words)

# Word Cloud

## 1 - Simple

In [None]:
def plot_cloud(wordcloud):
    """Show ``wordcloud`` on a new 12x8-inch figure with the axes turned off."""
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    

# WordCloud applies `stopwords` and `collocations` only when it tokenizes raw
# text itself; generate_from_frequencies() ignores both. Drop the stop words
# from the counts up front so they do not dominate the cloud.
cloud_freq = {word: count for word, count in sorted_freq.items()
              if word.lower() not in STOPWORDS}

wordcloud = WordCloud(width=3000, height=2000, random_state=1,
                      background_color='black', colormap='Set2')
wordcloud.generate_from_frequencies(cloud_freq)
plot_cloud(wordcloud)

## 2 - With a mask image of Nietzsche

In [None]:
def read_img_from_url(url, timeout=30):
    """Download an image and return its pixel data as a NumPy array.

    Parameters
    ----------
    url : str
        Address of the image to fetch.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout (default 30), so a
        stalled server cannot hang the notebook indefinitely.

    Returns
    -------
    numpy.ndarray
        The result of ``np.array`` applied to the decoded PIL image.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here on an HTTP error instead of handing an error page to PIL.
    response.raise_for_status()
    with Image.open(BytesIO(response.content)) as img:
        img_matrix = np.array(img)
    return img_matrix

def read_txt_from_url(url, *size, timeout=30):
    """Download text from ``url`` and render it as a word-cloud image array.

    Parameters
    ----------
    url : str
        Address of the text to fetch.
    *size : int
        Optional ``width, height`` of the cloud in pixels. A missing value
        falls back to WordCloud's own default (400 wide, 200 high) instead of
        raising ``IndexError``. Values beyond the first two are ignored.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout (default 30).

    Returns
    -------
    The array returned by ``WordCloud.to_array()``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    width = size[0] if len(size) > 0 else 400
    height = size[1] if len(size) > 1 else 200

    response = requests.get(url, timeout=timeout)
    # Do not build a word cloud out of an HTTP error page.
    response.raise_for_status()

    wc = WordCloud(background_color="white", max_words=100, max_font_size=100,
                   width=width, height=height, random_state=42)
    wc.generate(response.text)
    return wc.to_array()
    
img_url = "https://nearemmaus.files.wordpress.com/2014/01/nietzsche_by_vanjamrgan.jpg"
img_matrix = read_img_from_url(img_url)

stopwords = set(STOPWORDS)
stopwords.add("said")

# generate_from_frequencies() ignores WordCloud's `stopwords` argument, so the
# custom stop-word set is applied to the counts directly.
masked_freq = {word: count for word, count in sorted_freq.items()
               if word.lower() not in stopwords}

# width/height are omitted: when a mask is given, WordCloud uses the mask's
# shape as the canvas size.
wc = WordCloud(random_state=1, background_color='black', colormap='Reds',
               mask=img_matrix)

# generate word cloud
wc.generate_from_frequencies(masked_freq)

# show
# Open the figure *before* drawing; calling plt.figure() after imshow() would
# leave a second, empty figure in the output.
plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

# Stemming

In [None]:
# Reduce every token to its Porter stem and preview a 20-token slice.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))
stemmed_words[100:120]

# Lemmatizing

In [None]:
# Lemmatize every token with WordNet and preview the same 20-token slice.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
lemmatized_words[100:120]

# Tagging words in the text

In [None]:
# Part-of-speech tag the token list and preview the first 20 results.
# NOTE: `tokens` has already had the words in `removed` filtered out, so the
# tagger is not seeing the original, unmodified sentences.
words_tags = nltk.pos_tag(tokens)
words_tags[:20]

# Dispersion Plot

In [None]:
from nltk.draw.dispersion import dispersion_plot
from nltk.text import Text
# Wrap the token list in an NLTK Text and plot where each of the 25 most
# frequent words occurs across the book (case-insensitive matching).
text = Text(tokens)
dispersion_plot(text, top_25_words, ignore_case=True, title='Beyond Good and Evil top 25 words Plot')

# Frequency Distribution with NLTK

In [None]:
from nltk import FreqDist
# NOTE(review): this import binds the name `stopwords` to the NLTK corpus
# reader, replacing any earlier variable of that name; it is not used in this
# cell — confirm it is needed.
from nltk.corpus import stopwords
# Build an NLTK frequency distribution over the tokens and list the 20 most common.
frequency_distribution = FreqDist(tokens)
frequency_distribution.most_common(20)

In [None]:
# Cumulative frequency plot limited to 20 samples of the distribution.
frequency_distribution.plot(20, cumulative=True)