In [None]:
import os

# Enumerate every file under the Kaggle input directory and print its full path.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import re
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Phrases, phrases, ldamodel, CoherenceModel
import nltk
from nltk.corpus import stopwords
import spacy
import gensim.corpora as corpora
from pprint import pprint
import pyLDAvis.gensim 
import matplotlib.pyplot as plt
import seaborn as sns

In this notebook we apply topic modelling to build three topics, because the dataset consists of excerpts from the horror stories of three authors: Edgar Allan Poe (EAP), Mary Shelley (MWS) and HP Lovecraft (HPL). The dataset contains text from works of fiction written by these spooky authors of the public domain. Poe's work centres on tales of mystery and the macabre, Mary Shelley's work on science fiction, and HP Lovecraft is best known as a writer of weird fiction.

In [None]:
# Read the training split straight from the zipped CSV and preview the first rows.
train_path = '/kaggle/input/spooky-author-identification/train.zip'
authors_data_df = pd.read_csv(train_path)
authors_data_df.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
authors_data_df.shape

In [None]:
# Count the missing values in each column.
authors_data_df.isnull().sum()

No column contains null values.

In [None]:
# For topic modelling only the text matters, so the identifier column is dropped.
# errors='ignore' keeps this cell re-runnable: on a second run 'id' is already gone
# and a plain drop would raise KeyError. (`axis=1` was redundant next to `columns=`.)
authors_data_df = authors_data_df.drop(columns=['id'], errors='ignore')
authors_data_df.head()

## Inspecting data

In [None]:
# Number of rows (excerpts) per author label.
authors_data_df['author'].value_counts()

#### Distribution of excerpts by author: EAP has the most excerpts, followed by MWS and then HPL

## Data cleaning and preparation

In [None]:
# Remove commas, periods, exclamation marks and question marks from the raw text.
# The pattern is a raw string: '\.' inside a plain string literal is an invalid escape
# sequence (and '.' needs no escaping inside a character class anyway).
authors_data_df['text_processed'] = authors_data_df['text'].map(lambda x: re.sub(r'[,.!?]', '', x))

In [None]:
# Lower-case every excerpt and preview the result.
lowered_text = authors_data_df['text_processed'].map(str.lower)
authors_data_df['text_processed'] = lowered_text
print(authors_data_df['text_processed'].head())

In [None]:
# Replace everything that is not a letter or a digit with a space.
def cleanText(input_string):
    """Return ``input_string`` with each run of characters outside A-Z, a-z and 0-9
    replaced by a single space."""
    non_alnum_run = re.compile('[^A-Za-z0-9]+')
    return non_alnum_run.sub(' ', input_string)
# Clean every excerpt, then spot-check the row labelled 150.
authors_data_df['text_processed'] = authors_data_df['text_processed'].map(cleanText)
authors_data_df.loc[150, 'text_processed']

In [None]:
# Download the NLTK stop-word corpus, which is read via stopwords.words('english') in the next cell.
nltk.download('stopwords')

In [None]:
# Extra high-frequency words that are removed in addition to NLTK's English stop words.
custom_stop_words = [
    "make", "mr", "de", "without", "let", "rather", "upon", "within",
    "made", "must", "much", "yet", "thought", "see",
    "said", "us", "say", "whose", "though", "every", "know",
    "many", "will", "never", "even", "found", "might", "almost",
    "although", "indeed", "thus", "still",
    "this", "me", "of", "may", "would", "ever", "could", "shall",
    "come", "go", "soon", "however", "become",
    "give", "take", "well",
]
# Full stop-word list: NLTK's English list followed by the custom additions.
stopWords = stopwords.words('english') + custom_stop_words
def removeStopWords(stopWords, rvw_txt):
    """Return ``rvw_txt`` with every stop word removed.

    Parameters
    ----------
    stopWords : iterable of str
        Words to drop. Any iterable is accepted; a ``set``/``frozenset`` is used as-is.
    rvw_txt : str
        Whitespace-separated text.

    Returns
    -------
    str
        The remaining words, joined by single spaces.
    """
    # Hash-based lookup: testing membership against a list scans the whole list
    # for every single word of the text.
    blocked = stopWords if isinstance(stopWords, (set, frozenset)) else set(stopWords)
    return ' '.join(word for word in rvw_txt.split() if word not in blocked)
# Build the lookup set once: checking each word against the stop-word *list* would
# scan the whole list for every word of every excerpt.
stop_word_set = frozenset(stopWords)
authors_data_df['text_processed'] = [removeStopWords(stop_word_set, x) for x in authors_data_df['text_processed']]

### Exploratory data analysis with Wordcloud

In [None]:
# Merge all processed excerpts into one comma-separated string.
longText = ','.join(authors_data_df['text_processed'].tolist())

# Configure the word cloud (collocations disabled) and generate it from the merged text.
cloud_settings = {
    "background_color": "white",
    "max_words": 500,
    "contour_width": 8,
    "contour_color": "steelblue",
    "collocations": False,
}
wordcloud = WordCloud(**cloud_settings).generate(longText)

# Show the cloud on a square canvas with the axes hidden.
fig = plt.figure(1, figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()


In [None]:
from collections import Counter

# longText joins the excerpts with ',' and no space, so a plain whitespace split would
# fuse the last word of one excerpt with the first word of the next (e.g. "night,old")
# and miscount both. The processed text itself contains no commas, so turning the
# separators into spaces restores the real word boundaries.
split_it = longText.replace(',', ' ').split()

# Keep the counts under their own name; rebinding `Counter` to an instance would
# shadow the imported class.
word_counts = Counter(split_it)

# The 30 most frequent words with their counts, as a two-column frame for plotting.
most_occur = word_counts.most_common(30)
x_df = pd.DataFrame(most_occur, columns=["words", "count"])

fig = plt.figure(1, figsize=(20, 10))
sns.barplot(x='words', y='count', data=x_df)

In [None]:
# Rows whose author label is "HPL".
is_hpl = authors_data_df['author'] == "HPL"
hplDatadf = authors_data_df.loc[is_hpl]
hplDatadf.head()

In [None]:
# Rows whose author label is "MWS".
is_mws = authors_data_df['author'] == "MWS"
mwsDatadf = authors_data_df.loc[is_mws]
mwsDatadf.head()

In [None]:
# Rows whose author label is "EAP".
is_eap = authors_data_df['author'] == "EAP"
eapDatadf = authors_data_df.loc[is_eap]
eapDatadf.head()

In [None]:
# Merge the processed HPL excerpts into one comma-separated string.
longText = ','.join(hplDatadf['text_processed'].tolist())

# Configure the word cloud (collocations disabled) and generate it from the merged text.
cloud_settings = {
    "background_color": "white",
    "max_words": 500,
    "contour_width": 8,
    "contour_color": "steelblue",
    "collocations": False,
}
wordcloud = WordCloud(**cloud_settings).generate(longText)

# Show the cloud on a square canvas with the axes hidden.
fig = plt.figure(1, figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
from collections import Counter

# longText joins the excerpts with ',' and no space, so a plain whitespace split would
# fuse the last word of one excerpt with the first word of the next and miscount both.
# The processed text itself contains no commas, so turning the separators into spaces
# restores the real word boundaries.
split_it = longText.replace(',', ' ').split()

# Keep the counts under their own name; rebinding `Counter` to an instance would
# shadow the imported class.
word_counts = Counter(split_it)

# The 30 most frequent words with their counts, as a two-column frame for plotting.
most_occur = word_counts.most_common(30)
x_df = pd.DataFrame(most_occur, columns=["words", "count"])

fig = plt.figure(1, figsize=(20, 10))
sns.barplot(x='words', y='count', data=x_df)

HP Lovecraft's word cloud displays words like "night", "death", "dream", "dead", "fear", "horror", "strange", "window" and "ancient", which seem to resonate with the themes that the author was famous for.

In [None]:
# Merge the processed MWS excerpts into one comma-separated string.
longText = ','.join(mwsDatadf['text_processed'].tolist())

# Configure the word cloud (collocations disabled) and generate it from the merged text.
cloud_settings = {
    "background_color": "white",
    "max_words": 500,
    "contour_width": 8,
    "contour_color": "steelblue",
    "collocations": False,
}
wordcloud = WordCloud(**cloud_settings).generate(longText)

# Show the cloud on a square canvas with the axes hidden.
fig = plt.figure(1, figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

Mary Shelley's word cloud displays words such as fear, heart, raymond, mind, soul, power, hope, feeling, death, spirit and friend: positive as well as negative words.

In [None]:
from collections import Counter

# longText joins the excerpts with ',' and no space, so a plain whitespace split would
# fuse the last word of one excerpt with the first word of the next and miscount both.
# The processed text itself contains no commas, so turning the separators into spaces
# restores the real word boundaries.
split_it = longText.replace(',', ' ').split()

# Keep the counts under their own name; rebinding `Counter` to an instance would
# shadow the imported class.
word_counts = Counter(split_it)

# The 30 most frequent words with their counts, as a two-column frame for plotting.
most_occur = word_counts.most_common(30)
x_df = pd.DataFrame(most_occur, columns=["words", "count"])

fig = plt.figure(1, figsize=(20, 10))
sns.barplot(x='words', y='count', data=x_df)

In [None]:
# Merge the processed EAP excerpts into one comma-separated string.
longText = ','.join(eapDatadf['text_processed'].tolist())

# Configure the word cloud (collocations disabled) and generate it from the merged text.
cloud_settings = {
    "background_color": "white",
    "max_words": 500,
    "contour_width": 8,
    "contour_color": "steelblue",
    "collocations": False,
}
wordcloud = WordCloud(**cloud_settings).generate(longText)

# Show the cloud on a square canvas with the axes hidden.
fig = plt.figure(1, figsize=(10, 10))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
from collections import Counter

# longText joins the excerpts with ',' and no space, so a plain whitespace split would
# fuse the last word of one excerpt with the first word of the next and miscount both.
# The processed text itself contains no commas, so turning the separators into spaces
# restores the real word boundaries.
split_it = longText.replace(',', ' ').split()

# Keep the counts under their own name; rebinding `Counter` to an instance would
# shadow the imported class.
word_counts = Counter(split_it)

# The 30 most frequent words with their counts, as a two-column frame for plotting.
most_occur = word_counts.most_common(30)
x_df = pd.DataFrame(most_occur, columns=["words", "count"])

fig = plt.figure(1, figsize=(20, 10))
sns.barplot(x='words', y='count', data=x_df)

Edgar Allan Poe's word cloud displays words such as life, end, friend, night, far, open, eye, great, one, little, time, good, manner, moment, etc.

### Preparing data for Topic Modelling

#### Step 1 Processed text to words or tokens

In [None]:
def text_to_tokens(textSentences):
    """Lazily yield one token list per item of ``textSentences``.

    Each item is converted to ``str`` and passed to ``simple_preprocess`` with
    ``deacc=True``.
    """
    tokenise = lambda sentence: simple_preprocess(str(sentence), deacc=True)
    yield from map(tokenise, textSentences)

# Take the processed excerpts as a plain Python list and tokenise each one.
wordsData = authors_data_df['text_processed'].tolist()
wordsDataList = [token_list for token_list in text_to_tokens(wordsData)]
print(wordsDataList[:1])

#### Step 2 Building N grams

In [None]:
# Fit gensim's Phrases model on the tokenised excerpts (min_count=5, threshold=100),
# then wrap it in a Phraser; `tokensModel` is what make_tokens_model applies to each document.
tokens = Phrases(wordsDataList,min_count=5,threshold=100)
tokensModel = phrases.Phraser(tokens)

In [None]:
def make_tokens_model(textSentences):
    """Return a list holding ``tokensModel[doc]`` for every document in ``textSentences``."""
    phrased_docs = []
    for doc in textSentences:
        phrased_docs.append(tokensModel[doc])
    return phrased_docs

In [None]:
def lemmatizedText(textSentences, allowed_postags=['NOUN','ADJ','VERB','ADV']):
    """Lemmatize tokenised documents, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    textSentences : iterable of list of str
        One token list per document; the tokens are joined with spaces before parsing.
    allowed_postags : iterable of str
        Coarse POS tags (``token.pos_``) to keep. The default list is never mutated.

    Returns
    -------
    list of list of str
        For each input document, the lemmas of the tokens whose POS tag is allowed.

    Notes
    -----
    Relies on the module-level spaCy pipeline ``nlp``, which must be loaded before
    this function is called.
    """
    keep_pos = frozenset(allowed_postags)
    joined_docs = (" ".join(sent) for sent in textSentences)
    # nlp.pipe processes the texts as a batched stream and yields the docs in input
    # order, instead of invoking the pipeline once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_pos]
        for doc in nlp.pipe(joined_docs)
    ]

#### Step 3 Lemmatize: the process of converting words to their root forms

In [None]:
# Apply the phrase model to every tokenised excerpt and preview the first two results.
dataWordsngrams = make_tokens_model(wordsDataList)
dataWordsngrams[:2]

In [None]:
# Load spaCy's "en_core_web_sm" pipeline with the parser and NER components disabled.
nlp = spacy.load("en_core_web_sm",disable=['parser','ner'])
# Lemmatize, keeping only nouns, adjectives, verbs and adverbs.
lemmatizedTextData = lemmatizedText(dataWordsngrams, allowed_postags=["NOUN","ADJ","VERB","ADV"])
# Print the lemmas of the first excerpt as a sanity check.
print(lemmatizedTextData[:1])

#### Step 4 Building the corpora

In [None]:
# Build the gensim Dictionary from the lemmatized excerpts.
dictObject = corpora.Dictionary(lemmatizedTextData)
# Alias for the lemmatized excerpts; they are turned into bag-of-words vectors in the next cell.
textData = lemmatizedTextData
# Show the dictionary entry stored under id 0.
dictObject[0]

In [None]:
# Convert every lemmatized excerpt to its bag-of-words form via the dictionary,
# then print the first one.
corpusData = list(map(dictObject.doc2bow, textData))
print(corpusData[:1])

In [None]:
# Same bag-of-words view for the first excerpt, with ids replaced by their dictionary tokens.
readable_corpus = [
    [(dictObject[token_id], freq) for token_id, freq in bow]
    for bow in corpusData[:1]
]
readable_corpus

Build the model

In [None]:
# Three topics, matching the three authors in the dataset.
n_topics = 3
# Train the LDA model on the bag-of-words corpus, using the dictionary to map ids to tokens.
# NOTE(review): the values for chunksize, passes, iterations and alpha appear hand-picked;
# confirm their exact meaning against the gensim LdaModel documentation before tuning.
ldaModel = ldamodel.LdaModel(corpus=corpusData,
                            id2word = dictObject,
                            num_topics=n_topics,
                            random_state=123,  # fixed seed
                            chunksize=100,
                            passes=10,
                            alpha=0.01,
                            eta='auto',
                            iterations=400,
                            per_word_topics = True
                            )

In [None]:
# Pretty-print the keyword summary that the model reports for each topic.
pprint(ldaModel.print_topics())

In [None]:
# Compute and print the 'c_v' coherence of the trained model on the lemmatized excerpts.
coherenceModelLda = CoherenceModel(model=ldaModel, texts = lemmatizedTextData,dictionary=dictObject,coherence='c_v')
coherenceScore = coherenceModelLda.get_coherence()
print(coherenceScore)

In [None]:
# Enable pyLDAvis output inside the notebook and display the visualisation prepared
# from the model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models`; confirm against the installed version.
vis=pyLDAvis.gensim.prepare(ldaModel,corpusData,dictObject)
vis

The topics returned by the statistical analysis are not easy for a human to interpret. We may need domain expertise to check whether the topics make sense.

When we compare the top 30 words per topic from the model with the top 30 words from each author's work, it may be that Topic1 maps to EAP, Topic2 maps to MWS and Topic3 maps to HPL, based on the subsets of words that overlap the most.