In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns

# Regex text cleaning, plus sklearn for feature extraction & modeling
import re

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

%matplotlib inline
# Apply the ggplot theme first, then override the default figure size on top of it.
plt.style.use('ggplot')
plt.rcParams.update({'figure.figsize': (14.0, 8.7)})

In [None]:
# Location of the zipped training CSV; read_csv is handed the archive directly.
TRAIN_ARCHIVE = '/kaggle/input/spooky-author-identification/train.zip'
train = pd.read_csv(TRAIN_ARCHIVE)
train.head()

In [None]:
# Number of rows per author, as a two-column frame.
# reset_index() on a value_counts() result is labelled ("index", "author") by
# pandas < 2.0 but ("author", "count") by pandas >= 2.0. Naming both columns
# explicitly keeps the ("index", "author") layout the plotting cell expects on
# every pandas version.
count = (
    train["author"]
    .value_counts()
    .rename_axis("index")          # author labels -> column "index"
    .reset_index(name="author")    # frequencies   -> column "author"
)
count

In [None]:
# Bar chart of the number of rows per author.
# The two column names are read positionally from `count` rather than
# hard-coded: pandas < 2.0 names them ("index", "author") while pandas >= 2.0
# names them ("author", "count"), so a literal x="index" fails on newer pandas.
label_col, size_col = count.columns[:2]
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(x=label_col, y=size_col, data=count, palette='Pastel1_r', ax=ax)
# Explicit labels so the figure reads the same whatever the columns are called.
ax.set(xlabel='author', ylabel='number of rows', title='Rows per author');

In [None]:
## check NaN values: print "<column>: <number of missing entries>" for every column
for column, n_missing in train.isna().sum().items():
    print(f"{column}: {n_missing}")

In [None]:
# drop ID column
# errors='ignore' makes the cell idempotent: running it a second time (when
# 'id' is already gone) no longer raises KeyError. `columns=` already selects
# the axis, so the redundant axis=1 is dropped.
train = train.drop(columns=['id'], errors='ignore')
train.head()

In [None]:
# Word cloud over the raw (uncleaned) text.
# str(Series) is only the truncated repr of the column -- index numbers, "..."
# and the "Name: text, dtype: object" footer -- so the cloud must be built from
# the joined sentences instead.
raw_corpus = " ".join(train["text"].astype(str))
cloud = WordCloud(colormap="summer", width=700, height=500).generate(raw_corpus)
fig = plt.figure(figsize=(13, 18))
plt.axis("off")
plt.imshow(cloud, interpolation='bilinear')

In [None]:
# Strip , . ! ? and lowercase the text into a new 'text_cleaning' column.
# The pattern is a raw string: '\.' inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
# Inside a character class the dot needs no escaping at all.
train['text_cleaning'] = (
    train['text']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)
train['text_cleaning'].head()

In [None]:
from nltk.corpus import stopwords

# remove stopwords
# English stop-word list from NLTK; removeStopWords() below filters tokens against it.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available in
# the environment -- confirm before running outside Kaggle.
stopWords = stopwords.words('english')
def removeStopWords(stopWords, rvw_txt):
    """Return ``rvw_txt`` without the tokens listed in ``stopWords``.

    The text is split on whitespace, every token that is a member of
    ``stopWords`` is dropped (exact, case-sensitive match), and the remaining
    tokens are re-joined with single spaces.

    Parameters
    ----------
    stopWords : iterable of str
        Words to remove. Lists/tuples are converted to a frozenset once per
        call so each membership test is a hash lookup instead of a linear scan.
    rvw_txt : str
        Text to filter.

    Returns
    -------
    str
        The filtered text; an empty string if nothing remains.
    """
    # Reuse an existing set as-is; otherwise build a hash-based lookup once.
    if not isinstance(stopWords, (set, frozenset)):
        stopWords = frozenset(stopWords)
    return ' '.join(word for word in rvw_txt.split() if word not in stopWords)
# Filter every cleaned sentence through removeStopWords, overwriting the column in place.
train['text_cleaning'] = train['text_cleaning'].map(
    lambda sentence: removeStopWords(stopWords, sentence)
)

In [None]:
# join the different text together
full_Text = ','.join(train['text_cleaning'])
# generate the word cloud
# contour_width / contour_color were removed: WordCloud only draws a contour
# when a `mask` is supplied, and contour_color="" is not a valid colour, so the
# pair was dead configuration that would fail as soon as a mask was added.
wordcloud = WordCloud(background_color="white",
                      max_words=600,
                      collocations=False).generate(full_Text)
# visualize the word cloud
fig = plt.figure(1, figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

In [None]:
# Split Data to train & test Topic Models
# Stratified on author; test_size is an integer, i.e. an absolute number of
# held-out rows rather than a fraction.
# NOTE(review): the cells visible in this section vectorise and evaluate on the
# full `train` frame, not on train_docs/test_docs -- confirm that is intended.
train_docs, test_docs = train_test_split(
    train,
    test_size=50,
    random_state=42,
    stratify=train["author"],
)

In [None]:
# Preview the frame again; by this point it also carries the 'text_cleaning' column.
train.head()

In [None]:
# TF-IDF document-term matrix over the cleaned text.
# max_df / min_df are fractions of documents: terms in more than 50% or fewer
# than 0.1% of the documents are discarded.
vectorizer = TfidfVectorizer(max_df=.5,
                             min_df=.001,
                             stop_words='english')

train_dtm = vectorizer.fit_transform(train["text_cleaning"])
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
# list(...) keeps `words` a plain list, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
train_dtm

In [None]:
# Vocabulary size: number of terms kept by the vectorizer.
len(words)

In [None]:
# Number of LDA topics and their display labels ("Topic 1" .. "Topic <n_components>").
n_components = 3
topic_labels = [f'Topic {number}' for number in range(1, n_components + 1)]

In [None]:
# Fit the baseline LDA topic model on the document-term matrix.
# random_state is fixed (same seed as the train/test split) so the learned
# topics -- and every table and plot derived from them -- are reproducible
# across kernel restarts; without it each run yields different topics.
lda_base = LatentDirichletAllocation(n_components=n_components,
                                     n_jobs=-1,
                                     learning_method='batch',
                                     max_iter=20,
                                     random_state=42)
lda_base.fit(train_dtm)

In [None]:
# pseudo counts: one row per topic, one column per vocabulary term
topics_count = lda_base.components_
n_topic_rows, n_term_cols = topics_count.shape
print((n_topic_rows, n_term_cols))
topics_count[:3]

In [None]:
# Normalise each topic's pseudo counts so that every row sums to 1, then lay
# the result out as a (term x topic) table.
row_totals = topics_count.sum(axis=1, keepdims=True)
topics_prob = topics_count / row_totals
topics = pd.DataFrame(data=topics_prob.T,
                      columns=topic_labels,
                      index=words)
topics.head()

In [None]:
# For every topic column, keep the 20 terms with the highest weight.
top_words = {
    topic: weights.nlargest(20).index.tolist()
    for topic, weights in topics.items()
}
pd.DataFrame(top_words)

In [None]:
# Per-document topic weights from the fitted model, then the shape of the result.
train_preds = lda_base.transform(X=train_dtm)
np.shape(train_preds)

In [None]:
# Topic weights per document, indexed by the document's author; show 3 random rows.
train_eval = pd.DataFrame(
    data=train_preds,
    index=train["author"],
    columns=topic_labels,
)
train_eval.sample(3)

In [None]:
# Dominant topic of every document (the column holding the row's largest
# weight), still indexed by author.
# A plain row-wise idxmax replaces groupby(...).idxmax(axis=1): the `axis`
# keyword on groupby reductions is deprecated since pandas 2.1, and the
# per-row result is the same.
df = train_eval.idxmax(axis=1)

# Share of each author's documents assigned to each topic (rows sum to 100%).
topic_share = (
    df.groupby(level='author')
    .value_counts(normalize=True)
    .unstack(-1)
)
sns.heatmap(topic_share, annot=True, fmt='.1%', cmap='Blues', square=True)
plt.title('Train Data: Topic Assignments')