# Introduction
This is my first time working with textual data. Thanks to Edgar Allan Poe for the motivation!

In [None]:
# load necessary modules
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
from collections import defaultdict
import itertools
from gensim.models.tfidfmodel import TfidfModel
from nltk import pos_tag
from nltk import ne_chunk_sents
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# load the three competition files: labelled training rows, unlabelled
# test rows, and the submission template
df_train, df_tests, df_sampl = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/train.csv",
        "../input/test.csv",
        "../input/sample_submission.csv",
    )
)

In [None]:
# training set: report (rows, columns), then show the leading rows
n_rows, n_cols = df_train.shape
print((n_rows, n_cols))
df_train.head(5)

The training set contains 19,579 rows and 3 columns: id, text, and author. We're obviously going to have to do some feature engineering. The target variable, author, has three possible values: EAP for Edgar Allan Poe, HPL for HP Lovecraft, and MWS for Mary Wollstonecraft Shelley.

In [None]:
# bar chart of how many training texts belong to each author (the target)
df_train["author"].value_counts().plot(kind = "bar")

The distribution of authors is pretty balanced so I don't think we will need to use any type of subsampling.

In [None]:
# test set: report (rows, columns), then show the leading rows
n_rows, n_cols = df_tests.shape
print((n_rows, n_cols))
df_tests.head(5)

The test set contains 8,392 rows and 2 columns: id and text.

In [None]:
# sample submission: report (rows, columns), then show the leading rows
n_rows, n_cols = df_sampl.shape
print((n_rows, n_cols))
df_sampl.head(5)

The sample submission contains 8,392 rows (consistent with the test set) and 4 columns: id and the probability that the text was written by EAP, HPL, and MWS.

# Visualizations
I want to make one of those plots where the size of each word is related to the number of times it appears in the text (a word cloud). Here's how I see this working: I concatenate the text for EAP, HPL, and MWS, and then make a plot for each author.

In [None]:
# join every EAP training text into one long string, separated by single spaces
eap_texts = df_train.loc[df_train["author"] == "EAP", "text"].str.cat(sep = " ")

In [None]:
# generate a word cloud from the concatenated EAP text, using the default
# WordCloud settings
eap_wordcloud = WordCloud().generate(eap_texts)

# display the generated image with matplotlib; the axes are switched off
# because they carry no information for a word cloud
plt.imshow(eap_wordcloud, interpolation = "bilinear")
plt.axis("off")

Very exciting! EAP really likes to use the word "upon".

In [None]:
# count the number of times the word "upon" appears in the text written by EAP.
# Whole words are matched case-insensitively, so "Upon" is counted while
# "thereupon" and "whereupon" are not (str.count would match raw substrings).
len(re.findall(r"\bupon\b", eap_texts, flags = re.IGNORECASE))

In [None]:
# join the training texts of HPL and of MWS into one long string per author
hpl_texts = df_train.loc[df_train["author"] == "HPL", "text"].str.cat(sep = " ")
mws_texts = df_train.loc[df_train["author"] == "MWS", "text"].str.cat(sep = " ")

# build one word cloud per author
hpl_wordcloud = WordCloud().generate(hpl_texts)
mws_wordcloud = WordCloud().generate(mws_texts)

# show the HPL cloud here; the MWS cloud is displayed in the next cell
plt.imshow(hpl_wordcloud, interpolation = "bilinear")
plt.axis("off")

In [None]:
# display the MWS word cloud (built from `mws_texts` in the previous cell);
# the axes are switched off because they carry no information for a word cloud
plt.imshow(mws_wordcloud, interpolation = "bilinear")
plt.axis("off")

This word cloud for MWS has an interesting feature, the name "Raymond". This is likely the name of a character in one of her works (https://en.wikipedia.org/wiki/The_Last_Man). Therefore, the presence of the word "Raymond" most likely corresponds to text written by MWS. Let's see if this is so.

In [None]:
# count the number of occurrences of "Raymond" in each author's texts,
# printed in the order EAP, HPL, MWS
for author_texts in (eap_texts, hpl_texts, mws_texts):
    print(author_texts.count("Raymond"))

Indeed, more than 99% of the occurrences of the word "Raymond" are in text written by MWS.

In [None]:
# add the number of characters in each text as a feature;
# the vectorized string accessor replaces a Python-level lambda per row
df_train["n_char"] = df_train["text"].str.len()

In [None]:
# histogram of text lengths (in characters) over the whole training set
df_train["n_char"].plot(kind = "hist", bins = 500)

In [None]:
# summary statistics (count, mean, quartiles, extremes) of the text lengths
df_train.loc[:, "n_char"].describe()

The median number of characters is 128. Since typical word length is 5 characters, this corresponds to about 25 words per text. Interestingly, the maximum number of characters is 4,663.

In [None]:
# show the longest text(s) in the training set; the maximum length is read
# from the data instead of being hard-coded, so the cell stays correct if
# the data changes
df_train.loc[df_train["n_char"] == df_train["n_char"].max(), "text"].values

This excerpt is from Mathilda by MWS (https://en.wikipedia.org/wiki/Mathilda_(novella)). Upon inspection of the source, it appears that there is no punctuation in this passage. This is why the number of characters in this text is so large.

In [None]:
# kernel density estimate of the text length (n_char), one curve per author,
# drawn on a single facet with a legend identifying the authors
# NOTE(review): newer seaborn releases name the `size` argument `height` --
# confirm against the installed seaborn version before changing it
sns.FacetGrid(df_train, hue = "author", size = 6) \
   .map(sns.kdeplot, "n_char") \
   .add_legend()
# restrict the x-axis to 0-500 characters so the long tail of very long texts
# does not compress the bulk of the distribution
plt.xlim(0, 500)

This plot shows that EAP tends to write shorter sentences than HPL and MWS. However, the length distributions of the three authors overlap too much for the number of characters to be an important feature in author classification.

# Natural language processing
It has been established that natural language processing is an integral part of building textual classifiers. Since I don't have any experience with these techniques, my analyses will follow the DataCamp course "Natural Language Processing Fundamentals in Python" (https://www.datacamp.com/courses/natural-language-processing-fundamentals-in-python). Please comment if you have any suggestions for processing techniques that I have not covered.

## Regular expressions & word tokenization

In [None]:
# split the concatenated EAP text into a list of sentences
sentences = sent_tokenize(eap_texts)
# display the 4th sentence (index 3) as the cell output
sentences[3]

In [None]:
# tokenize the 4th sentence (index 3) of the EAP text into word tokens
tokenized_sent = word_tokenize(sentences[3])
# display the resulting token list as the cell output
tokenized_sent

In [None]:
# make a set of unique tokens in EAP texts
unique_tokens = set(word_tokenize(eap_texts))

# report the size of the vocabulary and show only a small sorted sample;
# displaying the whole set would flood the notebook output
print(len(unique_tokens), "unique tokens")
sorted(unique_tokens)[:20]

In [None]:
# locate the first occurrence of "Raymond" in the MWS texts and report the
# match object followed by its start and end offsets
match = re.search(r"Raymond", mws_texts)
print(match)
print(*match.span())

In [None]:
# non-greedy pattern capturing whatever sits between two double quotes
# https://stackoverflow.com/questions/171480/regex-grabbing-values-between-quotation-marks
pattern1 = r'"(.*?)"'

# report the first quoted passage found in the EAP texts
print(re.compile(pattern1).search(eap_texts))

In [None]:
# plot a histogram of the token lengths in texts written by each author
# (word_tokenize also emits punctuation tokens, so these are token lengths)
eap_words = word_tokenize(eap_texts)
hpl_words = word_tokenize(hpl_texts)
mws_words = word_tokenize(mws_texts)
eap_word_lenghts = [len(w) for w in eap_words]
hpl_word_lenghts = [len(w) for w in hpl_words]
mws_word_lenghts = [len(w) for w in mws_words]

# one panel per author on a shared y-axis; each panel is titled and labelled
# so the figure can be read without looking at the code
f, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey = True)
panels = zip(
    (ax1, ax2, ax3),
    ("EAP", "HPL", "MWS"),
    (eap_word_lenghts, hpl_word_lenghts, mws_word_lenghts),
)
for ax, author, lengths in panels:
    ax.hist(lengths, bins = 20)
    ax.set_title(author)
    ax.set_xlabel("token length (characters)")
ax1.set_ylabel("number of tokens")
plt.show()

## Simple topic identification

In [None]:
# simple bag-of-words for the EAP texts: tokenize, lowercase, count

# split the concatenated EAP text into tokens
tokens = word_tokenize(eap_texts)

# lowercase every token so that "The" and "the" are counted together
lower_tokens = list(map(str.lower, tokens))

# tally the lowercase tokens
bow_simple = Counter()
bow_simple.update(lower_tokens)

# show the 10 most frequent tokens with their counts
print(bow_simple.most_common(10))

In [None]:
# text preprocessing for EAP texts
# retain alphabetic words
alpha_only = [t for t in lower_tokens if t.isalpha()]

# remove all stop words (and, the, etc.)
# the stop word list is loaded once into a set; evaluating
# stopwords.words("english") inside the comprehension would repeat the same
# lookup for every single token and test membership against a list
english_stops = set(stopwords.words("english"))
no_stops = [t for t in alpha_only if t not in english_stops]

# instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# lemmatize (reduce words to their word stem) all tokens into a new list
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# create the bag-of-words
bow = Counter(lemmatized)

# print the 10 most common tokens
print(bow.most_common(10))

In [None]:
# generate a word cloud directly from the token counts in `bow`
# (lowercased, alphabetic, stop words removed, lemmatized EAP tokens)
bow_wordcloud = WordCloud().generate_from_frequencies(bow)

# display the generated image with matplotlib; the axes are switched off
# because they carry no information for a word cloud
plt.imshow(bow_wordcloud, interpolation = "bilinear")
plt.axis("off")

In [None]:
# creating a corpus with gensim for EAP texts (one document per training row)
my_documents = df_train[df_train["author"] == "EAP"]["text"].values

# tokenize and convert the tokens into lowercase
tokenized_docs = [word_tokenize(doc.lower()) for doc in my_documents]

# retain alphabetic words
# https://stackoverflow.com/questions/18072759/python-nested-list-comprehension
alpha_only_docs = [[t for t in doc if t.isalpha()] for doc in tokenized_docs]

# remove all stop words (and, the, etc.)
# the stop word list is loaded once into a set: evaluating
# stopwords.words("english") for every token repeats the same lookup and tests
# membership against a list, which is what made this step slow
english_stops = set(stopwords.words("english"))
no_stops_docs = [[t for t in doc if t not in english_stops] for doc in alpha_only_docs]

# lemmatize (reduce words to their word stem) all tokens into a new list
# (`wordnet_lemmatizer` is instantiated in the text preprocessing cell above)
lemmatized_docs = [[wordnet_lemmatizer.lemmatize(t) for t in doc] for doc in no_stops_docs]

# create a dictionary mapping each distinct token to an integer id
dictionary = Dictionary(lemmatized_docs)

# convert every document into its bag-of-words form, one entry per document
corpus = [dictionary.doc2bow(doc) for doc in lemmatized_docs]

In [None]:
# bag-of-words of the first document in the corpus
doc = corpus[0]

# order its (word id, count) pairs from most to least frequent
bow_doc = sorted(doc, key = lambda pair : pair[1], reverse = True)

# report the 5 most frequent words of that document with their counts
print("First Document:")
for word_id, word_count in bow_doc[:5] :
    print(dictionary.get(word_id), word_count)

# accumulate the count of every word id over all documents
total_word_count = defaultdict(int)
for doc_bow in corpus :
    for word_id, word_count in doc_bow :
        total_word_count[word_id] += word_count

# order the corpus-wide totals from most to least frequent
sorted_word_count = sorted(total_word_count.items(), key = lambda pair : pair[1], reverse = True)

# report the 5 most frequent words across all documents with their counts
print(" ")
print("All Documents:")
for word_id, word_count in sorted_word_count[:5] :
    print(dictionary.get(word_id), word_count)

In [None]:
# tf-idf

# creating a corpus with gensim with one document per author, in the order
# EAP, HPL, MWS (each document is that author's concatenated training text)
all_documents = [eap_texts, hpl_texts, mws_texts]

# tokenize and convert the tokens into lowercase
tokenized_docs = [word_tokenize(doc.lower()) for doc in all_documents]

# retain alphabetic words
alpha_only_docs = [[t for t in doc if t.isalpha()] for doc in tokenized_docs]

# remove all stop words
# the stop word list is loaded once into a set: evaluating
# stopwords.words("english") for every token repeats the same lookup and tests
# membership against a list, which is what made this step slow
english_stops = set(stopwords.words("english"))
no_stops_docs = [[t for t in doc if t not in english_stops] for doc in alpha_only_docs]

# lemmatize (reduce words to their word stem) all tokens into a new list
# (`wordnet_lemmatizer` is instantiated in the text preprocessing cell above)
lemmatized_docs = [[wordnet_lemmatizer.lemmatize(t) for t in doc] for doc in no_stops_docs]

# create a dictionary mapping each distinct token to an integer id
dictionary = Dictionary(lemmatized_docs)

# convert every author's document into its bag-of-words form
corpus = [dictionary.doc2bow(doc) for doc in lemmatized_docs]

In [None]:
# fit a tf-idf model on the per-author corpus
tfidf = TfidfModel(corpus)

# tf-idf weights of the first document in the corpus (the EAP texts)
tfidf_weights = tfidf[corpus[0]]

# order the (term id, weight) pairs from the highest weight to the lowest
sorted_tfidf_weights = list(tfidf_weights)
sorted_tfidf_weights.sort(key = lambda pair : pair[1], reverse = True)

# report the 5 highest-weighted words
for term_id, weight in sorted_tfidf_weights[:5] :
    print(dictionary.get(term_id), weight)

Ok, so this is very cool. These are the top 5 words sorted by their tf-idf weights. The higher the weight, the more distinctive that particular word is of the author (here, EAP) compared with the other authors. So where do these words come from?
* Dupin: https://en.wikipedia.org/wiki/C._Auguste_Dupin
* Marie: https://en.wikipedia.org/wiki/The_Mystery_of_Marie_Rog%C3%AAt
* Jupiter: https://en.wikipedia.org/wiki/The_Gold-Bug
* Ellison: http://xroads.virginia.edu/~hyper/poe/l_garden.html

## Named-entity recognition
### NLTK

In [None]:
# split the EAP text into sentences
sentences = sent_tokenize(eap_texts)

# split every sentence into word tokens
token_sentences = list(map(word_tokenize, sentences))

# attach a part-of-speech tag to every token of every sentence
pos_sentences = list(map(pos_tag, token_sentences))

# chunk the tagged sentences; binary = True labels named entities only as "NE"
chunked_sentences = ne_chunk_sents(pos_sentences, binary = True)

# print every chunk that carries the "NE" label
for sent in chunked_sentences:
    for chunk in sent:
        if not hasattr(chunk, "label"):
            continue
        if chunk.label() == "NE":
            print(chunk)

### spaCy

In [None]:
# load the English spaCy model with the tagger, parser, and matcher switched
# off, since only the named entities are read below
# NOTE(review): the 'en' shortcut and these keyword arguments look specific to
# an early spaCy release -- confirm against the installed version
nlp = spacy.load('en', tagger = False, parser = False, matcher = False)

# run the pipeline over the whole concatenated EAP text as a single document
# NOTE(review): this is one very long string; check whether the installed
# spaCy version enforces a maximum document length
doc = nlp(eap_texts)

# print the label and the text of every entity found in the document
for ent in doc.ents:
    print(ent.label_, ent.text)

## Supervised learning
Awesome! So far we've learned a lot about how to process textual data, identify topics, and recognize named entities. Now, we will learn how to train a supervised learning model that can predict the author from the text.

In [None]:
# target labels: the author of each training text
y = df_train["author"]

# hold out 33% of the labelled rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    df_train["text"],
    y,
    test_size = 0.33,
    random_state = 53,
)

# bag-of-words vectorizer that drops English stop words
count_vectorizer = CountVectorizer(stop_words="english")

# learn the vocabulary from the training texts and vectorize them
count_train = count_vectorizer.fit_transform(X_train.values)

# vectorize the held-out texts with the vocabulary learned above
count_test = count_vectorizer.transform(X_test.values)

# show the first 10 entries of the learned vocabulary
print(count_vectorizer.get_feature_names()[:10])

In [None]:
# tf-idf vectorizer that drops English stop words and any term occurring in
# more than 70% of the training texts
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
)

# learn vocabulary and idf weights from the training texts and vectorize them
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# vectorize the held-out texts with what was learned above
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# show the first 10 entries of the learned vocabulary
print(tfidf_vectorizer.get_feature_names()[:10])

# show the tf-idf vectors of the first 5 training texts
print(tfidf_train[:5])

In [None]:
# dense DataFrame of the training counts, one column per vocabulary term
# NOTE(review): this materializes the full rows-by-vocabulary matrix in
# memory -- confirm that is acceptable for the data size
count_df = pd.DataFrame(
    count_train.toarray(),
    columns=count_vectorizer.get_feature_names(),
)

# dense DataFrame of the training tf-idf weights, built the same way
tfidf_df = pd.DataFrame(
    tfidf_train.toarray(),
    columns=tfidf_vectorizer.get_feature_names(),
)

# leading rows of the count features
print(count_df.head())

# leading rows of the tf-idf features
print(tfidf_df.head())

# terms present in the count vocabulary but missing from the tf-idf vocabulary
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# are the two DataFrames identical?
print(count_df.equals(tfidf_df))

### Training and testing with CountVectorizer

In [None]:
# Multinomial Naive Bayes with default settings, fitted on the count features
nb_classifier = MultinomialNB().fit(count_train, y_train)

# predicted author for every held-out text
pred = nb_classifier.predict(count_test)

# fraction of held-out texts attributed to the right author
score = metrics.accuracy_score(y_test, pred)
print(score)

# confusion matrix with rows and columns ordered EAP, MWS, HPL
cm = metrics.confusion_matrix(
    y_test,
    pred,
    labels=['EAP', 'MWS', 'HPL'],
)
print(cm)

### Training and testing with TfidfVectorizer

In [None]:
# Multinomial Naive Bayes with default settings, fitted on the tf-idf features
nb_classifier = MultinomialNB().fit(tfidf_train, y_train)

# predicted author for every held-out text
pred = nb_classifier.predict(tfidf_test)

# fraction of held-out texts attributed to the right author
score = metrics.accuracy_score(y_test, pred)
print(score)

# confusion matrix with rows and columns ordered EAP, MWS, HPL
cm = metrics.confusion_matrix(
    y_test,
    pred,
    labels=['EAP', 'MWS', 'HPL'],
)
print(cm)

### Improving the model

In [None]:
# candidate smoothing parameters: 0.0, 0.1, ..., 0.9
alphas = np.arange(0, 1, 0.1)

def train_and_predict(alpha):
    """Fit Multinomial Naive Bayes with the given smoothing and score it.

    The model is fitted on the notebook-level `tfidf_train` / `y_train` and
    evaluated on `tfidf_test` / `y_test`.

    Parameters
    ----------
    alpha : float
        Value passed to `MultinomialNB(alpha=...)`.

    Returns
    -------
    float
        Accuracy of the fitted model on the held-out split.
    """
    model = MultinomialNB(alpha=alpha).fit(tfidf_train, y_train)
    return metrics.accuracy_score(y_test, model.predict(tfidf_test))

# report the held-out accuracy obtained with every candidate alpha
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

#### Multi-class logarithmic loss

In [None]:
# Multinomial Naive Bayes with alpha = 0.1, fitted on the tf-idf features
nb_classifier = MultinomialNB(alpha = 0.1).fit(tfidf_train, y_train)

# hard predictions and per-class probabilities for the held-out texts
pred = nb_classifier.predict(tfidf_test)
pred_prob = nb_classifier.predict_proba(tfidf_test)

# accuracy of the hard predictions and log loss of the probabilities
acc = metrics.accuracy_score(y_test, pred)
logloss = metrics.log_loss(y_test, pred_prob)
print(acc, logloss, sep = "\n")

# confusion matrix with rows and columns ordered EAP, MWS, HPL
cm = metrics.confusion_matrix(
    y_test,
    pred,
    labels=['EAP', 'MWS', 'HPL'],
)
print(cm)

# Submission
Wow! We have come a long way. Now we are ready to make predictions on the test data and generate a submission file.

In [None]:
# vectorize the competition test texts with the fitted tf-idf vectorizer.
# The result gets its own name: overwriting `tfidf_test` would replace the
# hold-out matrix, and re-running the evaluation cells above would then fail
# because it no longer lines up with `y_test`.
tfidf_submission = tfidf_vectorizer.transform(df_tests["text"].values)

# per-class probabilities; the columns follow nb_classifier.classes_
pred_prob = nb_classifier.predict_proba(tfidf_submission)

# the probabilities are written row by row into the submission template, so
# the template ids must be in the same order as the test set
assert (df_sampl["id"].values == df_tests["id"].values).all(), "submission ids do not match the test set"

# fill each author's column from the classifier's own class order instead of
# relying on hard-coded column positions
for class_index, author in enumerate(nb_classifier.classes_):
    df_sampl[author] = pred_prob[:, class_index]

df_sampl.to_csv("submission.csv", index=False)