In [2]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
nltk
nltk.download('gutenberg')
from nltk import pos_tag
from nltk.text import Text
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
!python -m spacy download en
import gensim

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/robholmstrom/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/robholmstrom/miniconda3/lib/python3.7/site-packages/en_core_web_sm -->
/Users/robholmstrom/miniconda3/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [25]:
def text_cleaner(text):
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

In [26]:
# load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# the chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [27]:
# parse the cleaned novels. this can take a bit
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [28]:
# group into sentences
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# combine the sentences from the two novels into one data frame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns = ["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [29]:
# get rid off stop words and punctuation
# and lemmatize the tokens
for i, sentence in enumerate(sentences["text"]):
    sentences.loc[i, "text"] = " ".join(
        [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop])

Now, we're ready to vectorize our words using word2vec. For this purpuse, we use Word2Vec from the models module of gensim. The Word2Vec class has several parameters. We set the following parameters:

workers=4: We set the number of threads to run in parallel to 4 (make sense if your computer has available computing units).
min_count=1: We set the minimum word count threshold to 1.
window=6: We set the number of words around target word to consider to 6.
sg=0: We use CBOW because our corpus is small.
sample=1e-3: We penalize frequent words.
size=100: We set the word vector length to 100.
hs=1: We use hierarchical softmax.

In [54]:
# train word2vec on the the sentences
model = gensim.models.Word2Vec(
    sentences["text"],
    workers=4,
    min_count=2,
    window=4,
    sg=0,
    sample=1e-3,
    size=100,
    hs=1
)

In [55]:
word2vec_arr = np.zeros((sentences.shape[0],100))

for i, sentence in enumerate(sentences["text"]):
    try:
        word2vec_arr[i,:] = np.mean([model[lemma] for lemma in sentence], axis=0)
    except KeyError:
        word2vec_arr[i, :] = np.full((1,100), np.nan)
word2vec_arr = pd.DataFrame(word2vec_arr)
sentences2 = pd.concat([sentences[["author", "text"]],word2vec_arr], axis=1)
sentences2.dropna(inplace=True)

sentences2.head()

Unnamed: 0,author,text,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,Carroll,Alice begin tired sit sister bank have twice p...,0.223172,0.020525,-0.055817,0.017764,-0.032733,0.013045,0.031529,-0.004377,...,-0.009629,0.114793,0.1457,0.064799,-0.024737,0.013459,-0.04282,-0.070483,0.183989,-0.060073
1,Carroll,consider mind hot day feel sleepy stupid pleas...,0.194484,0.032125,-0.050899,-0.007056,-0.026646,-0.002122,0.03031,0.001089,...,-0.02269,0.092844,0.132692,0.044662,-0.045043,0.036374,-0.037489,-0.055718,0.147318,-0.062522
2,Carroll,remarkable Alice think way hear Rabbit,0.175725,0.032291,-0.015428,0.027251,0.001575,0.058039,-0.001594,0.01453,...,-0.03344,0.08163,0.138925,0.018392,-0.054507,0.068657,0.027283,-0.045345,0.131372,-0.028706
3,Carroll,oh dear,0.15333,0.07006,-0.007479,-0.033633,0.10651,0.046761,-0.023423,-0.059364,...,0.007347,0.109599,0.088784,0.049615,-0.040165,0.026241,0.030117,-0.077189,0.091077,-0.099297
4,Carroll,oh dear,0.15333,0.07006,-0.007479,-0.033633,0.10651,0.046761,-0.023423,-0.059364,...,0.007347,0.109599,0.088784,0.049615,-0.040165,0.026241,0.030117,-0.077189,0.091077,-0.099297


Train your own word2vec representations as we did in our first example in the checkpoint. But, you need to experiment with the hyperparameters of the vectorization step. Modify the hyperparameters and run the classification models again. Can you wrangle any improvements?

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

Y = sentences2['author']
X = np.array(sentences2.drop(['text','author'], 1))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
lr.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))


----------------------Logistic Regression Scores----------------------
Training set score: 0.6875

Test set score: 0.6798438262567106
