Train your own word2vec representations, as you did in the first example in this checkpoint. However, you need to experiment with the hyperparameters of the vectorization step. Modify the hyperparameters and run the classification models again. Can you wrangle any improvements?

In [9]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import nltk
from nltk.corpus import gutenberg
import gensim

# Fetch the NLTK 'gutenberg' corpus package; the novels are read through
# nltk.corpus.gutenberg in the data-preparation cell.
nltk.download('gutenberg')
# IPython shell escape (not plain Python): installs spaCy's English model so
# that spacy.load('en') in the data-preparation cell can find it.
!python -m spacy download en

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
     |████████████████████████████████| 12.0 MB 5.2 MB/s
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
✔ Linking successful
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [10]:
# apply all the text cleaning and model preparation

# Utility function for standard text cleaning
def text_cleaner(text):
    """Apply the standard regex clean-up to a raw text and return the result.

    The steps run in this order:

    1. every double dash ``--`` becomes a single space;
    2. bracketed spans such as ``[Persuasion by Jane Austen 1818]`` are
       removed (non-greedy; ``.`` does not cross a newline, so a bracket pair
       split over several lines is left alone);
    3. standalone numbers are replaced by a space;
    4. all runs of whitespace (including newlines) collapse to one space and
       leading/trailing whitespace is trimmed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, single-line text.
    """
    # Visual inspection identifies a form of punctuation that spaCy doesn't
    # recognize: the double dash --. Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string on purpose: in a normal string literal "\[" and "\]" are
    # invalid escape sequences, which newer Python versions warn about.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    # split() with no argument splits on any whitespace run, so the join
    # normalises spacing in a single pass.
    return ' '.join(text.split())

# Load and clean the data
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The chapter indicator is idiosyncratic: "Chapter <n>" in Persuasion,
# "CHAPTER ..." (rest of the line) in Alice.
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)

alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

# Parse the cleaned novels. This can take some time.
# Load by package name rather than the 'en' shortcut link: the download
# output above confirms 'en_core_web_sm' is loadable, and shortcut links
# no longer exist in spaCy 3.
nlp = spacy.load('en_core_web_sm')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

# Group into sentences, tagging each one with its author
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one DataFrame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns=["text", "author"])


def lemmatize_sentence(sentence):
    """Return the lemmas of a parsed sentence, skipping punctuation and stop words."""
    return [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop]


# Get rid of stop words and punctuation, and lemmatize the tokens.
# The column is rebuilt in one pass instead of writing list values cell by
# cell with .loc while iterating the same column, which is slow and whose
# handling of list-valued assignments depends on the pandas version.
sentences["text"] = sentences["text"].apply(lemmatize_sentence)

In [13]:
# train word2vec on the sentences
# Settings held constant across all three models; only `window` and `size`
# are varied in this experiment.
w2v_shared = dict(workers=4, min_count=1, sg=0, sample=1e-3, hs=1)
w2v_corpus = sentences["text"]

# baseline: window of 4, 100-dimensional vectors
model_1 = gensim.models.Word2Vec(w2v_corpus, window=4, size=100, **w2v_shared)

# let's adjust the window size for another model
model_2 = gensim.models.Word2Vec(w2v_corpus, window=10, size=100, **w2v_shared)

# now let's also change the size
model_3 = gensim.models.Word2Vec(w2v_corpus, window=10, size=300, **w2v_shared)

In [15]:
# now prepare the model DataFrames
def mean_sentence_vectors(model, tokenized_sentences):
    """Average each sentence's word vectors into one feature row per sentence.

    Parameters
    ----------
    model : gensim.models.Word2Vec
        Trained model; vectors are looked up through ``model.wv`` (indexing
        the model object directly is deprecated).
    tokenized_sentences : sequence of list of str
        One list of lemmas per sentence. If it is a pandas Series, its index
        is reused for the result so a later ``pd.concat(axis=1)`` aligns rows.

    Returns
    -------
    pandas.DataFrame
        One row per sentence and ``model.wv.vector_size`` columns. A sentence
        with no tokens yields a row of NaN.
    """
    keyed_vectors = model.wv
    # Start from NaN so that empty sentences are marked without calling
    # np.mean on an empty list (which emits "Mean of empty slice" warnings).
    rows = np.full((len(tokenized_sentences), keyed_vectors.vector_size), np.nan)
    for i, lemmas in enumerate(tokenized_sentences):
        if len(lemmas) > 0:
            rows[i, :] = np.mean([keyed_vectors[lemma] for lemma in lemmas], axis=0)
    return pd.DataFrame(rows, index=getattr(tokenized_sentences, "index", None))


word2vec_arr_1 = mean_sentence_vectors(model_1, sentences["text"])
word2vec_arr_2 = mean_sentence_vectors(model_2, sentences["text"])
word2vec_arr_3 = mean_sentence_vectors(model_3, sentences["text"])

# Attach the features to author/text and drop the all-NaN rows, i.e. the
# sentences that had no tokens left after stop-word and punctuation removal.
sentences_1 = pd.concat([sentences[["author", "text"]], word2vec_arr_1], axis=1).dropna()
sentences_2 = pd.concat([sentences[["author", "text"]], word2vec_arr_2], axis=1).dropna()
sentences_3 = pd.concat([sentences[["author", "text"]], word2vec_arr_3], axis=1).dropna()

  import sys
  
  if __name__ == '__main__':
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [16]:
# now for the actual model training and evaluation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# One seed for the split and for the stochastic classifiers, so that score
# differences between the three embeddings are not just classifier noise.
SEED = 70

Y_1 = sentences_1["author"]
Y_2 = sentences_2["author"]
Y_3 = sentences_3["author"]

# `columns=` instead of a positional axis argument: passing the axis
# positionally to DataFrame.drop is deprecated and rejected by pandas 2.x.
X_1 = np.array(sentences_1.drop(columns=["text", "author"]))
X_2 = np.array(sentences_2.drop(columns=["text", "author"]))
X_3 = np.array(sentences_3.drop(columns=["text", "author"]))

# splitting into training and test sets
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, Y_1, test_size=0.3, random_state=SEED)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, Y_2, test_size=0.3, random_state=SEED)
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_3, Y_3, test_size=0.3, random_state=SEED)

# load the models
lr = LogisticRegression()
rfc = RandomForestClassifier(random_state=SEED)
gbc = GradientBoostingClassifier(random_state=SEED)


def fit_and_report(title, X_train, X_test, y_train, y_test):
    """Refit the three shared classifiers on one split and print their accuracy.

    Parameters
    ----------
    title : str
        Heading printed before the scores (e.g. "Model 1").
    X_train, X_test : array-like
        Feature matrices for the training and test partitions.
    y_train, y_test : array-like
        Author labels matching ``X_train`` / ``X_test``.

    Notes
    -----
    Uses the module-level ``lr``, ``rfc`` and ``gbc`` estimators, which are
    refitted in place; after the call they hold the fit for this split.
    """
    print(title)
    classifiers = (("LR Scores", lr), ("RFC Scores", rfc), ("GBC Scores", gbc))
    for _, clf in classifiers:
        clf.fit(X_train, y_train)
    for position, (heading, clf) in enumerate(classifiers):
        if position:
            # blank separator between classifier sections
            print("\n")
        print(heading)
        print("Training: ", clf.score(X_train, y_train))
        print("Test: ", clf.score(X_test, y_test))


# fit the models and evaluate them
evaluation_sets = [
    ("Model 1", X_train_1, X_test_1, y_train_1, y_test_1),
    ("Model 2", X_train_2, X_test_2, y_train_2, y_test_2),
    ("Model 3", X_train_3, X_test_3, y_train_3, y_test_3),
]

for position, (title, *split) in enumerate(evaluation_sets):
    if position:
        print("-------------------")
    fit_and_report(title, *split)

Model 1
LR Scores
Training:  0.7860125260960334
Test:  0.7845404747413268


RFC Scores
Training:  0.9929540709812108
Test:  0.8174071819841753


GBC Scores
Training:  0.8885699373695198
Test:  0.8119293974437005
-------------------
Model 2
LR Scores
Training:  0.8014091858037579
Test:  0.7985392574558734


RFC Scores
Training:  0.9929540709812108
Test:  0.8210590383444918


GBC Scores
Training:  0.8919624217118998
Test:  0.8241022519780888
-------------------
Model 3
LR Scores
Training:  0.7682672233820459
Test:  0.7735849056603774


RFC Scores
Training:  0.9929540709812108
Test:  0.8167985392574558


GBC Scores
Training:  0.901356993736952
Test:  0.8253195374315276


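The second model (window=10, size=100) gives slightly better results than the model implemented in the checkpoint notebook, but it still shows signs of overfitting: the random forest, for example, scores about 0.99 on the training set but only about 0.82 on the test set. The change is therefore an improvement, but there is still room for further tuning.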