# NLP Feature Engineering III

## 1. Train your own word2vec representations as we did in our first example in the checkpoint. But, you need to experiment with the hyperparameters of the vectorization step. Modify the hyperparameters and run the classification models again. Can you wrangle any improvements?

In [2]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
import nltk
from nltk.corpus import gutenberg
import gensim
import warnings
warnings.filterwarnings("ignore")

In [3]:
nltk.download('gutenberg')
!python -m spacy download en

In [3]:
# utility function for standard text cleaning
def text_cleaner(text):
    """Return *text* with dashes, bracketed spans and numbers stripped.

    Steps, in order:
      1. replace every double dash ``--`` with a space,
      2. delete ``[...]`` spans (non-greedy; ``.`` does not cross newlines),
      3. replace standalone numeric tokens (integers or decimals, optionally
         preceded by whitespace and a minus sign) with a space,
      4. collapse every run of whitespace into a single space and trim.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned, single-spaced string.
    """
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Raw string: '\[' and '\]' are invalid escapes in a normal string literal.
    text = re.sub(r"[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    # split() with no argument splits on any whitespace and drops empty pieces.
    return ' '.join(text.split())

In [4]:
# Read the raw text of both novels from the NLTK Gutenberg corpus.
alice = gutenberg.raw('carroll-alice.txt')
persuasion = gutenberg.raw('austen-persuasion.txt')

# Each book marks its chapters differently, so each heading pattern is removed
# with its own regex before the shared cleaning step runs.
alice = text_cleaner(re.sub(r'CHAPTER .*', '', alice))
persuasion = text_cleaner(re.sub(r'Chapter \d+', '', persuasion))

In [7]:
# Load the spaCy pipeline once and run it over each cleaned novel
# (Alice first, then Persuasion). Parsing whole books can take a while.
nlp = spacy.load('en_core_web_sm')
alice_doc, persuasion_doc = (nlp(novel) for novel in (alice, persuasion))

In [8]:
def _label_sentences(doc, author):
    """Pair every sentence span of *doc* with the given *author* label."""
    return [[span, author] for span in doc.sents]


# One [sentence, author] row per sentence of each parsed novel.
alice_sents = _label_sentences(alice_doc, "Carroll")
persuasion_sents = _label_sentences(persuasion_doc, "Austen")

# Stack both novels (Alice rows first) into a single two-column data frame
# and preview the first rows.
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns=["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(her, sister, was, reading, ,, but, it, had, n...",Carroll
2,"(thought, Alice, ', without, pictures, or, con...",Carroll
3,"(So, she, was, considering, in, her, own, mind...",Carroll
4,"(There, was, nothing, so, VERY, remarkable, in...",Carroll


In [9]:
# get rid of stop words and punctuation
# and lemmatize the tokens
def _content_lemmas(sentence):
    """Return the lemmas of *sentence*'s tokens, skipping punctuation and stop words."""
    return [token.lemma_ for token in sentence if not (token.is_punct or token.is_stop)]


# Transform the whole column in one pass instead of writing a list into one
# cell at a time through .loc: per-cell list assignment is slow, can be
# rejected by pandas as a length mismatch, and relies on the enumerate()
# position matching the index label.
sentences["text"] = sentences["text"].apply(_content_lemmas)

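Below, we train several word2vec models. In particular, models 1 through 3 try window sizes of 4, 6 and 8 with a vector size of 100, models 4 through 6 repeat those window sizes with a vector size of 200 instead of 100, and model 7 uses a window size of 8 with a vector size of 250: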

In [11]:
def _train_word2vec(window, size):
    """Train a Word2Vec model on the lemmatised sentences.

    Only the context *window* and the embedding *size* vary between the
    models in this experiment; every other hyperparameter is held fixed.
    """
    return gensim.models.Word2Vec(
        sentences["text"],
        workers=4,
        min_count=1,
        window=window,
        sg=0,
        sample=1e-3,
        size=size,
        hs=1,
    )


# Models 1-3: 100-dimensional vectors with windows of 4, 6 and 8.
model1, model2, model3 = (_train_word2vec(window, 100) for window in (4, 6, 8))

# Models 4-6: the same windows with 200-dimensional vectors.
model4, model5, model6 = (_train_word2vec(window, 200) for window in (4, 6, 8))

# Model 7: the widest window with 250-dimensional vectors.
model7 = _train_word2vec(8, 250)

In [13]:
def _mean_sentence_vectors(model, tokenized_sentences):
    """Average the word vectors of every sentence into one row per sentence.

    Args:
        model: a trained gensim Word2Vec model.
        tokenized_sentences: sequence of lemma lists, one list per sentence.

    Returns:
        A DataFrame with one row per sentence and one column per embedding
        dimension. Sentences with no lemmas are left as all-NaN rows.
    """
    # Look words up through `.wv`; indexing the model object directly is
    # deprecated and no longer supported by newer gensim releases.
    vectors = model.wv
    # Take the width from the model so it cannot drift from the training size.
    features = np.full((len(tokenized_sentences), vectors.vector_size), np.nan)
    for row, lemmas in enumerate(tokenized_sentences):
        if lemmas:
            features[row, :] = np.mean([vectors[lemma] for lemma in lemmas], axis=0)
    return pd.DataFrame(features)


def _attach_features(features):
    """Join author/text with *features* column-wise and drop rows containing NaN."""
    combined = pd.concat([sentences[["author", "text"]], features], axis=1)
    # Rows left as NaN (sentences with no remaining lemmas) are removed here.
    return combined.dropna()


# One averaged-embedding frame per trained word2vec model.
word2vec_arr1 = _mean_sentence_vectors(model1, sentences["text"])
word2vec_arr2 = _mean_sentence_vectors(model2, sentences["text"])
word2vec_arr3 = _mean_sentence_vectors(model3, sentences["text"])
word2vec_arr4 = _mean_sentence_vectors(model4, sentences["text"])
word2vec_arr5 = _mean_sentence_vectors(model5, sentences["text"])
word2vec_arr6 = _mean_sentence_vectors(model6, sentences["text"])
word2vec_arr7 = _mean_sentence_vectors(model7, sentences["text"])

# Author, text and embedding columns side by side, ready for classification.
sentences1 = _attach_features(word2vec_arr1)
sentences2 = _attach_features(word2vec_arr2)
sentences3 = _attach_features(word2vec_arr3)
sentences4 = _attach_features(word2vec_arr4)
sentences5 = _attach_features(word2vec_arr5)
sentences6 = _attach_features(word2vec_arr6)
sentences7 = _attach_features(word2vec_arr7)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

def _feature_matrix(frame):
    """Return every column of *frame* except 'text' and 'author' as a NumPy array."""
    # Use the `columns=` keyword: the positional axis argument to drop() is
    # no longer accepted by recent pandas releases.
    return np.array(frame.drop(columns=['text', 'author']))


def _fit_and_report(model_number, X_train, X_test, y_train, y_test):
    """Fit lr, rfc and gbc on one split and print their train and test scores.

    The three module-level estimators are refit in place, so after the call
    they hold the fit for this split.

    Args:
        model_number: number of the word2vec model shown in the section header.
        X_train, X_test: feature matrices of the training and held-out splits.
        y_train, y_test: author labels matching the two splits.
    """
    print("-----------------------Word2vec Model {}------------------------------".format(model_number))
    classifiers = (
        ("Logistic Regression", lr),
        ("Random Forest", rfc),
        ("Gradient Boosting", gbc),
    )
    # Fit all three first, then report, matching the original cell order.
    for _, clf in classifiers:
        clf.fit(X_train, y_train)
    for label, clf in classifiers:
        print("----------------------{} Scores----------------------".format(label))
        print('Training set score:', clf.score(X_train, y_train))
        # Always score the held-out split here. The hand-written Model 7
        # gradient boosting section scored the training split by mistake.
        print('\nTest set score:', clf.score(X_test, y_test))


# Targets: the author label of every remaining sentence.
Y1 = sentences1['author']
Y2 = sentences2['author']
Y3 = sentences3['author']
Y4 = sentences4['author']
Y5 = sentences5['author']
Y6 = sentences6['author']
Y7 = sentences7['author']

# Features: the averaged word2vec columns only.
X1 = _feature_matrix(sentences1)
X2 = _feature_matrix(sentences2)
X3 = _feature_matrix(sentences3)
X4 = _feature_matrix(sentences4)
X5 = _feature_matrix(sentences5)
X6 = _feature_matrix(sentences6)
X7 = _feature_matrix(sentences7)

# We split the dataset into train and test sets
# (same test fraction and seed for every embedding).
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, Y1, test_size=0.4, random_state=123)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, Y2, test_size=0.4, random_state=123)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, Y3, test_size=0.4, random_state=123)
X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, Y4, test_size=0.4, random_state=123)
X_train5, X_test5, y_train5, y_test5 = train_test_split(X5, Y5, test_size=0.4, random_state=123)
X_train6, X_test6, y_train6, y_test6 = train_test_split(X6, Y6, test_size=0.4, random_state=123)
X_train7, X_test7, y_train7, y_test7 = train_test_split(X7, Y7, test_size=0.4, random_state=123)

# Models (default hyperparameters; refit for every embedding)
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

_fit_and_report(1, X_train1, X_test1, y_train1, y_test1)
_fit_and_report(2, X_train2, X_test2, y_train2, y_test2)
_fit_and_report(3, X_train3, X_test3, y_train3, y_test3)
_fit_and_report(4, X_train4, X_test4, y_train4, y_test4)
_fit_and_report(5, X_train5, X_test5, y_train5, y_test5)
_fit_and_report(6, X_train6, X_test6, y_train6, y_test6)
_fit_and_report(7, X_train7, X_test7, y_train7, y_test7)


-----------------------Word2vec Model 1------------------------------
----------------------Logistic Regression Scores----------------------
Training set score: 0.7877330126277812

Test set score: 0.799819657348963
----------------------Random Forest Scores----------------------
Training set score: 0.9933854479855683

Test set score: 0.8417493237150586
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8902585688514733

Test set score: 0.8390441839495041
-----------------------Word2vec Model 2------------------------------
----------------------Logistic Regression Scores----------------------
Training set score: 0.8009621166566446

Test set score: 0.8160504959422904
----------------------Random Forest Scores----------------------
Training set score: 0.9933854479855683

Test set score: 0.8458070333633905
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8938665063138905

Test set score: 0.8444544634806131
-

Model 7's performance seemed to be the best. In particular, the best test performance is achieved using model 6 and gradient boosting. Three random forest models also achieved the highest score when trained on model 6. 

Model 6's performance is also superior to that of the model in the checkpoint. However, there may be some overfitting involved.