In [29]:
import numpy as np
import pandas as pd
import sklearn
import spacy
import re
from nltk.corpus import gutenberg
import nltk
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
nltk
nltk.download('gutenberg')
from nltk import pos_tag
from nltk.text import Text
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/robholmstrom/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [3]:
def text_cleaner(text):
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = ' '.join(text.split())
    return text

In [4]:
persuasion1 = gutenberg.raw('austen-persuasion.txt')
alice1 = gutenberg.raw('carroll-alice.txt')

# the Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion1)
alice = re.sub(r'CHAPTER .*', '', alice1)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [5]:
# parse the cleaned novels. this can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [6]:
# group into sentences
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# combine the sentences from the two novels into one data frame
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns = ["text", "author"])
sentences.head()

Unnamed: 0,text,author
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(Oh, dear, !)",Carroll


In [7]:
#get rid off stop words and punctuation
# and lemmatize the tokens
for i, sentence in enumerate(sentences["text"]):
    sentences.loc[i, "text"] = " ".join(
        [token.lemma_ for token in sentence if not token.is_punct and not token.is_stop])

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word')
X = vectorizer.fit_transform(sentences["text"])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
sentences = pd.concat([bow_df, sentences[["text", "author"]]], axis=1)

In [9]:
sentences['text']

0       Alice begin tired sit sister bank have twice p...
1       consider mind hot day feel sleepy stupid pleas...
2                  remarkable Alice think way hear Rabbit
3                                                 oh dear
4                                                 oh dear
                              ...                        
5627    spring felicity glow spirit friend Anne warmth...
5628    Anne tenderness worth Captain Wentworth affection
5629    profession friend wish tenderness dread future...
5630    glory sailor wife pay tax quick alarm belong p...
5631                                                Finis
Name: text, Length: 5632, dtype: object

Your task is to increase the performance of the models we implemented in the BoW example. Suggested avenues of investigation include:



In [10]:

Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)


print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))



----------------------Logistic Regression Scores----------------------
Training set score: 0.9354838709677419

Test set score: 0.8761651131824234
----------------------Random Forest Scores----------------------
Training set score: 0.9795797573246523

Test set score: 0.854416333777186
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8514353358981948

Test set score: 0.8362183754993342


Other modeling techniques and models



In [11]:
from sklearn.neighbors import  KNeighborsClassifier
kn =  KNeighborsClassifier()
kn.fit(X_train, y_train)

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', kn.score(X_train, y_train))
print('\nTest set score:', kn.score(X_test, y_test))

----------------------Gradient Boosting Scores----------------------
Training set score: 0.8144421426457532

Test set score: 0.746560142032845


In [12]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train, y_train)

print
svc.score(X_test, y_test)

0.8477585441633377

Making more features that take advantage of the SpaCy information (include grammar, phrases, POS, etc)



In [54]:
textlist = []
tokens = [word_tokenize(token) for token in sentences['text']]
# for word in tokens:
    
#     tags = pos_tag(word.lower())
#     textlist.append(tags)
pos_tag(tokens[0])

[('Alice', 'NNP'),
 ('begin', 'NN'),
 ('tired', 'VBD'),
 ('sit', 'NNS'),
 ('sister', 'JJ'),
 ('bank', 'NN'),
 ('have', 'VBP'),
 ('twice', 'RB'),
 ('peep', 'JJ'),
 ('book', 'NN'),
 ('sister', 'NN'),
 ('read', 'JJ'),
 ('picture', 'NN'),
 ('conversation', 'NN'),
 ('use', 'VBP'),
 ('book', 'NN'),
 ('think', 'NN'),
 ('Alice', 'NNP'),
 ('picture', 'NN'),
 ('conversation', 'NN')]

Making sentence-level features (number of words, amount of punctuation)



In [26]:

sentences['sentlength']= [len(sentences['text'][x]) for x in range(len(sentences['text']))]


In [23]:
sentences['sent_withpunct'] = [sent for sent in alice_doc.sents] + [sent for sent in persuasion_doc.sents]
for i, sentence in enumerate(sentences['sent_withpunct']):
    sentences.loc[i, 'sent_withpunct'] = " ".join([token.lemma_ for token in sentence if not token.is_stop])


In [24]:
sentences['sentlengthwpunct'] = [len(sentences['sent_withpunct'][x]) for x in range(len(sentences['text']))]


In [25]:
sentences['punctlenth'] = sentences['sentlengthwpunct']-sentences['sentlength']


In [21]:
X2 = np.array(sentences.drop(['text', 'author', 'sent_withpunct', 'sentlengthwpunct' ], 1))
y2 = sentences['author']

Including contextual information (length of previous and next sentences, words repeated from one sentence to the next, etc)



Or anything else your heart desires.



Compare your models' performances with those of the example.



In [22]:


# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)


print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))



----------------------Logistic Regression Scores----------------------
Training set score: 0.9348919798757028

Test set score: 0.8819351975144252
----------------------Random Forest Scores----------------------
Training set score: 0.986090559337082

Test set score: 0.8695073235685752
----------------------Gradient Boosting Scores----------------------
Training set score: 0.8514353358981948

Test set score: 0.8384376387039503


In the 2-gram example above, we only used 2-gram as our features. This time, use both 1-gram and 2-gram features together as your feature set. Run the same models in the example and compare the results.

In [57]:
sentences.head()

Unnamed: 0,1st,29th,abbreviation,abdication,abide,ability,able,abode,abominable,abominate,...,younker,youth,youthful,zeal,zealand,zealous,zealously,zigzag,text,author
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Alice begin tired sit sister bank have twice p...,Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,remarkable Alice think way hear Rabbit,Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll


In [58]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1,2))
X = vectorizer.fit_transform(sentences["text"])
bow_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
sentences = pd.concat([bow_df, sentences[["text", "author"]]], axis=1)
sentences.head()

Unnamed: 0,1st,29th,29th september,abbreviation,abbreviation living,abdication,abdication neighbour,abide,abide consequence,abide figure,...,zealand australia,zealous,zealous officer,zealous subject,zealously,zealously discharge,zigzag,zigzag go,text,author
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Alice begin tired sit sister bank have twice p...,Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,consider mind hot day feel sleepy stupid pleas...,Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,remarkable Alice think way hear Rabbit,Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,oh dear,Carroll


In [59]:
Y = sentences['author']
X = np.array(sentences.drop(['text','author'], 1))

# We split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=123)

# Models
lr = LogisticRegression()
rfc = RandomForestClassifier()
gbc = GradientBoostingClassifier()

lr.fit(X_train, y_train)
rfc.fit(X_train, y_train)
gbc.fit(X_train, y_train)

print("----------------------Logistic Regression Scores----------------------")
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

print("----------------------Random Forest Scores----------------------")
print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

print("----------------------Gradient Boosting Scores----------------------")
print('Training set score:', gbc.score(X_train, y_train))
print('\nTest set score:', gbc.score(X_test, y_test))

KeyboardInterrupt: 