For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

1. Data cleaning / processing / language parsing
2. Create features using two different NLP methods: For example, BoW vs tf-idf.
3. Use the features to fit supervised learning models for each feature set to predict the category outcomes.
4. Assess your models using cross-validation and determine whether one model performed better.
5. Pick one of the models and try to increase accuracy by at least 5 percentage points.

In [34]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
import gensim
from gensim.models import word2vec
from sklearn.pipeline import make_pipeline
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB, GaussianNB

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Utility Functions

In [35]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [36]:
def bag_of_words(text):
    '''Filter out punctuation and stop words.'''
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [37]:
def bow_features(sentences, common_words):
    '''Creates a data frame with features for each word in our common word set.
    Each value is the count of the times the word appears in each sentence.'''
    counts = np.zeros((len(sentences),len(common_words)), dtype=int)
   
    # Process each row, counting the occurrence of words in each sentence.
    text_sentences = list(sentences[0])
    for i, sentence in enumerate(text_sentences):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            counts[i, common_words.index(word)] += 1
        
        # This counter is just to make sure the kernel didn't hang.
#         if i % 1000 == 0:
#             print("Processing row {}".format(i))
        
    counts_df = pd.DataFrame(data=counts, columns=common_words)
    counts_df['text_sentence'] = sentences[0]
    counts_df['text_source'] = sentences[1]
    
    return counts_df

In [38]:
def make_bow_df(texts, authors):
    cleaned_texts = [text_cleaner(text) for text in texts]
    
    nlp = spacy.load('en')
    docs = [nlp(cleaned) for cleaned in cleaned_texts]
    
    bags = [bag_of_words(doc) for doc in docs]
    common_words = list({word for bag in bags for word in bag})
    
    sentences = pd.DataFrame([[sent, authors[i]] for i, doc in enumerate(docs) for sent in doc.sents])
    
    counts_df = bow_features(sentences, common_words)
    
    return counts_df

In [39]:
def tfidf_features(sentences):
    vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=5, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )
    text_sentences = [str(sent) for sent in sentences[0]]
    tfidf = vectorizer.fit_transform(text_sentences).tocsr()
    
    df = pd.DataFrame(tfidf.todense(), columns=vectorizer.get_feature_names())
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    
    return df

In [40]:
def make_tfidf_df(texts, authors):
    cleaned_texts = [text_cleaner(text) for text in texts]
    
    nlp = spacy.load('en')
    docs = [nlp(cleaned) for cleaned in cleaned_texts]
    
    sentences = pd.DataFrame([[sent, authors[i]] for i, doc in enumerate(docs) for sent in doc.sents])
    
    df = tfidf_features(sentences)
    
    return df

# Choose corpus

In [41]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [42]:
# lets pick caesar, thursday, and stories
caesar = gutenberg.raw('shakespeare-caesar.txt')
thursday = gutenberg.raw('chesterton-thursday.txt')
stories = gutenberg.raw('bryant-stories.txt')

# Make features

In [43]:
texts = [caesar, thursday, stories]
authors = ['Shakespeare', 'Chesterton', 'Bryant']

In [44]:
bow_df = make_bow_df(texts, authors)
bow_df.head()

Unnamed: 0,fig,italy,appoint,fashter,son,swiftly,darknesse,morrow,iust,utterly,...,tragic,roman,husband,however,adventures,love,breath,railway,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Actus, Primus, .)",Shakespeare
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Scoena, Prima, .)",Shakespeare
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Shakespeare
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Flauius, .)",Shakespeare
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Hence, :, home, you, idle, Creatures, ,, get,...",Shakespeare


In [45]:
tfidf_df = make_tfidf_df(texts, authors)
tfidf_df.head()

Unnamed: 0,able,abrupt,abruptly,absent,absurd,accent,accept,accident,acorn,act,...,year,years,yellow,yes,yesterday,yong,young,zelia,text_sentence,text_source
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(Actus, Primus, .)",Shakespeare
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(Scoena, Prima, .)",Shakespeare
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Shakespeare
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(Flauius, .)",Shakespeare
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"(Hence, :, home, you, idle, Creatures, ,, get,...",Shakespeare


In [46]:
X_bow = np.array(bow_df.drop(['text_sentence','text_source'], 1))
y_bow = bow_df['text_source']

X_tfidf = np.array(tfidf_df.drop(['text_sentence','text_source'], 1))
y_tfidf = tfidf_df['text_source']

# Models

## Naive Bayes

In [47]:
# Bag of Words
model = MultinomialNB()
cv = cross_val_score(model, X_bow, y_bow, cv=6)
print(cv.mean())
cv

0.8428004197839233


array([0.84506041, 0.85704125, 0.87250712, 0.85327635, 0.83618234,
       0.79273504])

In [48]:
# tfidf
model = MultinomialNB()
cv = cross_val_score(model, X_tfidf, y_tfidf, cv=6)
print(cv.mean())
cv

0.8051999666623915


array([0.78464819, 0.82076814, 0.85754986, 0.81125356, 0.8048433 ,
       0.75213675])

## Logistic Regression

In [49]:
# Bag of Words
model = LogisticRegression(solver='liblinear', multi_class='auto')
cv = cross_val_score(model, X_bow, y_bow, cv=6)
print(cv.mean())
cv

0.8069751964096034


array([0.81023454, 0.80512091, 0.84045584, 0.81766382, 0.8048433 ,
       0.76353276])

In [50]:
# Bag of Words
model = LogisticRegression(solver='liblinear', multi_class='auto')
cv = cross_val_score(model, X_tfidf, y_tfidf, cv=6)
print(cv.mean())
cv

0.7836014432636585


array([0.77754087, 0.80440967, 0.83974359, 0.77849003, 0.77706553,
       0.72435897])

## Random Forest

In [51]:
# bag of words
model = RandomForestClassifier(n_estimators=10)
cv = cross_val_score(model, X_bow, y_bow, cv=6)
print(cv.mean())
cv

0.7127735155687499


array([0.71926084, 0.71906117, 0.75213675, 0.71438746, 0.71509972,
       0.65669516])

In [52]:
# tfidf
model = RandomForestClassifier(n_estimators=10)
cv = cross_val_score(model, X_tfidf, y_tfidf, cv=6)
print(cv.mean())
cv

0.727725463782023


array([0.72850036, 0.72759602, 0.77136752, 0.74358974, 0.71438746,
       0.68091168])

# Improve Bag of Words Naive Bayes

## Bigger bag of words

In [53]:
# rewrite bag of words function to make a bigger bag of words
def bag_of_words(text):
    '''Filter out punctuation and stop words.'''
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(16000)]

In [54]:
big_bow_df = make_bow_df(texts, authors)
big_bow_df.head()

Unnamed: 0,exception,italy,six,gild,locust,waye,rhinoceros,valuable,india,outspread,...,loiterer,shtick,somersault,sweep,vouchsafe,clatter,however,breath,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Actus, Primus, .)",Shakespeare
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Scoena, Prima, .)",Shakespeare
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Enter, Flauius, ,, Murellus, ,, and, certaine...",Shakespeare
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Flauius, .)",Shakespeare
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Hence, :, home, you, idle, Creatures, ,, get,...",Shakespeare


In [55]:
X_big_bow = np.array(big_bow_df.drop(['text_sentence','text_source'], 1))
y_big_bow = big_bow_df['text_source']

In [56]:
# Bag of Words
model = MultinomialNB()
cv = cross_val_score(model, X_big_bow, y_big_bow, cv=6)
print(cv.mean())
cv

0.8385342571320576


array([0.83013504, 0.8485064 , 0.86965812, 0.85683761, 0.82977208,
       0.7962963 ])

## Use length of sentences

In [57]:
bow_df['sentence_length'] = bow_df['text_sentence'].apply(lambda x: len(x))

In [59]:
print('Shakespeare mean sent length:', bow_df[bow_df['text_source']=='Shakespeare']['sentence_length'].mean())
print('Chesterton mean sent length:', bow_df[bow_df['text_source']=='Chesterton']['sentence_length'].mean())
print('Bryant mean sent length:', bow_df[bow_df['text_source']=='Bryant']['sentence_length'].mean())

Shakespeare mean sent length: 11.727611940298507
Chesterton mean sent length: 19.92470051340559
Bryant mean sent length: 20.203670385030588


In [60]:
bow_df['long_sentence'] = bow_df['sentence_length'].apply(lambda x: x//7)
# big_bow_df['short_sentence'] = big_bow_df['sentence_length'].apply(lambda x: x <= 17)

In [61]:
X_bow = np.array(bow_df.drop(['text_sentence','text_source', 'sentence_length'], 1))
y_bow = bow_df['text_source']

In [62]:
# Bag of Words
model = MultinomialNB(fit_prior=False)
cv = cross_val_score(model, X_bow, y_bow, cv=6)
print(cv.mean())
cv

0.8452917771783968


array([0.84363895, 0.8655761 , 0.87250712, 0.85683761, 0.83333333,
       0.79985755])

## Use Rounded Avg Word Length and Length of Sentence

In [63]:
def avg_word_length(sentence):
    words = [token
                for token in sentence
                if not token.is_punct
                and not token.is_stop]
    lengths = np.array([len(word) for word in words])
    return round(np.mean(lengths), 0)

bow_df['avg_word_length'] = bow_df['text_sentence'].apply(avg_word_length)
bow_df['avg_word_length'] = bow_df['avg_word_length'].fillna(0)

  out=out, **kwargs)


In [64]:
print('Shakespeare sentences mean avg word length:', bow_df[bow_df['text_source']=='Shakespeare']['avg_word_length'].mean())
print('Chesterton sentences mean avg word length:', bow_df[bow_df['text_source']=='Chesterton']['avg_word_length'].mean())
print('Bryant sentences mean avg word length:', bow_df[bow_df['text_source']=='Bryant']['avg_word_length'].mean())

Shakespeare sentences mean avg word length: 4.627798507462686
Chesterton sentences mean avg word length: 5.276098117512835
Bryant sentences mean avg word length: 4.7430730478589425


In [65]:
X_bow = np.array(bow_df.drop(['text_sentence','text_source', 'sentence_length'], 1))
y_bow = bow_df['text_source']

In [66]:
# Bag of Words
model = MultinomialNB(fit_prior=False)
cv = cross_val_score(model, X_bow, y_bow, cv=6)
print(cv.mean())
cv

0.8419711474175994


array([0.83795309, 0.86059744, 0.86965812, 0.85968661, 0.82834758,
       0.79558405])