In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import spacy
import nltk
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

In [3]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice[:int(len(alice)/10)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/10)])

In [4]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [5]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [20]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]

In [21]:
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [22]:
len(common_words)

1647

In [23]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
# Also features for sentence length, amount of punctuation per sentence,
# and parts of speech counts
def nlp_features(sentences, common_words):
    
    # Scaffold the data frame
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    # Set empty features
    df['sentence_length'] = np.nan
    df['punctuation_count'] = np.nan
    df['noun_count'] = np.nan
    df['verb_count'] = np.nan
    df['adjective_count'] = np.nan
    df['adverb_count'] = np.nan
    df['pronoun_count'] = np.nan
    df['proper_noun_count'] = np.nan
    df['conjunction_count'] = np.nan
    # Initialize word counts to zer0
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # Create the other feature counts
        df.loc[i, 'sentence_length'] = len([token for token in sentence if not token.is_punct])
        df.loc[i, 'punctuation_count'] = len([token for token in sentence if token.is_punct])
        df.loc[i, 'noun_count'] = len([token for token in sentence if token.pos_ == 'NOUN'])
        df.loc[i, 'verb_count'] = len([token for token in sentence if token.pos_ == 'VERB'])
        df.loc[i, 'conjunction_count'] = len([token for token in sentence if token.pos_ == 'CONJ' or 'ADP'])
        df.loc[i, 'adjective_count'] = len([token for token in sentence if token.pos_ == 'ADJ'])
        df.loc[i, 'adverb_count'] = len([token for token in sentence if token.pos_ == 'ADV'])
        df.loc[i, 'pronoun_count'] = len([token for token in sentence if token.pos_ == 'PRON'])
        df.loc[i, 'proper_noun_count'] = len([token for token in sentence if token.pos_ == 'PROPN'])
            
    return df

In [37]:
nlp = nlp_features(sentences, common_words)

In [38]:
nlp.head()

Unnamed: 0,scruple,disdain,furniture,intimate,laugh,beautifully,thought,feel,write,mr,...,text_source,sentence_length,punctuation_count,noun_count,verb_count,adjective_count,adverb_count,pronoun_count,proper_noun_count,conjunction_count
0,0,0,0,0,0,0,0,0,0,0,...,Carroll,57.0,10.0,12.0,13.0,3.0,3.0,3.0,2.0,67.0
1,0,0,0,0,0,0,0,1,0,0,...,Carroll,56.0,7.0,8.0,11.0,7.0,7.0,4.0,2.0,63.0
2,0,0,0,0,0,0,0,0,0,0,...,Carroll,29.0,4.0,2.0,5.0,1.0,6.0,2.0,2.0,33.0
3,0,0,0,0,0,0,0,0,0,0,...,Carroll,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,0,0,0,0,0,0,0,0,0,0,...,Carroll,4.0,2.0,0.0,2.0,1.0,0.0,1.0,0.0,6.0


In [40]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

X = nlp.drop(columns=['text_source', 'text_sentence'])
y = nlp['text_source']

lr = LogisticRegression(penalty='l2', solver='lbfgs')
cross_val_score(lr, X, y, cv=5)

array([0.91764706, 0.87951807, 0.95180723, 0.8313253 , 0.79518072])