In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter

from sklearn import ensemble
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
from sklearn.model_selection import cross_val_score, GridSearchCV

In [4]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [5]:
nlp = spacy.load('en')

In [6]:
# Parse the cleaned novels. This can take a bit.
alice_doc = nlp(alice)

In [7]:
persuasion_doc = nlp(persuasion)

In [8]:
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in alice_doc.sents]
persuasion_sents = [[sent, "Austen"] for sent in persuasion_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(alice_sents + persuasion_sents)
sentences.head()


Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [7]:
print(type(alice_sents))

<class 'list'>


In [9]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    


In [10]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

In [11]:
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)
# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000


Unnamed: 0,friend,entrance,hop,praise,charming,drawl,amiable,fling,glad,reasonable,...,mouse,six,player,lowing,objection,cutting,poker,seal,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(Oh, dear, !)",Carroll
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(I, shall, be, late, !, ')",Carroll


In [11]:
print(len(common_words))

3057


In [13]:
rfc = RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.982252141983

Test set score: 0.838990825688


In [14]:
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(3268, 3057) (3268,)
Training set score: 0.948286413709

Test set score: 0.884403669725


In [14]:
clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 0.856181150551

Test set score: 0.849082568807


# Challenge

In [15]:
# Utility function to create a list of the 5000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(5000)]
    


In [16]:
#Try adding some common phrases (500 from each author) from the works into our dataframe


def text_phrases(text):
    noun_phrases = [np.text for np in text.noun_chunks]
    return [item[0] for item in Counter(noun_phrases).most_common(500)]


# Set up
alice_phrases = text_phrases(alice_doc)
persuasion_phrases = text_phrases(persuasion_doc)

# Combine bags to create a set of unique words.
common_phrases = set(alice_phrases + persuasion_phrases)

In [17]:
def bow_features(sentences, common_words):
    
   
    
    df = pd.DataFrame(columns= set(list(common_words) + list(common_phrases )))

    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    df.loc[:, common_phrases] = 0
    df['sent_length'] = 0
    
    df['prev_sent_length'] = 0
    df['next_sent_length'] = 0
    df['num_words_repeated_from_prior_sent'] = 0
    
    for i, sentence in enumerate(df['text_sentence']):
        
          #Check to see if each phrase turns up in the sentence (store as binary var for the time being)
        
        for phrase in common_phrases:
            if phrase in str(sentence):
                df.loc[i, phrase] = 1
                
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        #Also add # of repeated words from one sentence to the next
        repeats = 0
        for word in words:
            df.loc[i, word] += 1
            if i > 0: 
                if ((df.loc[i-1, word] > 0) & (df.loc[i, word] > 0)):
                    repeats += 1
            else: 
                repeats = 0
        df['num_words_repeated_from_prior_sent'][i] = repeats        

        sent_len = 0    
        num_punct = 0 
        
        for token in sentence:
        
            if not token.is_punct:
                sent_len += 1
            else:
                num_punct += 1
        df.loc[i, 'sent_length'] = sent_len
        df.loc[i, 'sent_punct_count'] = num_punct
        
        if i > 0:
            df.loc[i, 'prev_sent_length'] = df.loc[i-1, 'sent_length']
        else:
            df.loc[i, 'prev_sent_length'] = np.nan
                              
        # This counter is just to make sure the kernel didn't hang.
                      
        if i % 500 == 0:
            print("Processing row {}".format(i))
    #Back out of the loop through sentences and just shift the df by one to get the "next sent len" feature
    df['next_sent_length'] = df['sent_length'].shift(-1)         
                
    return df

In [18]:
# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)
# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)


In [19]:

# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000
Processing row 3500
Processing row 4000
Processing row 4500
Processing row 5000


Unnamed: 0,friend,praise,tenantry,amiable,dozen,vanity,precept,ground,scene,bitter,...,thorough,player,lowing,text_sentence,text_source,sent_length,prev_sent_length,next_sent_length,num_words_repeated_from_prior_sent,sent_punct_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll,57,,56.0,0,10.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(So, she, was, considering, in, her, own, mind...",Carroll,56,57.0,29.0,21,7.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(There, was, nothing, so, VERY, remarkable, in...",Carroll,29,56.0,2.0,12,4.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(Oh, dear, !)",Carroll,2,29.0,4.0,2,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,"(I, shall, be, late, !, ')",Carroll,4,2.0,109.0,0,2.0


In [20]:
df = word_counts.copy()

In [21]:
df.shape

(5448, 5845)

In [22]:
def entityty_types(df):
    
    person_ent_type = []
    qty_ent_type = []
    ordinal_ent_type = []
    time_ent_type = []
    org_ent_type = []
    lang_ent_type = []
    date_ent_type = []
    card_ent_type = []
    gpe_ent_type = []
    fac_ent_type = []
    for i, sentence in enumerate(df['text_sentence']):
        person_count = 0
        qty_count= 0
        ordinal_count = 0
        time_count = 0
        org_count = 0
        lang_count = 0
        date_count= 0
        cardinal_count =0 
        gpe_count= 0
        fac_count = 0
    
        for token in sentence:
            if token.ent_type_ == 'PERSON':
                person_count += 1
        
            if token.ent_type_ == 'QUANTITY':
                qty_count += 1
            
            if token.ent_type_ == 'ORDINAL':
                ordinal_count += 1
            
            if token.ent_type_ == 'TIME':
                time_count += 1
            
            if token.ent_type_ == 'ORG':
                org_count += 1
            
            if token.ent_type_ == 'LANGUAGE':
                lang_count += 1
            if token.ent_type_ == 'DATE':
                date_count += 1            
        
            if token.ent_type_ == 'CARDINAL':
                cardinal_count += 1            
            if token.ent_type_ == 'GPE':
                gpe_count += 1            
            if token.ent_type_ == 'FAC':
                fac_count += 1            
            
        person_ent_type.append(person_count)
        qty_ent_type.append(qty_count)
        ordinal_ent_type.append(ordinal_count)
        time_ent_type.append(time_count)
        org_ent_type.append(org_count)
        lang_ent_type.append(lang_count)
        date_ent_type.append(date_count)
        card_ent_type.append(cardinal_count)
        gpe_ent_type.append(gpe_count)
        fac_ent_type.append(fac_count)

          
    df['person_ent'] = person_ent_type
    df['qty_ent'] = qty_ent_type
    df['ordinal_ent'] = ordinal_ent_type
    df['time_ent'] = time_ent_type
    df['org_ent'] = org_ent_type
    df['lang_ent'] = lang_ent_type
    df['date_ent'] = date_ent_type
    df['card_ent'] = card_ent_type
    df['gpe_ent'] = gpe_ent_type
    df['fac_ent'] = fac_ent_type
    return(df)

In [23]:
df = entityty_types(df)

In [24]:
df.shape

(5448, 5855)

In [26]:
def grammars(df):
    adv_count_list = []
    verb_count_list = []
    noun_count_list = []
    propnoun_count_list = []
    punc_count_list = []
    #-----------------
    part_cnt_list= []
    adj_cnt_list= []
    adp_cnt_list= []
    det_cnt_list= []

    for sentence in df['text_sentence']:
        
        advs_cnt = 0
        verb_cnt = 0
        noun_cnt = 0
        propnoun_cnt = 0
        punc_cnt = 0
    #-----------------
        part_cnt= 0
        adj_cnt= 0
        adp_cnt= 0
        det_cnt= 0
        
        for token in sentence:
            if token.pos_ == 'ADV':
                advs_cnt +=1
            if token.pos_ == 'VERB':
                verb_cnt +=1
            if token.pos_ == 'NOUN':
                noun_cnt +=1
            if token.pos_ == 'PROPN':
                propnoun_cnt +=1
            if token.pos_ == 'PUNCT':
                punc_cnt +=1
    #---------------------------------------
            if token.pos_ == 'PART':
                part_cnt +=1
            if token.pos_ == 'ADJ':
                adj_cnt +=1
            if token.pos_ == 'ADP':
                adp_cnt +=1
            if token.pos_ == 'DET':
                det_cnt +=1
        
        adv_count_list.append(advs_cnt)
        verb_count_list.append(verb_cnt)
        noun_count_list.append(noun_cnt)
        propnoun_count_list.append(propnoun_cnt)
        punc_count_list.append(punc_cnt)
        #----------------------------------
        part_cnt_list.append(part_cnt)
        adj_cnt_list.append(adj_cnt)
        adp_cnt_list.append(adp_cnt)
        det_cnt_list.append(det_cnt)
    #---------------------------------------    
        
    df['adv_count'] = adv_count_list
    df['verb_count'] = verb_count_list
    df['noun_count'] = noun_count_list
    df['pronoun_count'] = propnoun_count_list
    df['punc_count'] = punc_count_list

    #-----------------
    df['part_cnt'] = part_cnt_list
    df['adj_cnt'] = adj_cnt_list
    df['adp_cnt'] = adp_cnt_list
    df['det_cnt'] = det_cnt_list
    return(df)

In [27]:
df= grammars(df)

In [28]:
df.shape

(5448, 5864)

In [29]:
df.fillna(0,inplace=True)

In [124]:
#df.to_csv('Check.csv')

In [30]:
Y = df['text_source']
X = np.array(df.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=0)

Try Classifiers with default parameters and 5 fold cross validation

In [34]:
def classifiers_all():
    print('\nRandom Forest Classifier:')
    rfc = ensemble.RandomForestClassifier()
    rfc.fit(X_train, y_train)

    rfc_train_score = cross_val_score(rfc, X_train, y_train, cv= 5)
    rfc_test_score = cross_val_score(rfc, X_test, y_test, cv = 5)
    #print('Training set score:',rfc_train_score )


    print('\t\tAverage score on Train Data %.2f +/- %.2f'% (rfc_train_score.mean(),rfc_train_score.std() ))
    print('\t\tAverage score on test Data %.2f +/- %.2f'%(rfc_test_score.mean(), rfc_test_score.std()))
    
    print()
    print('Logistic Regression')   
    lr = LogisticRegression()
    lr.fit(X_train, y_train)

    lr_train_score = cross_val_score(lr, X_train, y_train, cv= 5)
    lr_test_score = cross_val_score(lr, X_test, y_test, cv = 5)
    print('\t\tAverage score on Train Data %.2f +/- %.2f'% (lr_train_score.mean(),lr_train_score.std() ))
    print('\t\tAverage score on test Data %.2f +/- %.2f'%(lr_test_score.mean(), lr_test_score.std()))
    print()
    print('Gradient Boosting Classifier')    
    gbc = ensemble.GradientBoostingClassifier()
    gbc.fit(X_train, y_train)
    gbc_train_score = cross_val_score(gbc, X_train, y_train, cv= 5)
    gbc_test_score = cross_val_score(gbc, X_test, y_test, cv = 5)


    print('\t\tAverage score on Train Data %.2f +/- %.2f'% (gbc_train_score.mean(),gbc_train_score.std() ))
    print('\t\tAverage score on test Data %.2f +/- %.2f'%(gbc_test_score.mean(), gbc_test_score.std()))
    

In [35]:
classifiers_all()


Random Forest Classifier:
		Average score on Train Data 0.84 +/- 0.01
		Average score on test Data 0.83 +/- 0.01

Logistic Regression
		Average score on Train Data 0.90 +/- 0.01
		Average score on test Data 0.89 +/- 0.02

Gradient Boosting Classifier
		Average score on Train Data 0.86 +/- 0.02
		Average score on test Data 0.87 +/- 0.02


Let's try hyperparameter Tuning for random forest classifier

In [150]:
rfc_params  = {
    'n_estimators':[100,200,500],
    'max_features':['auto', 'sqrt', 'log2'],
    'max_depth':[4, 6,7, 8, None],
    'min_samples_split':[2, 8]
}
rfc_grid = GridSearchCV(ensemble.RandomForestClassifier(random_state=10), param_grid=rfc_params)
rfc_grid.fit(X_train, y_train)

print(rfc_grid.best_score_)
print(rfc_grid.best_params_)

0.860215053763
{'max_depth': None, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 500}


Run the model with the best parameters which we got from GridSearchCV.

In [151]:
rfc = ensemble.RandomForestClassifier(**rfc_grid.best_params_)
rfc.fit(X_train, y_train)

rfc_train_score = cross_val_score(rfc, X_train, y_train, cv= 5)
rfc_test_score = cross_val_score(rfc, X_test, y_test, cv = 5)
print('Training set score:',rfc_train_score )
print('\nTest set score:',rfc_test_score )

print('\nAverage score on Train Data %.2f +/- %.2f'% (rfc_train_score.mean(),rfc_train_score.std() ))
print('\nAverage score on test Data %.2f +/- %.2f'%(rfc_test_score.mean(), rfc_test_score.std()))

Training set score: [ 0.85190039  0.87155963  0.84665793  0.87287025  0.86333771]

Test set score: [ 0.84756098  0.83486239  0.81345566  0.85321101  0.88957055]

Average score on Train Data 0.86 +/- 0.01

Average score on test Data 0.85 +/- 0.02


In [36]:
from sklearn.preprocessing import StandardScaler

In [37]:
df[['adv_count','verb_count','noun_count','pronoun_count','punc_count','part_cnt','adj_cnt','adp_cnt','det_cnt']] = \
StandardScaler().fit_transform(df[['adv_count','verb_count','noun_count','pronoun_count','punc_count','part_cnt','adj_cnt','adp_cnt','det_cnt']])


In [38]:
df[['sent_length']]  = StandardScaler().fit_transform(df[['sent_length']])

In [39]:
df[['person_ent','qty_ent','ordinal_ent','time_ent','org_ent','lang_ent','date_ent','card_ent','gpe_ent','fac_ent']] = \
StandardScaler().fit_transform(df[['person_ent','qty_ent','ordinal_ent','time_ent','org_ent','lang_ent','date_ent','card_ent','gpe_ent','fac_ent']])

In [40]:
Y = df['text_source']
X = np.array(df.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=0)

In [41]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_train_score = cross_val_score(lr, X_train, y_train, cv= 5)
lr_test_score = cross_val_score(lr, X_test, y_test, cv = 5)
print('Training set score:',lr_train_score )
print('\nTest set score:',lr_test_score )

print('\nAverage score on Train Data %.2f +/- %.2f'% (lr_train_score.mean(),lr_train_score.std() ))
print('\nAverage score on test Data %.2f +/- %.2f'%(lr_test_score.mean(), lr_test_score.std()))

Training set score: [ 0.89252949  0.8938401   0.88728702  0.92398427  0.91064389]

Test set score: [ 0.91158537  0.88685015  0.86544343  0.88685015  0.90490798]

Average score on Train Data 0.90 +/- 0.01

Average score on test Data 0.89 +/- 0.02


In [94]:
parameters =[ {'C': [0.01, 0.1, 1, 10, 100],'solver':['liblinear'],'penalty':['l1', 'l2'],'fit_intercept':[True]},
            {'C': [0.01, 0.1, 1, 10, 100],'solver':['lbfgs','newton-cg'],'fit_intercept':[True]}
            ]

gr_logr = GridSearchCV(lr,param_grid = parameters )
gr_logr.fit(X_train,y_train)
print('Best Parameter ', gr_logr.best_params_)

Best Parameter  {'C': 1, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'liblinear'}


In [160]:
lr = LogisticRegression(**gr_logr.best_params_, random_state = 10)
lr.fit(X_train, y_train)
lr_train_score = cross_val_score(lr, X_train, y_train, cv= 5)
lr_test_score = cross_val_score(lr, X_test, y_test, cv = 5)
print('Training set score:',lr_train_score )
print('\nTest set score:',lr_test_score )

print('\nAverage score on Train Data %.2f +/- %.2f'% (lr_train_score.mean(),lr_train_score.std() ))
print('\nAverage score on test Data %.2f +/- %.2f'%(lr_test_score.mean(), lr_test_score.std()))

Training set score: [ 0.89252949  0.8938401   0.88728702  0.92398427  0.91064389]

Test set score: [ 0.91158537  0.88685015  0.86544343  0.88685015  0.90490798]

Average score on Train Data 0.90 +/- 0.01

Average score on test Data 0.89 +/- 0.02


In [161]:
from  sklearn.svm import SVC
svc = SVC()
train = svc.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', svc.score(X_train, y_train))
print('\nTest set score:', svc.score(X_test, y_test))

(3813, 5862) (3813,)
Training set score: 0.682926829268

Test set score: 0.698470948012


# Conclusion
We tried adding more features like including grammar, phrases, POS and sentence-level features like number of words, amount of punctuation, length of previous and next sentences, words repeated from one sentence to the next etc. 
Also used cross_validation with 5 folds.
 
After all these improvements, For random forest classifier, we got 2% increase in the score, while for logistic regression, we got 1% increase.

# CHALLENGE 1

# Milton vs. Carroll

In [42]:
#Load in and clean a new book, using our text_cleaner function and the spacy load function
paradise = gutenberg.raw('milton-paradise.txt')
paradise = re.sub(r'CHAPTER .*', '', paradise)
paradise = text_cleaner(paradise)

#Spacy load
paradise_doc = nlp(paradise)

In [43]:
#Extract and store sentences from spacy doc
paradise_sents = [[sent, 'Milton'] for sent in paradise_doc.sents]

#Add the paradise sentences to our existing sentences DF
sentences = pd.DataFrame(alice_sents + persuasion_sents + paradise_sents)
sentences.head()

Unnamed: 0,0,1
0,"(Alice, was, beginning, to, get, very, tired, ...",Carroll
1,"(So, she, was, considering, in, her, own, mind...",Carroll
2,"(There, was, nothing, so, VERY, remarkable, in...",Carroll
3,"(Oh, dear, !)",Carroll
4,"(I, shall, be, late, !, ')",Carroll


In [72]:
paradise_sentences = pd.DataFrame(paradise_sents)
paradise_bow = bow_features(paradise_sentences, common_words)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000


In [73]:
paradise_bow = entityty_types(paradise_bow)
paradise_bow= grammars(paradise_bow)
paradise_bow.fillna(0,inplace=True)

In [79]:
alice_wc = word_counts[word_counts.text_source == 'Carroll']

In [81]:
alice_wc.fillna(0,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [101]:
paradise_bow2 = paradise_bow.iloc[:,0:8953]

In [102]:
print(paradise_bow2.shape)
print(alice_wc.shape)

(3241, 8953)
(1740, 8953)


In [103]:
# Identifying variables
X_paradise = paradise_bow2.drop(['text_sentence','text_source'], 1)
y_paradise = paradise_bow2.text_source


X_alice = alice_wc.drop(['text_sentence','text_source'], 1)
y_alice = alice_wc.text_source

# Combine the Paradise sentence data with the Alice data from the test set.
X_pa = pd.concat([X_paradise, X_alice], 0)
y_pa = pd.concat([y_paradise, y_alice], 0)


In [104]:
# Split into train and test sets
X_train_pa, X_test_pa, y_train_pa, y_test_pa = train_test_split(X_pa, y_pa, test_size=0.4, random_state=0)

In [106]:
# Model.
lr = LogisticRegression()
lr.fit(X_train_pa, y_train_pa)
lr_train_score = cross_val_score(lr, X_train_pa, y_train_pa, cv= 5)
lr_test_score = cross_val_score(lr, X_test_pa, y_test_pa, cv = 5)
print('Training set score:',lr_train_score )
print('\nTest set score:',lr_test_score )

print('\nAverage score on Train Data %.2f +/- %.2f'% (lr_train_score.mean(),lr_train_score.std() ))
print('\nAverage score on test Data %.2f +/- %.2f'%(lr_test_score.mean(), lr_test_score.std()))


Training set score: [ 0.95986622  0.95652174  0.9548495   0.96314908  0.94304858]

Test set score: [ 0.94235589  0.94987469  0.94235589  0.94221106  0.93969849]

Average score on Train Data 0.96 +/- 0.01

Average score on test Data 0.94 +/- 0.00


In [107]:
lr_pa_predicted = lr.predict(X_test_pa)
pd.crosstab(y_test_pa, lr_pa_predicted)

col_0,Carroll,Milton
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Carroll,660,50
Milton,45,1238


These same common words and features are pretty good at separating sentences by Carroll and Milton. This means that the features are able to identify Carroll from other works. Now, let's see if the same features are able to identify Austen from other works.

# Milton vs. Austen

In [109]:
persuasion_wc = word_counts[word_counts.text_source == 'Austen']
persuasion_wc.shape

(3708, 8953)

In [112]:
# Identifying variables
X_paradise = paradise_bow2.drop(['text_sentence','text_source'], 1)
y_paradise = paradise_bow2.text_source


X_persuasion = persuasion_wc.drop(['text_sentence','text_source'], 1)
y_persuasion = persuasion_wc.text_source

# Combine the Paradise sentence data with the Alice data from the test set.
X_pp = pd.concat([X_paradise, X_persuasion], 0)
y_pp = pd.concat([y_paradise, y_persuasion], 0)

# Split into train and test sets
X_train_pp, X_test_pp, y_train_pp, y_test_pp = train_test_split(X_pp, y_pp, test_size=0.4, random_state=0)

# Model.
lr = LogisticRegression()
lr.fit(X_train_pp, y_train_pp)
lr_train_score = cross_val_score(lr, X_train_pp, y_train_pp, cv= 5)
lr_test_score = cross_val_score(lr, X_test_pp, y_test_pp, cv = 5)
print('Training set score:',lr_train_score )
print('\nTest set score:',lr_test_score )

print('\nAverage score on Train Data %.2f +/- %.2f'% (lr_train_score.mean(),lr_train_score.std() ))
print('\nAverage score on test Data %.2f +/- %.2f'%(lr_test_score.mean(), lr_test_score.std()))

Training set score: [ 0.94850299  0.94004796  0.95203837  0.96038415  0.95438175]

Test set score: [ 0.92625899  0.93345324  0.92266187  0.96043165  0.94964029]

Average score on Train Data 0.95 +/- 0.01

Average score on test Data 0.94 +/- 0.01


In [113]:
lr_pp_predicted = lr.predict(X_test_pp)
pd.crosstab(y_test_pp, lr_pp_predicted)

col_0,Austen,Milton
text_source,Unnamed: 1_level_1,Unnamed: 2_level_1
Austen,1416,64
Milton,77,1223


The same feature space is able to distinguish between Austen and Milton. This means that, in general, a set of common words, along with sentence length, and the numbers of different parts of speech, are enough to tell which author wrote a particular sentence.

# Conclusion
The same feature space is able to distinguish between Austen and Milton or y Carroll and Milton. 
This time we got scores of 94%