# Create 200 documents each of 150 words from 8 different authors from Gutenberg library

In [1]:
import nltk
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk import word_tokenize
from nltk.corpus import stopwords
import random
import re
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

stop_words = set(stopwords.words('english'))

def create_document(raw_text,author_name):
    sentences = nltk.sent_tokenize(raw_text)
    random.shuffle(sentences)
    word_count_dic = []
    documents = []
    for sentence in sentences:
        words_in_sentence = nltk.word_tokenize(sentence)
        for word in words_in_sentence:
            if len(word_count_dic) > 149:
                documents.append((" ".join(word_count_dic.copy()),author_name))
                word_count_dic.clear()
                if(len(documents) > 199):
                    return documents
            if not word.lower() in stop_words and re.match('[a-z]+', word.lower()):
                word_count_dic.append(word.lower())     
    return documents

In [2]:
final_documents= []

text_austen = nltk.corpus.gutenberg.raw('austen-emma.txt')
documents_austen = create_document(text_austen,'austen')

text_kjv = nltk.corpus.gutenberg.raw('bible-kjv.txt')
documents_kjv = create_document(text_kjv,'kjv')

text_chesterton = nltk.corpus.gutenberg.raw('chesterton-ball.txt')
documents_chesterton = create_document(text_chesterton,'chesterton')

text_edgeworth = nltk.corpus.gutenberg.raw('edgeworth-parents.txt')
documents_edgeworth = create_document(text_edgeworth,'edgeworth')

text_melville = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')
documents_melville = create_document(text_melville,'melville')

text_milton = nltk.corpus.gutenberg.raw('milton-paradise.txt')
documents_milton = create_document(text_milton,'milton')

text_whitman = nltk.corpus.gutenberg.raw('whitman-leaves.txt')
documents_whitman = create_document(text_whitman,'whitman')

text_shakespeare = nltk.corpus.gutenberg.raw('shakespeare-hamlet.txt')
documents_shakespeare = create_document(text_whitman,'shakespeare')


final_documents = documents_austen + documents_kjv + documents_chesterton + documents_edgeworth + documents_melville + documents_milton + documents_whitman + documents_shakespeare

random.shuffle(final_documents)


# Transforming documents into Bag of Words

In [3]:
each_document_words = []
y = []
for document in final_documents:
    each_document_words.append(document[0])
    y.append(document[1])

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(each_document_words)

#Display BOW
import pandas as pd
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,000,02,11,aaron,aback,abaft,abandon,abandoned,abandoning,abandonment,...,zohar,zolas,zone,zones,zooks,zoological,zoology,zorah,zuyder,zuzims
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Training machine with transformed data
4.1 Splitting data into test and train data 

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.10, random_state=0)

4.2 Training machine using Multinomial Naive Bayes model

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

nb_MN = MultinomialNB()
%time nb_MN.fit(X_train, y_train)

CPU times: user 19.2 ms, sys: 12.7 ms, total: 31.9 ms
Wall time: 44.6 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

4.3 Predicting authors for test data and evaluating accuracy

In [6]:
y_pred_class = nb_MN.predict(X_test)
print('Multinomial NB BOW: Accuracy: ')
metrics.accuracy_score(y_test, y_pred_class) * 100

Multinomial NB BOW: Accuracy: 


75.625

confusion matrix

In [7]:
print('Confusion matrix')
metrics.confusion_matrix(y_test, y_pred_class)

Confusion matrix


array([[22,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 26,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 19,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 21,  0,  0,  0,  0],
       [ 0,  0,  0,  0, 15,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 17,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 24],
       [ 0,  0,  0,  0,  0,  0, 15,  1]])

# Predicting any random sentence from book Emma by Austen

In [8]:
y_pred_class = nb_MN.predict(vectorizer.transform(['The want of Miss Taylor would be felt every hour of every day. She recalled her past kindness--the kindness, the affection of sixteen years--how she had taught and how she had played with her from five years old--how she had devoted all her powers to attach and amuse her in health--and how nursed her through the various illnesses of childhood']))
print(y_pred_class)

['austen']


# Evaluation - 10 fold testing
5.1 Ten fold testing without shuffle

In [9]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(nb_MN, X, y, cv=10)
print('Mutinomial NB BOW : 10 fold test WITHOUT shuffle')
print(scores)
print("Multinomail NB BOW: Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Mutinomial NB BOW : 10 fold test WITHOUT shuffle
[0.775   0.75    0.76875 0.78125 0.775   0.7625  0.775   0.775   0.7625
 0.76875]
Multinomail NB BOW: Mean Accuracy: 0.77 (+/- 0.02)


5.2 Ten fold testing with shuffle

In [10]:
from sklearn.model_selection import ShuffleSplit
import statistics

cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
numbers = cross_val_score(nb_MN, X, y, cv=cv)
print('Multinomail NB BOW : 10 fold test WITH shuffle')
print(numbers)

print(f'Multinomail NB BOW: Mean Accuracy: {statistics.mean(numbers)}' )


Multinomail NB BOW : 10 fold test WITH shuffle
[0.75625 0.74375 0.79375 0.7625  0.76875 0.78125 0.75625 0.74375 0.7375
 0.75625]
Multinomail NB BOW: Mean Accuracy: 0.76


5.3 Ten fold testing with shuffle using StandardScaler

In [11]:
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
clf_MN = make_pipeline(preprocessing.StandardScaler(with_mean=False), MultinomialNB() )
numbers = cross_val_score(clf_MN, X, y, cv=cv)
print('Multnomail NB BOW StandardScaler: 10 fold test WITH shuffle')
print(numbers)
print(f'Multinomial NB BOW StandardScaler: Mean Accuracy {statistics.mean(numbers)}' )


Multnomail NB BOW StandardScaler: 10 fold test WITH shuffle
[0.75625 0.7625  0.7875  0.7625  0.775   0.775   0.75625 0.74375 0.75
 0.75625]
Multinomial NB BOW StandardScaler: Mean Accuracy 0.7625


# Error Analysis

In [12]:
#print(vectorizer.vocabulary_)

unmatched_index = 0
for num in range(len(y_pred_class)):
    if not y_pred_class[num] == y_test[num]:
        unmatched_index = num
        print(f'Actual author of the document: {y_test[num]}')
        print(f'Predicted author of the document: {y_pred_class[num]}')
        break
          

words_index = []
count = 0
for num in X_test.toarray()[unmatched_index]:
    if num > 0:
        words_index.append(count) 
    count += 1
        
words = []
for index in words_index:
    for wor,ind in vectorizer.vocabulary_.items():
        if(index == ind):
            words.append(wor)

print('Words present in the document: ')
print(words)


Actual author of the document: chesterton
Predicted author of the document: austen
Words present in the document: 
['agnostic', 'alone', 'anger', 'anklet', 'ask', 'asked', 'atheist', 'balance', 'bars', 'bent', 'beyond', 'boiling', 'bowed', 'broke', 'brought', 'ca', 'came', 'carpentry', 'carrying', 'catholic', 'catholics', 'century', 'chair', 'chess', 'circle', 'colour', 'confine', 'copy', 'cried', 'cross', 'decorative', 'deep', 'deists', 'describe', 'described', 'driven', 'eighteenth', 'enough', 'established', 'even', 'every', 'eyes', 'fanatic', 'far', 'farther', 'fear', 'felt', 'figure', 'flowers', 'frankly', 'french', 'gambits', 'game', 'glanced', 'glittering', 'going', 'good', 'happen', 'head', 'highlands', 'hill', 'image', 'incident', 'ineffectual', 'insolent', 'intolerable', 'large', 'let', 'little', 'loneliness', 'looked', 'ludgate', 'macian', 'magnificence', 'man', 'mass', 'midst', 'next', 'night', 'old', 'open', 'opened', 'orthodox', 'parry', 'pause', 'players', 'points', 'pois

# LogisticRegression on BOW

In [13]:
# Logistics regression
# 1. import
from sklearn.linear_model import LogisticRegression

# 2. instantiate a logistic regression model
logreg = LogisticRegression()

# 3. train the model using X_train_dtm
%time logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)

# calculate accuracy
print('LogisticRegression BOW: Accuracy: ')
print(metrics.accuracy_score(y_test, y_pred_class))

print('LogisticRegression BOW: Confusion matrix:')
print(metrics.confusion_matrix(y_test, y_pred_class))
# print message text for the false positives (ham incorrectly classified as spam)

#10 fold without shuffle
scores = cross_val_score(logreg, X, y, cv=10)
print('LogisticRegression BOW : 10 fold test WITHOUT shuffle')
print(scores)

print("LogisticRegression BOW: Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# 10 fold with shuffle
numbers = cross_val_score(logreg, X, y, cv=cv)
print('LogisticRegression BOW : 10 fold test WITH shuffle')
print(numbers)
print(f'LogisticRegression BOW: Mean Accuracy {statistics.mean(numbers)}' )

# 10 fold with shuffle using StandardScalar
clf_logreg = make_pipeline(preprocessing.StandardScaler(with_mean=False), LogisticRegression() )
numbers = cross_val_score(clf_logreg, X, y, cv=cv)
print('LogisticRegression BOW StandardScaler: 10 fold test WITH shuffle')
print(numbers)
print(f'LogisticRegression BOW StandardScaler: Mean Accuracy {statistics.mean(numbers)}' )



CPU times: user 918 ms, sys: 19.3 ms, total: 937 ms
Wall time: 489 ms
LogisticRegression BOW: Accuracy: 
0.76875
LogisticRegression BOW: Confusion matrix:
[[22  0  0  0  0  0  0  0]
 [ 0 26  0  0  0  0  0  0]
 [ 0  0 19  0  0  0  0  0]
 [ 0  0  0 21  0  0  0  0]
 [ 0  0  0  0 15  0  0  0]
 [ 0  0  0  0  0 17  0  0]
 [ 0  0  0  0  0  0  2 22]
 [ 0  0  0  0  0  0 15  1]]
LogisticRegression BOW : 10 fold test WITHOUT shuffle
[0.80625 0.7625  0.775   0.8     0.775   0.76875 0.8     0.78125 0.79375
 0.80625]
LogisticRegression BOW: Mean Accuracy: 0.79 (+/- 0.03)
LogisticRegression BOW : 10 fold test WITH shuffle
[0.76875 0.78125 0.80625 0.76875 0.78125 0.7875  0.775   0.74375 0.78125
 0.75625]
LogisticRegression BOW: Mean Accuracy 0.775
LogisticRegression BOW StandardScaler: 10 fold test WITH shuffle
[0.75625 0.7625  0.7875  0.75625 0.76875 0.78125 0.7625  0.75    0.75
 0.75625]
LogisticRegression BOW StandardScaler: Mean Accuracy 0.7631249999999999


# SVCLinear on BOW

In [14]:
from sklearn import svm

clf_SVC = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf_SVC, X, y, cv=10)
print('SVCLinear BOW : 10 fold test WITHOUT shuffle')
print(scores)
print("SVCLinear BOW: Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


numbers = cross_val_score(clf_SVC, X, y, cv=cv)
print('SVCLinear BOW : 10 fold test WITH shuffle')
print(numbers)
print(f'SVCLinear BOW: Mean Accuracy {statistics.mean(numbers)}' )


clf_StSVC = make_pipeline(preprocessing.StandardScaler(with_mean=False), svm.SVC(C=1))
numbers = cross_val_score(clf_StSVC, X, y, cv=cv)
print('SVCLinear BOW StandardScaler: 10 fold test WITH shuffle')
print(numbers)
print(f'SVCLinear BOW StandardScaler: Mean Accuracy {statistics.mean(numbers)}' )


SVCLinear BOW : 10 fold test WITHOUT shuffle
[0.80625 0.76875 0.79375 0.8125  0.78125 0.78125 0.7875  0.78125 0.79375
 0.8    ]
SVCLinear BOW: Mean Accuracy: 0.79 (+/- 0.03)
SVCLinear BOW : 10 fold test WITH shuffle
[0.78125 0.7875  0.80625 0.78125 0.78125 0.8     0.78125 0.75    0.775
 0.7625 ]
SVCLinear BOW: Mean Accuracy 0.780625
SVCLinear BOW StandardScaler: 10 fold test WITH shuffle
[0.69375 0.68125 0.64375 0.65625 0.64375 0.69375 0.65    0.6375  0.63125
 0.675  ]
SVCLinear BOW StandardScaler: Mean Accuracy 0.660625


# Transforming documents to TFIDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer=TfidfVectorizer(use_idf=True)
 
X_TfIdf=tfidf_vectorizer.fit_transform(each_document_words)

#print(tfidf_vectorizer_vectors)

import pandas as pd
pd.DataFrame(X_TfIdf.toarray(), columns=tfidf_vectorizer.get_feature_names())

Unnamed: 0,000,02,11,aaron,aback,abaft,abandon,abandoned,abandoning,abandonment,...,zohar,zolas,zone,zones,zooks,zoological,zoology,zorah,zuyder,zuzims
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


# Mutlinomial NaiveBayes on TFIDF

In [16]:
X_TfIdf_train, X_TfIdf_test, y_train, y_test = train_test_split(X_TfIdf, y,test_size=0.10, random_state=1)

%time nb_MN.fit(X_TfIdf_train, y_train)

# 4. make class predictions for X_test_dtm
y_pred_class = nb_MN.predict(X_TfIdf_test)

print(f'Multinomail NB TFIDF: {metrics.accuracy_score(y_test, y_pred_class) * 100}')

scores = cross_val_score(nb_MN, X_TfIdf, y, cv=10)
print('Multonmial TFIDF BOW : 10 fold test WITHOUT shuffle')
print(scores)
print("Multinomail NB TFIDF: Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

numbers = cross_val_score(nb_MN, X_TfIdf, y, cv=cv)
print('Multinomail NB TFIDF: 10 fold test WITH shuffle')
print(numbers)
print(f'Multinomail NB TFIDF: Mean Accuracy {statistics.mean(numbers)}' )

numbers  = cross_val_score(clf_MN, X_TfIdf, y, cv=cv)
print('Multinomail NB TFIDF StandardScaler: 10 fold test WITH shuffle')
print(numbers)
print(f'Multinomail NB TFIDF StandardScaler: Mean Accuracy {statistics.mean(numbers)}' )

CPU times: user 10.3 ms, sys: 1.47 ms, total: 11.8 ms
Wall time: 9.45 ms
Multinomail NB TFIDF: 74.375
Multonmial TFIDF BOW : 10 fold test WITHOUT shuffle
[0.7875  0.74375 0.76875 0.78125 0.7625  0.7625  0.78125 0.78125 0.76875
 0.775  ]
Multinomail NB TFIDF: Mean Accuracy: 0.77 (+/- 0.02)
Multinomail NB TFIDF: 10 fold test WITH shuffle
[0.78125 0.7625  0.7875  0.76875 0.7625  0.79375 0.7625  0.74375 0.7375
 0.75625]
Multinomail NB TFIDF: Mean Accuracy 0.765625
Multinomail NB TFIDF StandardScaler: 10 fold test WITH shuffle
[0.75625 0.7625  0.7875  0.7625  0.76875 0.775   0.7625  0.74375 0.75
 0.75625]
Multinomail NB TFIDF StandardScaler: Mean Accuracy 0.7625


# LogisticsRegression on TFIDF

In [17]:
%time logreg.fit(X_TfIdf_train, y_train)
y_pred_class = logreg.predict(X_TfIdf_test)

# calculate accuracy
print(f'LogisticRegression TFIDF: {metrics.accuracy_score(y_test, y_pred_class) * 100}')
metrics.accuracy_score(y_test, y_pred_class)

metrics.confusion_matrix(y_test, y_pred_class)
# print message text for the false positives (ham incorrectly classified as spam)


scores = cross_val_score(logreg, X_TfIdf, y, cv=10)
print('LogisticRegression TFIDF : 10 fold test WITHOUT shuffle')
print(scores)
print("LogisticRegression TFIDF: Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#Shuffle Split
numbers = cross_val_score(logreg, X_TfIdf, y, cv=cv)
print('LogisticRegression TFIDF: 10 fold test WITH shuffle')
print(numbers)
print(f'LogisticRegression TFIDF: Mean Accuracy {statistics.mean(numbers)}' )

# Using StandardScalar transformation
numbers = cross_val_score(clf_logreg, X_TfIdf, y, cv=cv)
print('LogisticRegression TFIDF StandardScaler: 10 fold test WITH shuffle')
print(numbers)
print(f'LogisticRegression TFIDF StandardScaler: Mean Accuracy {statistics.mean(numbers)}' )




CPU times: user 451 ms, sys: 8.04 ms, total: 459 ms
Wall time: 231 ms
LogisticRegression TFIDF: 72.5
LogisticRegression TFIDF : 10 fold test WITHOUT shuffle
[0.80625 0.75    0.7625  0.78125 0.76875 0.76875 0.78125 0.78125 0.78125
 0.7875 ]
LogisticRegression TFIDF: Mean Accuracy: 0.78 (+/- 0.03)
LogisticRegression TFIDF: 10 fold test WITH shuffle
[0.775   0.75    0.8     0.76875 0.775   0.79375 0.7625  0.75    0.7375
 0.75625]
LogisticRegression TFIDF: Mean Accuracy 0.766875
LogisticRegression TFIDF StandardScaler: 10 fold test WITH shuffle
[0.75625 0.7625  0.7875  0.75625 0.7625  0.775   0.7625  0.75    0.75
 0.75625]
LogisticRegression TFIDF StandardScaler: Mean Accuracy 0.761875


# SVCLinear on TFIDF


In [18]:
scores = cross_val_score(clf_SVC, X_TfIdf, y, cv=10)
print('SVCLinear TFIDF : 10 fold test WITHOUT shuffle')
print(scores)
print("SVCLinear TFIDF: Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

#Shuffle split
numbers = cross_val_score(clf_SVC, X_TfIdf, y, cv=cv)
print('SVCLinear TFIDF : 10 fold test WITH shuffle')
print(numbers)
print(f'SVCLinear TFIDF: Mean Accuracy {statistics.mean(numbers)}' )


numbers = cross_val_score(clf_StSVC, X_TfIdf, y, cv=cv)
print('SVCLinear TFIDF StandardScaler: 10 fold test WITH shuffle')
print(numbers)
print(f'SVCLinear TFIDF StandardScaler: Mean Accuracy {statistics.mean(numbers)}' )



SVCLinear TFIDF : 10 fold test WITHOUT shuffle
[0.8     0.75625 0.75625 0.7875  0.7625  0.76875 0.7875  0.775   0.775
 0.78125]
SVCLinear TFIDF: Mean Accuracy: 0.78 (+/- 0.03)
SVCLinear TFIDF : 10 fold test WITH shuffle
[0.79375 0.78125 0.79375 0.775   0.775   0.81875 0.75    0.74375 0.775
 0.7625 ]
SVCLinear TFIDF: Mean Accuracy 0.776875
SVCLinear TFIDF StandardScaler: 10 fold test WITH shuffle
[0.7     0.6625  0.6     0.65625 0.61875 0.675   0.65    0.625   0.6
 0.6625 ]
SVCLinear TFIDF StandardScaler: Mean Accuracy 0.645
