In [None]:
import pandas as pd
import numpy as np
import codecs
import re
import string
import os
import seaborn as sns
from sklearn import pipeline
from sklearn.model_selection import train_test_split, GroupKFold, StratifiedKFold, KFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.base import BaseEstimator, TransformerMixin
from textblob import TextBlob, Word
from nltk.corpus import wordnet as wn
from nltk.tag import pos_tag
%matplotlib inline

In [None]:
# Collect the novel files (*.txt) found in the txt/ folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted: the
# row order of the combined DataFrame, and every positional slice or index taken
# from it later, then stays the same from run to run.
files = sorted(name for name in os.listdir('txt/') if name.endswith('.txt'))

In [None]:
def get_book_data(file_name):
    '''Read one novel and return its paragraphs as [author, book, paragraph] rows.

    The file is read from ``txt/<file_name>``, lower-cased, hyphens are turned
    into spaces and the text is split into paragraphs on blank lines.  Line
    breaks inside a paragraph become spaces, paragraphs of 30 characters or
    fewer (measured before punctuation is removed) are dropped, and every
    character other than ASCII letters, digits and spaces is stripped.

    Parameters
    ----------
    file_name : str
        Name of a file inside ``txt/``.  The author is the text before the
        first underscore; the title starts three characters after that
        underscore and ends at ``.txt`` (presumably the files are named
        ``Author___Title.txt`` -- TODO confirm against the corpus).

    Returns
    -------
    list of [str, str, str]
        One ``[author_name, book_name, paragraph]`` row per kept paragraph.
    '''
    filenm = 'txt/' + file_name
    with codecs.open(filenm, "r", encoding='utf-8', errors='ignore') as f:
        raw_text = f.read().lower()

    raw_text = raw_text.replace('-', ' ')
    # codecs.open does not translate line endings, so a blank line can be either
    # "\r\n\r\n" (Windows) or "\n\n" (Unix); accept both instead of only the first.
    blocks = re.split(r'\r?\n\r?\n', raw_text)
    # remove the remaining end-of-line characters and replace them with spaces
    blocks = [re.sub(r'[\r\n]', ' ', block) for block in blocks]

    # extracting author name and book name from the file name
    author_name = file_name[file_name.find('/')+1:file_name.find('_')]
    book_name = file_name[file_name.find('_')+3:file_name.find('.txt')]

    # drop short fragments (headings, page numbers, ...) and remove punctuation
    non_text = re.compile('[^ A-Za-z0-9]+')
    paragraphs = [non_text.sub('', block) for block in blocks if len(block) > 30]

    return [[author_name, book_name, paragraph] for paragraph in paragraphs]

In [None]:
# creating the dataframe with the paragraphs
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; build one frame per novel and concatenate them in a single call.
# Each per-novel frame keeps its own 0..n-1 index, exactly as the appended
# frames did, so the index values repeat across novels at this point.
all_book_data = pd.concat([pd.DataFrame(get_book_data(file_name)) for file_name in files])

In [None]:
# Name the three columns of the [author, book, paragraph] rows.
all_book_data.columns = ['author_name','novel_name','paragraph']

# Give every row a unique 0..N-1 label. drop=True is not passed, so the previous
# index is kept as a new leading 'index' column and the frame has four columns
# from here on.
# NOTE(review): not idempotent -- re-running this cell fails at the column
# assignment above, because the frame then has four columns but three names.
all_book_data.reset_index(inplace=True)

# all_book_data.shape

#### Testing the model on a small subset of the data

In [None]:
# Trial run on the first 20000 paragraphs: text as features, author as label.
subset = all_book_data.iloc[:20000]
X = subset['paragraph']
y = subset['author_name']

In [None]:
#Sanity check: the author counts of the first 20000 rows and of y should be identical
print(all_book_data[:20000].author_name.value_counts())
print(y.value_counts())

In [None]:
# `svd` and `X_svd` are used by this cell and the next one but are not assigned
# anywhere in the visible notebook, so both cells fail on a fresh kernel.
# NOTE(review): reconstruction -- the original vectoriser settings are unknown.
# TF-IDF followed by a 100-component TruncatedSVD is assumed here; the bar chart's
# range(1, 101) implies 100 components.
subset_tfidf = TfidfVectorizer().fit_transform(X)
svd = TruncatedSVD(n_components=100, random_state=42)
X_svd = svd.fit_transform(subset_tfidf)

# Share of variance captured by each SVD component.
plt.figure(figsize=(20,6))
plt.bar(range(1, len(svd.explained_variance_ratio_) + 1), svd.explained_variance_ratio_)
plt.xlabel('SVD component')
plt.ylabel('Explained variance ratio')
plt.title('Explained variance per SVD component');

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_svd,y,test_size=0.2,random_state=42)

In [None]:
# Gradient boosting with default hyperparameters, fitted on the training split.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_train,y_train)

In [None]:
# Predicted author for every held-out row.
y_pred = gbc.predict(X_test)

In [None]:
# Confusion matrix: rows are true authors, columns are predicted authors.
confusion_matrix(y_test,y_pred)

In [None]:
# Number of held-out paragraphs per true author.
y_test.value_counts()

In [None]:
# Per-author precision, recall and F1 on the held-out rows.
print(classification_report(y_test,y_pred))

### SGD Classifier

In [None]:
# English stop words; StopWordCounter (defined further down) counts how many of these occur in each paragraph
new_stop_words = ['a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides',
           'between',
           'beyond',
           'bill',
           'both',
           'bottom',
           'but',
           'by',
           'call',
           'can',
           'cannot',
           'cant',
           'co',
           'con',
           'could',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'do',
           'done',
           'down',
           'due',
           'during',
           'each',
           'eg',
           'eight',
           'either',
           'eleven',
           'else',
           'elsewhere',
           'empty',
           'enough',
           'etc',
           'even',
           'ever',
           'every',
           'everyone',
           'everything',
           'everywhere',
           'except',
           'few',
           'fifteen',
           'fifty',
           'fill',
           'find',
           'fire',
           'first',
           'five',
           'for',
           'former',
           'formerly',
           'forty',
           'found',
           'four',
           'from',
           'front',
           'full',
           'further',
           'get',
           'give',
           'go',
           'had',
           'has',
           'hasnt',
           'have',
           'he',
           'hence',
           'her',
           'here',
           'hereafter',
           'hereby',
           'herein',
           'hereupon',
           'hers',
           'herself',
           'him',
           'himself',
           'his',
           'how',
           'however',
           'hundred',
           'i',
           'ie',
           'if',
           'in',
           'inc',
           'indeed',
           'interest',
           'into',
           'is',
           'it',
           'its',
           'itself',
           'keep',
           'last',
           'latter',
           'latterly',
           'least',
           'less',
           'ltd',
           'made',
           'many',
           'may',
           'me',
           'meanwhile',
           'might',
           'mill',
           'mine',
           'more',
           'moreover',
           'most',
           'mostly',
           'move',
           'much',
           'must',
           'my',
           'myself',
           'name',
           'namely',
           'neither',
           'never',
           'nevertheless',
           'next',
           'nine',
           'no',
           'nobody',
           'none',
           'noone',
           'nor',
           'not',
           'nothing',
           'now',
           'nowhere',
           'of',
           'off',
           'often',
           'on',
           'once',
           'one',
           'only',
           'onto',
           'or',
           'other',
           'others',
           'otherwise',
           'our',
           'ours',
           'ourselves',
           'out',
           'over',
           'own',
           'part',
           'per',
           'perhaps',
           'please',
           'put',
           'rather',
           're',
           'same',
           'see',
           'seem',
           'seemed',
           'seeming',
           'seems',
           'serious',
           'several',
           'she',
           'should',
           'show',
           'side',
           'since',
           'sincere',
           'six',
           'sixty',
           'so',
           'some',
           'somehow',
           'someone',
           'something',
           'sometime',
           'sometimes',
           'somewhere',
           'still',
           'such',
           'system',
           'take',
           'ten',
           'than',
           'that',
           'the',
           'their',
           'them',
           'themselves',
           'then',
           'thence',
           'there',
           'thereafter',
           'thereby',
           'therefore',
           'therein',
           'thereupon',
           'these',
           'they',
           'thick',
           'thin',
           'third',
           'this',
           'those',
           'though',
           'three',
           'through',
           'throughout',
           'thru',
           'thus',
           'to',
           'together',
           'too',
           'top',
           'toward',
           'towards',
           'twelve',
           'twenty',
           'two',
           'un',
           'under',
           'until',
           'up',
           'upon',
           'us',
           'very',
           'via',
           'was',
           'we',
           'well',
           'were',
           'what',
           'whatever',
           'when',
           'whence',
           'whenever',
           'where',
           'whereafter',
           'whereas',
           'whereby',
           'wherein',
           'whereupon',
           'wherever',
           'whether',
           'which',
           'while',
           'whither',
           'who',
           'whoever',
           'whole',
           'whom',
           'whose',
           'why',
           'will',
           'with',
           'within',
           'without',
           'would',
           'yet',
           'you',
           'your',
           'yours',
           'yourself',
           'yourselves']

Removing the Roman numerals that are part of the chapter headings

In [None]:
# Build the Roman-numeral stop-word list in one idempotent cell. Previously the
# name `roman_nums` was rebound over four cells, and re-running the last of them
# (`roman_nums[1:]`) stripped one more entry each time.
# The spreadsheet has no header row; the numerals are in its first column.
roman_nums_raw = pd.read_excel('roman-numerals-1-1000.xlsx', header=None)[0].tolist()

# Skip the first entry, as before (presumably 'I', which lower-cases to the
# pronoun "i" and should stay in the vocabulary -- TODO confirm), and lower-case
# the rest so they match the lower-cased paragraphs.
roman_nums = [numeral.lower() for numeral in roman_nums_raw[1:]]

In [None]:
# From here on use every paragraph: text as features, author name as label.
X = all_book_data.paragraph
y = all_book_data.author_name

In [None]:
# Look at the paragraph stored under row label 0.
all_book_data.paragraph[0]

Applying TF-IDF vectorizer

In [None]:
# TF-IDF weighting with the Roman numerals treated as stop words; the vocabulary
# is learned and the matrix is built in one pass.
# NOTE(review): the following cell assigns a CountVectorizer matrix to the same
# name `counts_X`, so this TF-IDF matrix is replaced before any model uses it.
tfidf = TfidfVectorizer(stop_words=roman_nums)
counts_X = tfidf.fit_transform(X)

In [None]:
# Raw term counts with default settings; vocabulary learned and matrix built in one pass.
cv = CountVectorizer()
counts_X = cv.fit_transform(X)

### SGD Classifier - log loss function

In [None]:
# Linear classifier trained with SGD and logistic loss; predict_proba is called on it below.
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and 1.3 dropped
# the 'log' spelling -- change the string when upgrading scikit-learn.
sgd = SGDClassifier(loss='log',n_jobs=-1,random_state=42)

In [None]:
# Hold out 10% of the vectorised paragraphs for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(counts_X,y,test_size=0.1,random_state=42)

In [None]:
sgd.fit(X_train,y_train)

In [None]:
# sgd.partial_fit(X_train,y_train,classes=y_train.unique())

In [None]:
# y_pred = sgd.predict(X_test[0])
# y_pred

In [None]:
# Class probabilities for the first held-out paragraph only.
y_pred_prob = sgd.predict_proba(X_test[0])

In [None]:
# Pair every author label with its probability for that paragraph.
author_likeness = list(zip(sgd.classes_,y_pred_prob[0]))

In [None]:
# Most likely author first.
sorted(author_likeness,key=lambda x: x[1],reverse=True)

In [None]:
# NOTE(review): hard-coded row label; which paragraph this is depends on the
# order in which the files were loaded.
all_book_data.loc[3143257].values

In [None]:
# Boolean mask of the held-out rows whose true author is George Alfred Henty.
y_test=='George Alfred Henty'

In [None]:
# Score the logistic-loss SGD model on its own held-out split. Without this
# prediction step `y_pred` still holds the gradient-boosting predictions made for
# the 20000-row subset, whose length does not match the current y_test.
y_pred = sgd.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
# Same SGD-trained linear classifier, this time with hinge loss.
sgd2 = SGDClassifier(loss='hinge',n_jobs=-1,random_state=42)

In [None]:
sgd2.fit(X_train,y_train)

In [None]:
# Predict the author of one vectorised paragraph.
# NOTE(review): hard-coded row position taken from the full matrix, so this row
# may have been part of the training split.
author_out = sgd2.predict(counts_X[721288])
author_out

In [None]:
# Accuracy on the held-out rows.
y_pred_2 = sgd2.predict(X_test)
accuracy_score(y_test,y_pred_2)

In [None]:
y_test

In [None]:
# Same accuracy computed by hand: share of predictions equal to the true label.
sum(y_pred_2==y_test)/len(y_pred_2)

### Stratified KFold 

In [None]:
# Stratified k-fold cross-validation of the hinge-loss SGD model.
# `numFolds` is not assigned anywhere in the visible notebook, so the cell failed
# on a fresh kernel. NOTE(review): 5 is an assumed value -- set the intended one.
numFolds = 5
# random_state makes the shuffled folds, and therefore the score, reproducible.
skf = StratifiedKFold(n_splits=numFolds, shuffle=True, random_state=42)

fold_scores = []
for train_indices, test_indices in skf.split(counts_X, y):
    # Fit an unfitted copy per fold and keep the fold data in fold-local names, so
    # the fitted sgd2 and the X_train/X_test/y_train/y_test split created earlier
    # are left untouched for the cells that follow.
    fold_model = clone(sgd2)
    fold_model.fit(counts_X[train_indices, :], y.iloc[train_indices])
    fold_pred = fold_model.predict(counts_X[test_indices, :])
    fold_scores.append(accuracy_score(y.iloc[test_indices], fold_pred))

# Mean accuracy over the folds.
accuracy = sum(fold_scores) / numFolds

In [None]:
# Mean accuracy over the cross-validation folds.
accuracy

In [None]:
# Held-out rows that the hinge-loss model labels correctly. Predict on the
# current X_test rather than reusing the older y_pred_2, so predictions and
# labels always refer to the same rows whatever X_test/y_test were last bound to.
y_test.loc[y_test == sgd2.predict(X_test)]

In [None]:
# predict the author of a single input
# NOTE(review): hard-coded row position taken from the full matrix; the row it
# selects depends on the order in which the files were loaded.
y_pred_act = sgd2.predict(counts_X[600000])
y_pred_act

In [None]:
# Distinct authors present in the held-out labels.
y_test.unique()

In [None]:
# Fourth value of that row by position: column 0 is the 'index' column added by
# reset_index, so position 3 is the paragraph text.
# NOTE(review): recent pandas versions deprecate integer keys on a label-indexed
# row as positional access; ['paragraph'] would be explicit.
all_book_data.iloc[2933598][3]

In [None]:
# All rows belonging to one novel.
all_book_data[all_book_data.novel_name=='The Invisible Man']

# Feature union and pipeline

In [None]:
#count of stop words per paragraph

class StopWordCounter(BaseEstimator, TransformerMixin):
    """Stateless transformer: number of stop words in each paragraph.

    Paragraphs are tokenised with ``str.split()`` and every token is looked up
    in the module-level ``new_stop_words`` list.
    """

    def __init__(self):
        pass

    def stop_word_freq(self, x, stop_words=None):
        """Return how many whitespace-separated tokens of ``x`` are stop words.

        Parameters
        ----------
        x : str
            One paragraph.
        stop_words : container of str, optional
            Anything supporting ``in``; defaults to the module-level
            ``new_stop_words``.
        """
        if stop_words is None:
            stop_words = new_stop_words
        return sum(1 for word in x.split() if word in stop_words)

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of stop-word counts.

        The result is a single 2-D column rather than a 1-D Series: FeatureUnion
        stacks the blocks side by side, and a 1-D result does not line up with
        the (n_samples, n_terms) TF-IDF block.
        """
        # Build the lookup set once per call; membership in the list is a linear
        # scan for every token.
        stop_words = frozenset(new_stop_words)
        counts = pd.Series(X).apply(lambda paragraph: self.stop_word_freq(paragraph, stop_words))
        return counts.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self
    

In [None]:
#average word length
class AverageWordLengthExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer: mean token length of each paragraph."""

    def __init__(self):
        pass

    def average_word_length(self, name):
        """Return the mean length of the whitespace-separated tokens of ``name``.

        A paragraph without any token yields 0.0: ``np.mean([])`` would return
        NaN (with a RuntimeWarning), and a NaN feature makes the downstream
        classifier reject the whole matrix.
        """
        words = name.split()
        if not words:
            return 0.0
        return np.mean([len(word) for word in words])

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of mean word lengths.

        The result is a single 2-D column rather than a 1-D Series: FeatureUnion
        stacks the blocks side by side, and a 1-D result does not line up with
        the (n_samples, n_terms) TF-IDF block.
        """
        lengths = pd.Series(X).apply(self.average_word_length)
        return lengths.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

In [None]:
#noun count

class NounCounter(BaseEstimator, TransformerMixin):
    """Stateless transformer: number of noun tokens in each paragraph.

    Only words that TextBlob tags exactly ``'NN'`` are treated as nouns; other
    noun tags (for example ``'NNS'`` or ``'NNP'``) are not counted.
    """

    def __init__(self):
        pass

    def noun_counter(self, x):
        """Return how many whitespace-separated tokens of ``x`` are nouns.

        A token counts when its text equals any word tagged ``'NN'`` somewhere
        in the paragraph, so every occurrence of such a word is counted.
        """
        # A set of plain strings gives constant-time lookups; the previous list
        # was scanned once per token.
        noun_words = {str(word) for word, tag in TextBlob(x).tags if tag == 'NN'}
        return sum(1 for token in x.split() if token in noun_words)

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of noun counts.

        The result is a single 2-D column rather than a 1-D Series: FeatureUnion
        stacks the blocks side by side, and a 1-D result does not line up with
        the (n_samples, n_terms) TF-IDF block.
        """
        counts = pd.Series(X).apply(self.noun_counter)
        return counts.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

In [None]:
#adjective counter

class AdjectiveCounter(BaseEstimator, TransformerMixin):
    """Stateless transformer: number of adjective tokens in each paragraph.

    Only words that TextBlob tags exactly ``'JJ'`` are treated as adjectives;
    other adjective tags (for example ``'JJR'`` or ``'JJS'``) are not counted.
    """

    def __init__(self):
        pass

    def adj_counter(self, x):
        """Return how many whitespace-separated tokens of ``x`` are adjectives.

        A token counts when its text equals any word tagged ``'JJ'`` somewhere
        in the paragraph, so every occurrence of such a word is counted.
        """
        # A set of plain strings gives constant-time lookups; the previous list
        # was scanned once per token.
        adjective_words = {str(word) for word, tag in TextBlob(x).tags if tag == 'JJ'}
        return sum(1 for token in x.split() if token in adjective_words)

    def transform(self, X, y=None):
        """Return an ``(n_samples, 1)`` array of adjective counts.

        The result is a single 2-D column rather than a 1-D Series: FeatureUnion
        stacks the blocks side by side, and a 1-D result does not line up with
        the (n_samples, n_terms) TF-IDF block.
        """
        counts = pd.Series(X).apply(self.adj_counter)
        return counts.values.reshape(-1, 1)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

In [None]:
#Features using the functions above
# One TF-IDF block plus four hand-crafted per-paragraph features, stacked side by side.
feature_steps = [
    ('tfidf', TfidfVectorizer(stop_words=roman_nums)),
    ('stop_word_count', StopWordCounter()),
    ('mean_word_length', AverageWordLengthExtractor()),
    ('count_nouns', NounCounter()),
    ('count_adjectives', AdjectiveCounter()),
]
features = pipeline.FeatureUnion(feature_steps)


In [None]:
# Split the raw paragraph text (not a vectorised matrix): the pipeline below does
# its own feature extraction. 20% of the rows are held out.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
#Final pipeline: feature extraction followed by the hinge-loss SGD classifier
main_pipe_steps = [('feature_matrix',features),
                    ('model',sgd2)]

main_pipe = pipeline.Pipeline(main_pipe_steps)
main_pipe.fit(X_train,y_train)
# predict() expects the held-out paragraphs; the labels (y_test) were passed here before.
main_pipe.predict(X_test)