In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the read-only input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import re 
import en_core_web_lg
nlp = en_core_web_lg.load()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Read the competition's train and test archives (in that order) into DataFrames.
train, test = (
    pd.read_csv(archive)
    for archive in (
        '/kaggle/input/spooky-author-identification/train.zip',
        '/kaggle/input/spooky-author-identification/test.zip',
    )
)

In [None]:
def string_cleanup(string):
    """Normalise text to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace (digits,
    punctuation, accented letters, ...) is treated as a word separator.

    Parameters
    ----------
    string : object
        Value to clean. It is passed through ``str()`` first, so non-string
        input is cleaned as its string representation.

    Returns
    -------
    str
        Lowercased text with runs of whitespace collapsed to one space and
        no leading or trailing whitespace.
    """
    text = str(string)
    # Raw-string patterns: "\s" inside a plain string literal is an invalid
    # escape sequence and triggers a warning on modern Python versions.
    # A separate "keep digits" pass is unnecessary because this letters-only
    # pass replaces digits as well, and whitespace is collapsed afterwards.
    text = re.sub(r'[^A-Za-z\s]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()


def remove_stop_spacy(x, spacy_nlp, customize_stop_words=None):
    """Lowercase ``x`` and drop stop words and single-character tokens.

    Parameters
    ----------
    x : str
        Text to filter.
    spacy_nlp : callable
        Called as ``spacy_nlp(text)``; must return an iterable of tokens
        exposing ``text`` and ``is_stop``. Its ``vocab`` mapping is only
        accessed when ``customize_stop_words`` is non-empty.
    customize_stop_words : iterable of str, optional
        Extra words to flag as stop words before filtering. Each one is
        lowercased and marked via ``spacy_nlp.vocab[word].is_stop = True``,
        which modifies the vocabulary object held by ``spacy_nlp``.
        Defaults to no extra words, matching the previous behaviour.

    Returns
    -------
    str
        The surviving token texts joined by single spaces.
    """
    for word in customize_stop_words or ():
        spacy_nlp.vocab[word.lower()].is_stop = True

    doc = spacy_nlp(x.lower())
    kept = [
        token.text
        for token in doc
        if not token.is_stop and len(str(token.text)) > 1
    ]
    return ' '.join(kept)

In [None]:
# Step 1: normalise the raw sentences to lowercase letters and single spaces.
train['clean_text'] = train['text'].apply(string_cleanup)
# Step 2: strip stop words and one-character tokens from the normalised text.
train['no_stop'] = train['clean_text'].apply(remove_stop_spacy, spacy_nlp=nlp)

One easy way to identify the style of a particular author is to look at which parts of speech they use on average. If we can extract these counts, they should add a lot of value to our model.

Example: some authors tend to have more characters in their novels, which means their average proper-noun count will be higher than that of other authors. Let's see if we can find some patterns.

In [None]:
def get_count_of_pos(x, nlp):
    """Count words and selected part-of-speech tags in one text.

    Parameters
    ----------
    x : str
        Text to analyse.
    nlp : callable
        Called as ``nlp(x)``; must return an iterable of tokens that expose
        a ``pos_`` attribute (for example a loaded spaCy pipeline).

    Returns
    -------
    pandas.Series
        Six integers, in this order: the whitespace-separated word count,
        then the number of tokens tagged NOUN, PROPN, VERB, ADV and ADJ.

    Raises
    ------
    Exception
        Any error raised while splitting or tagging ``x`` is re-raised
        unchanged after the offending input has been printed.
    """
    tracked_tags = ('NOUN', 'PROPN', 'VERB', 'ADV', 'ADJ')
    try:
        len_val = len(x.split())
        tag_counts = dict.fromkeys(tracked_tags, 0)
        for tok in nlp(x):
            if tok.pos_ in tag_counts:
                tag_counts[tok.pos_] += 1
    except Exception:
        # The previous `except(e):` evaluated an undefined name, so any
        # failure surfaced as a NameError that hid the real problem. Show the
        # input that failed, then let the original exception propagate.
        print(x)
        raise
    return pd.Series([len_val, *(tag_counts[tag] for tag in tracked_tags)])

In [None]:
# One row of counts per sentence, spread across six new feature columns.
train[['word_count', 'noun_count', 'propn_count', 'verb_count', 'adv_count', 'adj_count']] = (
    train['text'].apply(get_count_of_pos, args=(nlp,))
)

In [None]:
# Average number of words per sentence for each author.
train.groupby(['author'])['word_count'].mean()

In [None]:
# Average number of NOUN tokens per sentence for each author.
train.groupby(['author'])['noun_count'].mean()

In [None]:
# Average number of PROPN (proper noun) tokens per sentence for each author.
train.groupby(['author'])['propn_count'].mean()

This is great! As predicted, the author 'MWS' does not have as many characters in their books as the author 'HPL'. This is a very valuable insight!

In [None]:
# Average number of VERB tokens per sentence for each author.
train.groupby(['author'])['verb_count'].mean()


In [None]:
# Average number of ADV tokens per sentence for each author.
train.groupby(['author'])['adv_count'].mean()

In [None]:
# Average number of ADJ tokens per sentence for each author.
train.groupby(['author'])['adj_count'].mean()

Although we tried to extract as much metadata from the text as we could, only the proper-noun count gives us extra information, as it shows a definite pattern. The rest can be ignored.

In [None]:
# Five stratified folds, taken in dataset order (no shuffling).
skf = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
# Target: the author label. Features: two POS counts plus the cleaned and raw text.
y = train['author']
X = train.loc[:, ['propn_count', 'verb_count', 'no_stop', 'text']]

In [None]:
# NOTE: this is a single hold-out split, not a full cross-validation. Each
# iteration rebinds train_index/test_index, so only the final fold produced by
# skf.split survives the loop and feeds the cells below.
for fold_number, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    # Report fold sizes rather than dumping the full index arrays.
    print(f"Fold {fold_number}: {len(train_index)} train rows, {len(test_index)} test rows")

# Slice once, after the loop, instead of building and discarding a split per fold.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
# Column-selecting transformers for text and numeric features, built on the
# scikit-learn base classes.
class TextTransformer(BaseEstimator, TransformerMixin):
    """Select a single column by key so a text vectorizer can consume it.

    ``transform`` returns ``X[key]`` (single-bracket indexing).
    """

    def __init__(self, key):
        # Column label looked up in ``transform``.
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        # Nothing is learned from the data; extra arguments are accepted and ignored.
        return self

    def transform(self, X):
        selected = X[self.key]
        return selected
    
class NumberTransformer(BaseEstimator, TransformerMixin):
    """Select a single numeric column by key for downstream scaling.

    ``transform`` returns ``X[[key]]`` (list indexing with one label).
    """

    def __init__(self, key):
        # Column label looked up in ``transform``.
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data.
        return self

    def transform(self, X):
        wanted = [self.key]
        return X[wanted]

In [None]:
# Numeric branch: pick the proper-noun count column, then standardise it.
propn_count = Pipeline(steps=[
    ('transformer', NumberTransformer(key='propn_count')),
    ('standard_scalar', StandardScaler()),
])

In [None]:
# Text branch: pick the stop-word-free text, then build TF-IDF features over
# unigrams, bigrams and trigrams.
text = Pipeline(steps=[
    ('transformer', TextTransformer(key='no_stop')),
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 3))),
])

In [None]:
# Concatenate the TF-IDF features with the scaled proper-noun count.
features = FeatureUnion(transformer_list=[
    ('Text_Feature', text),
    ('propn_count', propn_count),
])

In [None]:
# One-vs-rest logistic regression with a fixed seed and a raised iteration cap.
# NOTE(review): newer scikit-learn releases deprecate `multi_class`; confirm
# against the installed version before upgrading.
clf = LogisticRegression(
    multi_class='ovr',
    max_iter=2000,
    random_state=0,
)


In [None]:
# Full model: feature extraction followed by the classifier.
pipe = Pipeline(steps=[
    ('features', features),
    ('clf', clf),
])

In [None]:
# Train the whole pipeline on the training portion of the split.
pipe.fit(X=X_train, y=y_train)

In [None]:
# Predict author labels for the held-out rows.
preds = pipe.predict(X=X_test)

In [None]:
# Accuracy on the held-out fold. Arguments follow the documented
# (y_true, y_pred) order; accuracy itself is symmetric, but the same order is
# required for asymmetric metrics such as confusion_matrix.
accuracy_score(y_test, preds)