# I. Importing Libraries and Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spooky-author-identification/test.zip
/kaggle/input/spooky-author-identification/sample_submission.zip
/kaggle/input/spooky-author-identification/train.zip


In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import string

Using TensorFlow backend.


In [3]:
# read_csv is handed the .zip paths directly; note that data_val is loaded
# from the competition's test.zip.
train_path = "/kaggle/input/spooky-author-identification/train.zip"
val_path = "/kaggle/input/spooky-author-identification/test.zip"
data_train, data_val = pd.read_csv(train_path), pd.read_csv(val_path)

# Report each frame's (rows, columns), then preview the first training rows.
for message, frame in (
    ('Training data shape:', data_train),
    ('Validation data shape:', data_val),
):
    print(message, frame.shape)
data_train.head()

Training data shape: (19579, 3)
Validation data shape: (8392, 2)


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


# II. Text Preprocessing

In [4]:
# NLTK's English stop-word list, stored as a set for membership checks.
_english_stop_words = stopwords.words('english')
StopWords = set(_english_stop_words)

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to text_preprocess.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_preprocess(text, stop_words=None):
    """Strip punctuation, lowercase, and drop stop words from ``text``.

    Args:
        text: The raw string to clean.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the notebook-level ``StopWords`` set is used.

    Returns:
        The remaining words, lowercased and joined by single spaces.
    """
    if stop_words is None:
        stop_words = StopWords
    # Punctuation is removed before the stop-word check, so a token such as
    # "don't" is compared as "dont".
    lowered = (word.lower() for word in text.translate(_PUNCTUATION_TABLE).split())
    return ' '.join(word for word in lowered if word not in stop_words)

# Overwrite the text column of both frames with its cleaned version, then
# preview the training frame.
for frame in (data_train, data_val):
    frame['text'] = frame['text'].apply(text_preprocess)
data_train.head()

Unnamed: 0,id,text,author
0,id26305,process however afforded means ascertaining di...,EAP
1,id17569,never occurred fumbling might mere mistake,HPL
2,id11008,left hand gold snuff box capered hill cutting ...,EAP
3,id27763,lovely spring looked windsor terrace sixteen f...,MWS
4,id12958,finding nothing else even gold superintendent ...,HPL


# III. Label Encoding, Lemmatization, and Tokenization

In [5]:
# Cleaned texts as plain Python lists; the "test" texts come from data_val.
X_train = data_train['text'].tolist()
X_test = data_val['text'].tolist()

# Encode the author column as integer class ids, then one-hot them (3 classes).
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(data_train['author'])
y_train_cat = ku.to_categorical(y_train, num_classes=3)

# Row ids of data_val, used later as the id column of the submission.
val_id = data_val['id']

lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of ``text``.

    Each word is lemmatized first as a verb (``pos='v'``); the result is then
    lemmatized again with the lemmatizer's default part of speech.

    Args:
        text: A string of whitespace-separated words.

    Returns:
        The lemmatized words joined by single spaces.
    """
    return ' '.join(
        lemmatizer.lemmatize(lemmatizer.lemmatize(word, pos='v'))
        for word in text.split()
    )


# One helper serves both splits. Joining with ' ' avoids the stray leading
# space and the repeated string concatenation of a manual accumulator.
X_train_lemm = [lemmatize_text(text) for text in X_train]
X_test_lemm = [lemmatize_text(text) for text in X_test]

# Length passed to pad_sequences for every sequence.
max_len = 150

# The vocabulary is learned from the lemmatized training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_lemm)
vocab_size = len(tokenizer.word_index)

# Convert both splits to integer sequences, then pad each to max_len.
train_seq, test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train_lemm, X_test_lemm)
)
train_pad, test_pad = (
    pad_sequences(sequences, maxlen=max_len) for sequences in (train_seq, test_seq)
)

# Author labels mapped to the integer ids 0, 1 and 2.
label2idx = dict(zip(('EAP', 'HPL', 'MWS'), range(3)))

# IV. Training Logistic Regression on TF-IDF Features

In [6]:
# Vectorizer settings: 1- to 3-grams, with min_df=5 and max_df=0.5 as the
# document-frequency cut-offs.
tfidf_options = dict(ngram_range=(1, 3), min_df=5, max_df=0.5)
tfidf = TfidfVectorizer(**tfidf_options)

# Fit on the training texts only; the test texts reuse the fitted vectorizer.
X_train_tfidf = tfidf.fit_transform(X_train_lemm)
X_test_tfidf = tfidf.transform(X_test_lemm)

In [7]:
# Fit logistic regression on the TF-IDF features and integer author ids.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

# Per-class probabilities and hard class predictions for the test texts.
output_prob = clf.predict_proba(X_test_tfidf)
y_pred = clf.predict(X_test_tfidf)
print(y_pred)

# Display the first probability column as the cell output.
output_prob[:, 0]

[2 0 1 ... 0 2 1]


array([0.18965379, 0.51298162, 0.37707488, ..., 0.64167805, 0.26446662,
       0.45248174])

# V. Training using Bi-LSTM NN

In [8]:
# Two stacked bidirectional LSTMs over an embedding layer, ending in a
# 3-way softmax.
model = keras.Sequential()
for layer in (
    # vocab_size + 1 embedding rows; presumably the extra row covers index 0,
    # which pad_sequences uses for padding. Confirm against the Tokenizer docs.
    keras.layers.Embedding(vocab_size + 1, 300, input_length=max_len),
    keras.layers.SpatialDropout1D(0.5),
    keras.layers.Bidirectional(
        keras.layers.LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)
    ),
    keras.layers.Bidirectional(
        keras.layers.LSTM(32, dropout=0.3, recurrent_dropout=0.3)
    ),
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dense(3, activation='softmax'),
):
    model.add(layer)

opt = keras.optimizers.Adam(learning_rate=0.01)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the padded training sequences against the one-hot author labels.
history = model.fit(train_pad, y_train_cat, batch_size=512, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [9]:
# Print each layer's output shape and parameter count, plus the model totals.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 300)          5430300   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 150, 300)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 150, 128)          186880    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 300)               19500     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 903       
Total params: 5,678,799
Trainable params: 5,678,799
Non-trainable params: 0
______________________________________________

In [10]:
# Sequential.predict_classes is deprecated and absent from newer Keras
# releases. Taking the argmax over the softmax outputs gives the same class
# ids and works on both old and new versions.
y_pred_nn = np.argmax(model.predict(test_pad), axis=-1)
print(y_pred_nn)


[2 0 0 ... 2 2 1]


In [11]:
# Compare the class ids predicted by the two models.
from sklearn.metrics.pairwise import cosine_similarity

# Fraction of test rows on which both models predict the same class id. This
# is the directly interpretable agreement measure for categorical labels.
agreement = np.mean(np.asarray(y_pred) == np.asarray(y_pred_nn))
print('Prediction agreement:', agreement)

# Cosine similarity of the raw label vectors is kept as the cell output.
# It depends on the arbitrary 0/1/2 encoding (rows labelled 0 contribute
# nothing to the dot product), so do not read it as an agreement rate.
cosine_similarity([y_pred], [y_pred_nn])

array([[0.86144056]])

In [12]:
# Submission file: one row per test id with one probability column per author.
# predict_proba orders its columns by clf.classes_, so the column names are
# recovered from that order through the label encoder instead of being
# hard-coded against column positions.
author_columns = label_encoder.inverse_transform(clf.classes_)
df = pd.DataFrame(output_prob, columns=author_columns)
# to_numpy() inserts the ids positionally, without index alignment.
df.insert(0, 'id', val_id.to_numpy())

df.to_csv('Submission.csv', index=False)