# Spooky Author Identification Model

Model : sort the text as per its author.

## Importing Libraries

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import seaborn as sns
sns.set()

In [73]:
data = pd.read_csv("spooky_train.csv")

In [74]:
data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [75]:
data.describe()

Unnamed: 0,id,text,author
count,19579,19579,19579
unique,19579,19579,3
top,id26305,"This process, however, afforded me no means of...",EAP
freq,1,1,7900


## Preprocessing the data

In [76]:
# importing the libraries
import pandas as pd
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Download necessary NLTK data files (only need to run once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAKSHITA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RAKSHITA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RAKSHITA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [77]:
stemmer = PorterStemmer()
lemmate = WordNetLemmatizer()
def preprocessing (text):
    text = text.lower()
    text = text.replace(r'[^a-zA-Z]', "")
    token = word_tokenize(text)
    stem = [stemmer.stem(word) for word in token]
    lemman = [lemmate.lemmatize(word) for word in stem]
    return " ".join(lemman)

In [78]:
x = data["text"]

In [79]:
x = x.apply(lambda x: preprocessing(x))

In [80]:
x = x.str.replace(r'[a-zA-Z]',"")

### Vectorisation

In [81]:
y = data["author"].map({"EAP": 0, "HPL": 1, "MWS":2})

In [82]:
vectorise = TfidfVectorizer(stop_words ="english")
X = vectorise.fit_transform(x)

## Model

In [83]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

m = MultinomialNB()
model = m.fit(X,y)

In [84]:
pred_train = model.predict(X)

In [85]:
score_train = accuracy_score(y,pred_train)
score

0.8828847234281628

In [86]:
report = classification_report(y,pred_train)
report

'              precision    recall  f1-score   support\n\n           0       0.85      0.92      0.88      7900\n           1       0.95      0.82      0.88      5635\n           2       0.88      0.88      0.88      6044\n\n    accuracy                           0.88     19579\n   macro avg       0.89      0.88      0.88     19579\nweighted avg       0.89      0.88      0.88     19579\n'

In [87]:
test = pd.read_csv("Spooky_test.csv")


In [88]:
X_test = test["text"]

In [89]:
X_test = X_test.apply(lambda x: preprocessing(x))

In [90]:
X_test_t = vectorise.transform(X_test)

In [91]:
pred_test = model.predict(X_test_t)

In [92]:
pred_test

array([2, 0, 1, ..., 0, 2, 1], dtype=int64)