In [3]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
# NOTE(review): with no category or message filter this suppresses every
# warning for the rest of the session, so any deprecation or pandas
# copy-semantics warnings that later cells raise will not be shown.
warnings.filterwarnings('ignore')


In [4]:
# Load the authorship corpus. The frame as read from disk is kept under its
# own name so the feature/target split can be re-derived without re-reading.
raw_df = pd.read_csv('Authors-dataset.csv')

# Target: the 'author' column.
y = raw_df['author']

# Features: every column except the target and 'id'. drop() returns a new
# frame, so raw_df itself is left intact.
a_df = raw_df.drop(columns=['author', 'id'])

# Per-author row counts; as the last expression of the cell this is what
# the notebook displays.
y.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [5]:
# Show the feature frame that remains once 'author' and 'id' are dropped
# (long frames are abbreviated when rendered).
a_df

Unnamed: 0,text
0,"This process, however, afforded me no means of..."
1,It never once occurred to me that the fumbling...
2,"In his left hand was a gold snuff box, from wh..."
3,How lovely is spring As we looked from Windsor...
4,"Finding nothing else, not even gold, the Super..."
...,...
19574,"I could have fancied, while I looked at it, th..."
19575,The lids clenched themselves together as if in...
19576,"Mais il faut agir that is to say, a Frenchman ..."
19577,"For an item of news like this, it strikes us i..."


In [53]:
# The target is textual (author labels), so encode it as integer class ids.
# The fitted encoder is kept in `le` so ids can be mapped back to labels.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)


In [97]:
# Split on runs of whitespace: with gaps=True the pattern describes the
# separators rather than the tokens. The pattern is a raw string so that
# "\s" reaches the regex engine as written instead of being parsed as an
# (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r'\s+', gaps=True)

# English stop words, held as a set: preprocessing() tests membership once
# per token, and a set lookup avoids scanning the whole list each time.
sw_en = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

def preprocessing(text):
    """Normalise one document before vectorisation.

    The text is split with the module-level ``tokenizer``, each token is
    lower-cased, tokens present in ``sw_en`` are discarded, and the
    remaining tokens are lemmatised as verbs (``pos='v'``).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving lemmas, in their original order, joined by single
        spaces.
    """
    lemmas = []
    for raw_token in tokenizer.tokenize(text):
        word = raw_token.lower()
        if word in sw_en:
            continue
        lemmas.append(lemmatizer.lemmatize(word, pos='v'))
    return ' '.join(lemmas)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    a_df, y, test_size=0.2, random_state=42
)

# Attach the cleaned text with assign(), which builds a new frame, rather
# than writing a column into the frames handed back by the splitter (item
# assignment on a derived frame is what pandas reports as
# SettingWithCopyWarning).
x_train = x_train.assign(preprocessed_txt=x_train['text'].apply(preprocessing))

# The test-side column is spelled with a capital "P"; the TF-IDF cell reads
# it under exactly this name, so the spelling is kept as is.
x_test = x_test.assign(Preprocessed_txt=x_test['text'].apply(preprocessing))

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training text only, then
# project the held-out text with that same fitted vectoriser.
tfidf = TfidfVectorizer()

train_docs = x_train['preprocessed_txt']
test_docs = x_test['Preprocessed_txt']  # capital "P" on the test frame

x_train_tfidf = tfidf.fit_transform(train_docs)
x_test_tfidf = tfidf.transform(test_docs)

# Left disabled: tfidf.get_feature_names() would list the learned vocabulary
# (the method is named get_feature_names_out() in recent scikit-learn).

In [101]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the TF-IDF training matrix;
# fit() hands back the estimator, so construction and training are chained.
mlnb = MultinomialNB().fit(x_train_tfidf, y_train)

# Predicted class ids for the held-out documents.
y_test_pred = mlnb.predict(x_test_tfidf)

In [115]:
from sklearn.metrics import confusion_matrix,recall_score,precision_score,f1_score

# Confusion matrix on the held-out set: rows are the true classes, columns
# the predicted ones. Class order presumably follows le.classes_ — verify.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[1390,   61,  119],
       [ 227,  773,   71],
       [ 213,   36, 1026]], dtype=int64)

In [109]:
# Recall for each class separately (average=None disables averaging).
recall_score(y_true=y_test, y_pred=y_test_pred, average=None)

array([0.88535032, 0.72175537, 0.80470588])

In [113]:
# Precision for each class separately (average=None disables averaging).
precision_score(y_true=y_test, y_pred=y_test_pred, average=None)

array([0.75956284, 0.88850575, 0.84375   ])

In [116]:
# F1 score for each class separately (average=None disables averaging).
f1_score(y_true=y_test, y_pred=y_test_pred, average=None)

array([0.81764706, 0.79649665, 0.82376556])