In [None]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training split into a DataFrame and preview its first rows.
train_csv_path = '../input/spookyauthor/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

In [None]:
# Dimensions of the training frame as a (rows, columns) tuple.
data.shape

In [None]:
# Distinct author labels in the data (this returns the labels themselves, not how many there are).
data['author'].unique()

In [None]:
# Bar chart of the number of rows per author, to check the class balance.
sns.countplot(data=data, x='author')

## Feature Engineering

In [None]:
# function to remove punctuations
import string
# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, because it never changes.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    every other character (including whitespace) is kept as-is.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` without punctuation characters.
    """
    return text.translate(_PUNCT_TABLE)
# Strip punctuation from every text and preview the cleaned column.
data['text'] = data['text'].map(remove_punct)
data['text'].head()

In [None]:
# Load NLTK's English stop-word list (nothing is removed from the data in this cell)
sw=stopwords.words('english')
# display the list as a NumPy array
np.array(sw)

In [None]:
# Report how many entries the stop-word list `sw` contains.
stopword_total = len(sw)
print('Number of unique stopwords:', stopword_total)

In [None]:
# Stop words as a frozenset so each membership test is a hash lookup
# instead of a scan through the whole list.
_STOPWORD_SET = frozenset(sw)


# NOTE: this function is deliberately NOT called `stopwords`. That name is the
# nltk.corpus module imported at the top of the notebook; redefining it here
# made `stopwords.words('english')` fail whenever the earlier cell was re-run.
def remove_stopwords(text):
    """Lowercase ``text`` and drop every word that is in the stop-word list.

    Parameters
    ----------
    text : str
        Whitespace-separated words.

    Returns
    -------
    str
        The remaining lowercased words joined by single spaces.
    """
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in _STOPWORD_SET)


data['text'] = data['text'].apply(remove_stopwords)
data['text'].head()

In [None]:
# Learn the corpus vocabulary with a bag-of-words vectorizer.
count_vector = CountVectorizer().fit(data['text'])
# Keep the items of the fitted vocabulary mapping (term -> column index).
dictionary = count_vector.vocabulary_.items()

In [None]:
# Corpus-wide frequency of every vocabulary term (before stemming).
# `vocabulary_` maps each term to its COLUMN INDEX in the document-term matrix,
# not to a count, so the real frequencies are read from the matrix itself.
doc_term_counts = count_vector.transform(data['text'])
# Sum each column over all documents -> one total per term, flattened to 1-D.
term_totals = np.asarray(doc_term_counts.sum(axis=0)).ravel()

# Parallel lists: the terms and their total number of occurrences.
vocab = list(count_vector.vocabulary_)
count = [int(term_totals[column]) for column in count_vector.vocabulary_.values()]
# Series of counts indexed by term, most frequent first.
vocab_bef_stem = pd.Series(count, index=vocab).sort_values(ascending=False)

# Ten most frequent terms as a horizontal bar chart. The x-axis is autoscaled,
# because hard-coded limits only fit one particular vocabulary.
top_vacab = vocab_bef_stem.head(10)
ax = top_vacab.plot(kind='barh', figsize=(12, 5))
ax.set_xlabel('occurrences in corpus')
ax.set_title('Most frequent terms before stemming')
plt.show()

In [None]:
# Shared English Snowball stemmer used by `stemming`.
stemmer = SnowballStemmer("english")

def stemming(text):
    '''Stem every whitespace-separated word of ``text`` and re-join them with single spaces.'''
    return " ".join(stemmer.stem(token) for token in text.split())

In [None]:
# Replace every text with its stemmed version and preview the frame.
data['text'] = data['text'].map(stemming)
data.head()

In [None]:
# Create the TF-IDF vectorizer with default settings.
# The previous call passed "english" positionally, which binds to the `input`
# parameter (not `stop_words`) and is rejected by current scikit-learn, whose
# constructor arguments are keyword-only. Stop words have already been removed
# from data['text'] in an earlier cell, so no `stop_words` option is set here.
tfid_vectorizer = TfidfVectorizer()
# fit the vectorizer using the text data
tfid_vectorizer.fit(data['text'])
# collect the (term, column index) pairs of the fitted vocabulary
dictionary = tfid_vectorizer.vocabulary_.items()

In [None]:
# Corpus-wide TF-IDF weight of every vocabulary term (after stemming).
# `vocabulary_` maps each term to its COLUMN INDEX in the TF-IDF matrix, not to
# a weight, so the weights are read from the transformed matrix itself.
tfidf_weights = tfid_vectorizer.transform(data['text'])
# Sum each column over all documents -> one total weight per term, flattened to 1-D.
term_weights = np.asarray(tfidf_weights.sum(axis=0)).ravel()

# Parallel lists: the terms and their summed TF-IDF weight.
vocab = list(tfid_vectorizer.vocabulary_)
count = [float(term_weights[column]) for column in tfid_vectorizer.vocabulary_.values()]
# Series of weights indexed by term, heaviest first.
vocab_after_stem = pd.Series(count, index=vocab).sort_values(ascending=False)

# Ten highest-weighted terms as a horizontal bar chart. The x-axis is
# autoscaled, because hard-coded limits only fit one particular vocabulary.
top_vacab = vocab_after_stem.head(10)
ax = top_vacab.plot(kind='barh', figsize=(5, 10))
ax.set_xlabel('summed TF-IDF weight')
ax.set_title('Highest-weighted terms after stemming')
plt.show()

In [None]:
# Finding the character length of each text
def length(text):
    """Return ``len(text)``, i.e. the number of characters when ``text`` is a string."""
    return len(text)
# Store the length of every (already cleaned) text in a new column and preview.
data['length'] = data['text'].map(length)
data.head()

In [None]:
# Overlaid histograms of text length, one per author.
EAP = data.loc[data['author'] == 'EAP']
HPL = data.loc[data['author'] == 'HPL']
MWS = data.loc[data['author'] == 'MWS']

matplotlib.rcParams['figure.figsize'] = (12.0, 8.0)
bins = 500
# (rows of one author, bar opacity, legend label), drawn in this order.
for author_rows, opacity, legend_label in (
    (EAP, 0.6, 'EAP'),
    (HPL, 0.8, 'HPL'),
    (MWS, 0.4, 'MWS'),
):
    plt.hist(author_rows['length'], alpha=opacity, bins=bins, label=legend_label)
plt.xlabel('length')
plt.ylabel('numbers')
plt.legend(loc='upper right')
# only lengths from 0 to 300 are shown
plt.xlim(0, 300)
plt.grid()
plt.show()

In [None]:
# extract the TF-IDF representation (sparse matrix) of the text data
tfid_matrix = tfid_vectorizer.transform(data['text'])
# Convert to a plain NumPy ndarray. `toarray()` is used instead of `todense()`,
# which returns the legacy `np.matrix` type rather than an array.
# NOTE: the dense form holds one float per (document, term) pair, so it can be
# very large in memory.
array = tfid_matrix.toarray()
# store the TF-IDF array in a pandas DataFrame (one column per vocabulary term)
df = pd.DataFrame(array)
df.head()

## Training Model

In [None]:
# Attach the target label (author) and the row id to the TF-IDF feature frame.
# Both assignments insert a column into `df` in place.
# NOTE(review): assumes `df` and `data` still share the same row index — confirm no rows were dropped or reordered.
df['output'] = data['author']
df['id'] = data['id']
df.head()

In [None]:
# Name of the target column.
output = 'output'
# Start from every column of df, then drop the ones that are not model inputs.
features = df.columns.tolist()
for non_feature in (output, 'id'):
    features.remove(non_feature)

In [None]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate smoothing values: 20 evenly spaced points from 0.006 to 0.1,
# rounded to 4 decimals.
alpha_list1 = np.around(np.linspace(0.006, 0.1, 20), decimals=4)

# Search space and the estimator to tune.
parameter_grid = [{"alpha": alpha_list1}]
classifier1 = MultinomialNB()
# Grid search with 4-fold cross validation, scored by negative log loss.
gridsearch1 = GridSearchCV(
    estimator=classifier1,
    param_grid=parameter_grid,
    scoring='neg_log_loss',
    cv=4,
)
# Run the search on the feature columns against the target column.
gridsearch1.fit(df[features], df[output])

In [None]:
# Best mean cross-validated score found by the grid search
# (scoring is 'neg_log_loss', so values closer to 0 are better).
best_cv_score = gridsearch1.best_score_
print("Best score: ", best_cv_score)