In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Load the labelled training sentences and preview the first rows.
data = pd.read_csv(
    "/kaggle/input/spooky-author-identification/train.csv"
)
data.head()

We map "EAP" to 0, "HPL" to 1 and "MWS" to 2, as numeric labels are more convenient for our classifier.
In other words, we are just telling our computer that if the classifier predicts 0 for a text it is predicting "EAP", if it predicts 1 it is predicting "HPL", and if it predicts 2 it is predicting "MWS".

In [None]:
# Encode each author label as an integer class id and store it in a new column.
author_to_id = {'EAP': 0, 'HPL': 1, 'MWS': 2}
data['author_num'] = data["author"].map(author_to_id)
data.head()

## Define X and y

In [None]:
# Features are the raw sentences; the target is the integer-encoded author.
X, y = data['text'], data['author_num']

## Split training and test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled sentences for evaluation.
# - random_state pins the shuffle, so a "Restart & Run All" reproduces the same
#   split and therefore the same scores reported further down.
# - stratify=y keeps the author proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## Vectorisation

#### Count Vectorizer: builds a dictionary of features and transforms documents to feature vectors.



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

* Example
* Below: the word "life" appears 2 times in both sentence 0 and sentence 1
* The word "paul" appears 1 time in sentence 0 and 0 times in sentence 1
* and so on...

In [None]:
# Toy corpus: two sentences that differ only in the two names they mention.
text = ["My name is Paul my life is Jane! And we live our life together" , "My name is Guido my life is Victoria! And we live our life together"]
toy = CountVectorizer(stop_words='english')

# fit_transform learns the vocabulary and returns the document-term matrix in
# one pass, so a separate transform() call on the same text is not needed.
matrix = toy.fit_transform(text)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(toy, "get_feature_names_out"):
    features = toy.get_feature_names_out()
else:
    features = toy.get_feature_names()

# One row per sentence, one column per vocabulary word, values are word counts.
df_res = pd.DataFrame(matrix.toarray(), columns=features)
df_res

In [None]:
# Bag-of-words vectoriser for the real data; English stop words are dropped.
vect = CountVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary from the training sentences and build their count matrix.
X_train_matrix = vect.fit_transform(X_train)

## Model 1 with count vectorizer

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the word-count features.
clf = MultinomialNB().fit(X_train_matrix, y_train)

# Accuracy on the data the model was trained on.
train_accuracy = clf.score(X_train_matrix, y_train)
print(train_accuracy)

# Accuracy on the held-out sentences, vectorised with the already-fitted vocabulary.
X_test_matrix = vect.transform(X_test)
test_accuracy = clf.score(X_test_matrix, y_test)
print(test_accuracy)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of model 1 on the held-out sentences.
predicted_result = clf.predict(X_test_matrix)
report = classification_report(y_test, predicted_result)
print(report)

#### Tf-idf: 

* Since longer documents will have higher average count values than shorter documents, even though they might talk about the same topics, we can divide the number of occurrences of each word in a document by the total number of words in the document: **tf** for Term Frequencies.

* **idf** stands for Inverse Document Frequency, and **tf-idf** for “Term Frequency times Inverse Document Frequency”: it downscales the weights of words that occur in many documents in the corpus and are therefore less informative than those that occur in only a small portion of the corpus.

* The CountVectorizer and TfidfTransformer steps can be combined into one using [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html):

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectoriser with English stop words dropped, fitted on the training
# sentences only.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)

# Show the dimensions of the resulting training matrix.
X_train_tfidf.shape

## Model 2 with TfidfVectorizer

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Same classifier as model 1, this time trained on the TF-IDF features.
clf2 = MultinomialNB().fit(X_train_tfidf, y_train)

# Accuracy on the data the model was trained on.
train_accuracy_tfidf = clf2.score(X_train_tfidf, y_train)
print(train_accuracy_tfidf)

# Accuracy on the held-out sentences, vectorised with the already-fitted TF-IDF model.
X_test_tfidf = vectorizer.transform(X_test)
test_accuracy_tfidf = clf2.score(X_test_tfidf, y_test)
print(test_accuracy_tfidf)

* It doesn't perform better in terms of accuracy

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of model 2 on the held-out sentences.
predicted_result_2 = clf2.predict(X_test_tfidf)
report_2 = classification_report(y_test, predicted_result_2)
print(report_2)

* there might be something to learn from the predictions on class 2

# Submission

In [None]:
# Load the sample submission and preview it to see the expected layout.
sample = pd.read_csv(
    "/kaggle/input/spooky-author-identification/sample_submission.csv"
)
sample.head()

In [None]:
# Load the competition test sentences.
test = pd.read_csv(
    "/kaggle/input/spooky-author-identification/test.csv"
)

# Vectorise them with the count vectoriser fitted earlier, then get the
# per-class probabilities from model 1 (one column per class).
test_matrix = vect.transform(test["text"])
predicted_result = clf.predict_proba(test_matrix)

In [None]:
# Assemble the submission table: the sentence id followed by one probability
# column per author, taken from columns 0, 1 and 2 of the prediction matrix.
result = pd.DataFrame(
    {
        "id": test["id"],
        "EAP": predicted_result[:, 0],
        "HPL": predicted_result[:, 1],
        "MWS": predicted_result[:, 2],
    }
)
result.head()

In [None]:
# Write the submission file without the DataFrame index column.
result.to_csv(path_or_buf="submission_v1.csv", index=False)