<a href="https://colab.research.google.com/github/priyanshgupta1998/Machine_learning/blob/master/kaggle/NLP-CLSSIFICATION-Spooky_Author_Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spooky Author Identification
Share code and discuss insights to identify horror authors from their writings

In [3]:
# Colab-only setup: upload the Kaggle API token and install it for the Kaggle CLI.
from google.colab import files
# Opens the Colab file-upload prompt; the saved output shows kaggle.json being uploaded.
files.upload()
# Install the Kaggle command-line client (quiet mode).
!pip install -q kaggle
# Create ~/.kaggle (no error if it already exists) and copy the uploaded token into it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [4]:
# Download the competition archives with the Kaggle CLI
# (the saved output lists sample_submission.zip, test.zip and train.zip).
!kaggle competitions download -c spooky-author-identification

Downloading sample_submission.zip to /content
  0% 0.00/29.4k [00:00<?, ?B/s]
100% 29.4k/29.4k [00:00<00:00, 11.3MB/s]
Downloading test.zip to /content
  0% 0.00/538k [00:00<?, ?B/s]
100% 538k/538k [00:00<00:00, 60.4MB/s]
Downloading train.zip to /content
  0% 0.00/1.26M [00:00<?, ?B/s]
100% 1.26M/1.26M [00:00<00:00, 86.0MB/s]


In [0]:
import pandas as pd
import numpy as np

In [0]:
from tqdm import tqdm
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping


from sklearn import metrics, pipeline
from sklearn.model_selection import GridSearchCV

from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from NLTK.
# NOTE(review): this call needs the NLTK 'stopwords' corpus to be available locally;
# nothing visible in this notebook downloads it -- confirm, or run
# nltk.download('stopwords') beforehand. The list is not referenced by the cells visible here.
stop_words = stopwords.words('english')

In [6]:
!unzip train.zip

Archive:  train.zip
  inflating: train.csv               


In [7]:
# Load the training data, print its (rows, columns) shape and display the first rows.
train = pd.read_csv('train.csv')
print(train.shape)
train.head()

(19579, 3)


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [8]:
train.info()   # every column reports 19579 non-null entries, i.e. there are no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
id        19579 non-null object
text      19579 non-null object
author    19579 non-null object
dtypes: object(3)
memory usage: 459.0+ KB


The problem requires us to predict the author (EAP, HPL or MWS) given the text. In simpler words, this is text classification with 3 different classes.

In [12]:
# Encode the author names as integer class ids (0, 1, 2) with scikit-learn's LabelEncoder.
from sklearn import preprocessing
print(train.author.values)
# Learn the label -> id mapping first, then apply it to the same column.
lbl_enc = preprocessing.LabelEncoder().fit(train.author.values)
y = lbl_enc.transform(train.author.values)
print(y)

['EAP' 'HPL' 'EAP' ... 'EAP' 'EAP' 'HPL']
[0 1 0 ... 0 0 1]


In [0]:
# Hold out 10% of the rows as a validation set before building any features.
# The split is shuffled, stratified on the encoded author label and seeded.
from sklearn.model_selection import train_test_split
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.text.values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [14]:
# Sizes of the training and validation splits, one per line.
print(xtrain.shape, xvalid.shape, sep='\n')

(17621,)
(1958,)


# Building Basic Models
Let's start building our very first model.

Our very first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) followed by a simple Logistic Regression.
# (1.) Logistic Regression
# TF-IDF

In [0]:
# Word-level TF-IDF features over 1- to 3-grams.
# Always start with these features. They work (almost) every time!
from sklearn.feature_extraction.text import TfidfVectorizer
tfv = TfidfVectorizer(min_df=3,                 # ignore n-grams found in fewer than 3 texts
                      max_features=None,        # no cap on the vocabulary size
                      strip_accents='unicode',
                      analyzer='word',
                      token_pattern=r'\w{1,}',  # a token is any run of one or more word characters
                      ngram_range=(1, 3),
                      # Real booleans instead of 1: recent scikit-learn releases validate
                      # estimator parameters and reject integers for boolean options.
                      use_idf=True, smooth_idf=True,
                      sublinear_tf=True,        # replace tf with 1 + log(tf)
                      stop_words='english')



In [18]:
# Peek at the first five training texts.
list(xtrain[:5])

['Her hair was the brightest living gold, and despite the poverty of her clothing, seemed to set a crown of distinction on her head.',
 '"No," he said, "oh, no a member of my family my niece, and a most accomplished woman."',
 'The magistrate appeared at first perfectly incredulous, but as I continued he became more attentive and interested; I saw him sometimes shudder with horror; at others a lively surprise, unmingled with disbelief, was painted on his countenance.',
 'Then all at once the horrible eyes, blacker even than the seared face in which they were set, opened wide with an expression which I was unable to interpret.',
 'He was no longer bent to the ground, like an over nursed flower of spring, that, shooting up beyond its strength, is weighed down even by its own coronal of blossoms.']

In [0]:
# Fit the TF-IDF vocabulary and IDF weights on the training AND validation texts together.
# NOTE(review): the validation texts therefore influence the features the models are
# later scored on -- confirm this is intended.
tfv.fit(list(xtrain) + list(xvalid))  # fit on the concatenation of both splits
xtrain_tfv =  tfv.transform(xtrain)  # TF-IDF matrix for the training texts
xvalid_tfv = tfv.transform(xvalid)   # TF-IDF matrix for the validation texts

In [26]:
# Compare the raw first training text with its TF-IDF row.
print(len(xtrain[0]))  # number of characters in the first training text (not a word count)
print(len(xtrain_tfv[0].data))  # number of stored TF-IDF weights in the first row, i.e. vocabulary n-grams matched in that text
xtrain_tfv[0].data

130
11


array([0.24694244, 0.31534174, 0.26213576, 0.21913481, 0.26998034,
       0.2821067 , 0.33928339, 0.29135328, 0.33459501, 0.34746745,
       0.37105636])

In [28]:
# Baseline model: logistic regression trained on the TF-IDF features.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0).fit(xtrain_tfv, ytrain)
# Class-probability predictions for the validation split.
predictions = clf.predict_proba(xvalid_tfv)



In [29]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi-class logarithmic loss, averaged over samples.

    :param actual: True targets, either a 1-D array-like of integer class
        indices or a 2-D one-hot (indicator) array-like with one row per sample.
    :param predicted: 2-D array-like of predicted probabilities, one row per
        sample and one column per class.
    :param eps: Probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken, so a predicted 0 cannot yield an infinite loss.
    :return: Mean negative log-likelihood of the true classes.
    """
    # Accept plain Python lists as well as NumPy arrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert class indices to a one-hot matrix if they are not one already.
    if actual.ndim == 1:
        n_samples = actual.shape[0]
        one_hot = np.zeros((n_samples, predicted.shape[1]))
        # Vectorized equivalent of setting one_hot[i, actual[i]] = 1 for every row i.
        one_hot[np.arange(n_samples), actual] = 1
        actual = one_hot

    clipped = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    # Only the log-probability of each row's true class survives the product.
    log_likelihood = np.sum(actual * np.log(clipped))
    return -1.0 / rows * log_likelihood
  
# Validation log loss of the predictions from the TF-IDF logistic regression.
print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.626 


`And there we go. We have our first model with a multiclass logloss of 0.626.`

`But we are greedy and want a better score. Let's look at the same model with different features.`

`Instead of using TF-IDF, we can also use word counts as features. This can be done easily using CountVectorizer from scikit-learn.`

# CountVectorizer

In [0]:
# Bag-of-words alternative to TF-IDF: raw counts of word 1- to 3-grams, using the
# same token pattern and built-in English stop-word list as the TF-IDF vectorizer.
from sklearn.feature_extraction.text import CountVectorizer
ctv = CountVectorizer(analyzer='word',
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 3), 
                      stop_words = 'english')



In [31]:
# Fit the CountVectorizer vocabulary on the training and validation texts together
# (only xtrain and xvalid are passed in), then turn each split into a count matrix.
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)
# Fit a logistic regression on the count features and score it on the validation split.
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))



logloss: 0.528 


# (2.) Naive Bayes

In [33]:
# Multinomial Naive Bayes trained on the TF-IDF features.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

# Validation log loss for this model.
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.578 


`Good performance! But the logistic regression on counts is still better! What happens when we use this model on counts data instead?`

In [34]:
# Multinomial Naive Bayes trained on the raw count features.
clf = MultinomialNB().fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

# Validation log loss for this model.
print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.485 


# (3.) SVM
`Now, we must try SVM on this dataset.`

`Since SVMs take a lot of time, we will reduce the number of features from the TF-IDF using "Singular Value Decomposition" before applying SVM.`

`Also, note that before applying SVMs, we must standardize the data.`


`SVMs work well on small datasets.`



In [0]:
# Reduce the sparse TF-IDF matrix to 120 dense components with truncated SVD.
# The author chose 120; 120-200 components are considered good enough for the SVM model.
from sklearn import decomposition
from sklearn.decomposition import TruncatedSVD
# Fixed seed: the default solver is randomized, so without random_state the
# components (and every downstream score) change from run to run.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(xtrain_tfv)   # learn the projection from the training matrix only
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

In [40]:
# Compare the first training row before and after the SVD projection.
print(len(xtrain_tfv[0].data))  # number of stored TF-IDF weights in the sparse row (not the vocabulary size)
print(len(xtrain_svd[0]))      # number of dense SVD components per row
print(xtrain_svd[0])

11
120
[ 0.04189869 -0.00797828 -0.00538435 -0.01425648 -0.00417284 -0.00739562
  0.00173417  0.00011446  0.00603873  0.00455325  0.01616831 -0.01336801
  0.00367796 -0.03961204 -0.00472495  0.00872024  0.00271818 -0.00234798
 -0.014717    0.01248365  0.01845469 -0.01312352  0.003614   -0.00623844
 -0.00276322 -0.00147046 -0.01050665 -0.01701258  0.00162414 -0.00527369
 -0.00694241  0.00145785 -0.0055838  -0.0167547   0.01810423 -0.00980061
 -0.00463975  0.00789302 -0.0200923  -0.01405515  0.01130056 -0.02099268
 -0.0231038  -0.00698166 -0.03539095 -0.02199343 -0.01209134 -0.0039957
  0.01007911 -0.02249449 -0.02530505 -0.02138792 -0.02299919 -0.01983994
 -0.0353419   0.02633245 -0.01542837  0.02278953 -0.035088    0.01840937
 -0.06274597 -0.05441819  0.00368279  0.04519979 -0.02793083 -0.03297946
 -0.04978978 -0.04704619  0.03141998  0.05647044  0.00081257  0.02210699
 -0.01357144 -0.064411    0.01407232  0.02586338  0.02903282 -0.01533849
  0.03235539 -0.01554527  0.01977583  0.01410

In [0]:
# Standardize the SVD features. The scaler is fitted on the training matrix only and
# the scaled copies get new names, so the unscaled xtrain_svd / xvalid_svd stay available.
from sklearn import preprocessing
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

In [42]:
# First training row after standardization.
print(xtrain_svd_scl[0])

[-0.31921348 -0.099407   -0.13853613 -0.22042402 -0.02610204 -0.20672647
  0.08450312  0.00517442  0.16667216  0.10149781  0.39362447 -0.3737355
  0.16326484 -1.15749032 -0.13070938  0.22015503  0.05082299 -0.04981137
 -0.45883857  0.34349224  0.5303299  -0.35950506  0.12330967 -0.09541838
 -0.0775695  -0.03527053 -0.35225232 -0.51663018  0.08239672 -0.16481299
 -0.23777182  0.06769101 -0.17793484 -0.55267278  0.58824918 -0.2934404
 -0.18136289  0.27300274 -0.6479186  -0.43582745  0.3823663  -0.67059375
 -0.77908639 -0.26736711 -1.17583661 -0.7436248  -0.43531877 -0.12272632
  0.29318478 -0.78394682 -0.87763324 -0.74881843 -0.78228095 -0.68477776
 -1.26040384  0.95702904 -0.55304212  0.79088256 -1.235759    0.65258349
 -2.28330175 -1.97262621  0.12543899  1.65029555 -1.02380683 -1.2049027
 -1.84801871 -1.72411791  1.1735457   2.11289362  0.01812502  0.82460716
 -0.53085067 -2.40932084  0.52951042  0.97577121  1.11115665 -0.58556269
  1.24746091 -0.58532278  0.75948787  0.54251627 -0.15

In [44]:
# Support vector classifier trained on the standardized SVD features.
from sklearn.svm import SVC
# probability=True is needed so that predict_proba is available for the log-loss metric.
clf = SVC(C=1.0, probability=True).fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 0.734 


`Looks like SVM doesn't perform well on this data! Before moving further, let's apply the most popular algorithm on Kaggle: xgboost!`

# (4.) XGBoost

In [0]:
# Gradient-boosted trees (xgboost) trained on the sparse TF-IDF features.
import xgboost as xgb

clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(xtrain_tfv.tocsc(), ytrain)  # matrices are converted to CSC before being passed in

predictions = clf.predict_proba(xvalid_tfv.tocsc())

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [0]:
# Same xgboost configuration, trained on the CountVectorizer features.
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(xtrain_ctv.tocsc(), ytrain)  # matrices are converted to CSC before being passed in

predictions = clf.predict_proba(xvalid_ctv.tocsc())

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [0]:
# Same xgboost configuration, trained on the dense (unscaled) TF-IDF SVD features.
clf = xgb.XGBClassifier(
    max_depth=7,
    n_estimators=200,
    colsample_bytree=0.8,
    subsample=0.8,
    nthread=10,
    learning_rate=0.1,
).fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

In [0]:
# xgboost on the TF-IDF SVD features with library defaults; only the thread count is set.
clf = xgb.XGBClassifier(nthread=10).fit(xtrain_svd, ytrain)
predictions = clf.predict_proba(xvalid_svd)

print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))