In [1]:
import pandas as pd
import numpy as np
import gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn import preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import xgboost as xgb

from tqdm import tqdm

nltk.download('stopwords')
nltk.download('punkt')
stops = set(stopwords.words('english'))

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!




In [2]:
train = pd.read_csv('train.csv')
print(len(train))
train.head()

19579


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [3]:
test = pd.read_csv('test.csv')
print(len(test))
test.head()

8392


Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [4]:
label_enconder = preprocessing.LabelEncoder()
label_enconder.fit(train['author'])
train['label_encoded'] = label_enconder.transform(train['author'])
train.head()

Unnamed: 0,id,text,author,label_encoded
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


## Preprocessing function

In [31]:
def transformText(text):
    
    stops = set(stopwords.words("english"))
    
    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars    
    text = re.sub(r'[^\x00-\x7f]',r' ',text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Removing all the stopwords
    #filtered_words = [word for word in text.split() if word not in stops]
    filtered_words = [word for word in text.split()]

    # Removing all the tokens with lesser than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)
    
    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)
    
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    
    # Strip multiple whitespaces
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)
    
    # Stemming
    text = gensim.parsing.preprocessing.stem_text(text)
    return text

## Creating preprocessing column on train and testing

In [32]:
train['text_processed']=train['text'].apply(lambda x: transformText(x))
train.tail()

Unnamed: 0,id,text,author,label_encoded,text_processed
19574,id17718,"I could have fancied, while I looked at it, th...",EAP,0,could have fanci while look it that some emin ...
19575,id08973,The lids clenched themselves together as if in...,EAP,0,the lid clench themselv togeth spasm
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP,0,mai faut agir that sai frenchman never faint o...
19577,id17513,"For an item of news like this, it strikes us i...",EAP,0,for item new like thi strike wa veri coolli re...
19578,id00393,"He laid a gnarled claw on my shoulder, and it ...",HPL,1,laid gnarl claw shoulder and seem that it shak...


In [33]:
test['text_processed']=test['text'].apply(lambda x: transformText(x))
print(len(test))
test.tail()

8392


Unnamed: 0,id,text,text_processed
8387,id11749,All this is now the fitter for my purpose.,all thi now the fitter for purpos
8388,id10526,I fixed myself on a wide solitude.,fix myself wide solitud
8389,id13477,It is easily understood that what might improv...,easili understood that what might improv close...
8390,id13761,"Be this as it may, I now began to feel the ins...",thi mai now began feel the inspir burn hope an...
8391,id04282,"Long winded, statistical, and drearily genealo...",long wind statist and drearili genealog some t...


## Train/Test split

In [34]:
x_train, x_test, y_train, y_test = train_test_split(train['text_processed'], train['label_encoded'], test_size = 0.2, random_state = 4)
true_label = np.array(y_test)
print("#" * 20 + " Some stats " + "#"*20)
print("Dataset training: {} uterances".format(x_train.shape[0]))
print("Dataset testing: {} uterances".format(x_test.shape[0]))
print("Different classes: {}".format(len(y_train.unique())))

#################### Some stats ####################
Dataset training: 15663 uterances
Dataset testing: 3916 uterances
Different classes: 3


## Loading Glove vectors as features

In [10]:
## Loading Glove vectors
embeddings_index = {}
f = open('../../vectors/glove.42B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

1917495it [07:07, 4480.89it/s]

Found 1917495 word vectors.





In [15]:
print(embeddings_index['word'].shape)
embeddings_index['word']

(300,)


array([ -1.31420001e-01,  -2.74459988e-01,   2.31170002e-03,
         3.83850001e-02,  -4.46520001e-01,  -5.98660000e-02,
        -3.34130001e+00,   5.85630000e-01,   4.86969985e-02,
        -8.89739990e-02,   9.92240012e-02,  -1.87179998e-01,
         1.64120004e-01,  -1.91770002e-01,  -6.05069995e-02,
        -9.82039981e-03,  -2.91329995e-02,  -1.77550003e-01,
         2.17209995e-01,   3.26810002e-01,  -1.12029999e-01,
        -1.17569998e-01,   2.53160000e-02,   2.46380001e-01,
         2.39759997e-01,   3.25869992e-02,  -1.62080005e-01,
        -2.00859994e-01,   3.39760005e-01,  -8.54809999e-01,
        -2.13070005e-01,  -3.86139989e-01,   1.61249995e-01,
         7.11169988e-02,  -2.73689985e-01,  -9.66690015e-03,
         1.70330003e-01,   5.04290015e-02,  -3.08050007e-01,
        -2.93749988e-01,   1.34839997e-01,   4.98140007e-01,
        -3.91119987e-01,   2.01810002e-02,   3.13430011e-01,
        -9.30779986e-03,   6.50430024e-01,  -1.48819998e-01,
         1.72390006e-02,

In [35]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [36]:
## Create sentence vectors for the dataset
xtrain_glove = [sent2vec(x) for x in tqdm(x_train)]
xvalid_glove = [sent2vec(x) for x in tqdm(x_test)]


  0%|          | 0/15663 [00:00<?, ?it/s][A
  5%|▌         | 826/15663 [00:00<00:01, 8257.57it/s][A
 10%|█         | 1640/15663 [00:00<00:01, 8221.90it/s][A
 16%|█▌        | 2448/15663 [00:00<00:01, 8173.68it/s][A
 21%|██        | 3245/15663 [00:00<00:01, 8107.18it/s][A
 26%|██▌       | 4072/15663 [00:00<00:01, 8154.41it/s][A
 31%|███       | 4867/15663 [00:00<00:01, 8089.25it/s][A
 36%|███▌      | 5668/15663 [00:00<00:01, 8064.85it/s][A
 41%|████▏     | 6470/15663 [00:00<00:01, 8049.27it/s][A
 46%|████▋     | 7279/15663 [00:00<00:01, 8060.50it/s][A
 52%|█████▏    | 8078/15663 [00:01<00:00, 8035.80it/s][A
 57%|█████▋    | 8881/15663 [00:01<00:00, 8032.20it/s][A
 62%|██████▏   | 9705/15663 [00:01<00:00, 8091.84it/s][A
 67%|██████▋   | 10530/15663 [00:01<00:00, 8137.71it/s][A
 72%|███████▏  | 11355/15663 [00:01<00:00, 8168.82it/s][A
 78%|███████▊  | 12167/15663 [00:01<00:00, 8139.10it/s][A
 83%|████████▎ | 12986/15663 [00:01<00:00, 8152.50it/s][A
 88%|████████▊ | 13806/

In [37]:
print(xtrain_glove[0].shape)
xtrain_glove[0]

(300,)


array([  2.51582749e-02,   4.29424755e-02,   1.54502979e-02,
         4.84528877e-02,   5.63728102e-02,   2.76009645e-02,
        -4.00871009e-01,   6.71796575e-02,   6.37154654e-02,
        -5.15156165e-02,   6.75900728e-02,   5.24606556e-02,
         8.56527884e-04,  -2.31491756e-02,  -3.89017612e-02,
        -3.56453471e-02,  -7.19362823e-03,  -2.27122381e-02,
         4.38692011e-02,  -5.78821450e-02,   7.53861442e-02,
        -1.30662546e-02,  -1.70234814e-02,   6.21875264e-02,
        -3.61096151e-02,  -3.16550862e-03,  -1.42168701e-02,
         2.93877721e-03,   1.67832579e-02,   1.61787458e-02,
         5.92754148e-02,  -2.18988955e-02,  -1.24992607e-02,
        -5.41997477e-02,   6.21770658e-02,   1.21010169e-02,
         1.08940732e-02,  -5.49524836e-03,   5.28204739e-02,
        -1.35870427e-02,  -5.07906377e-02,   8.31716433e-02,
        -5.08788303e-02,  -8.84623360e-03,  -3.55195813e-02,
        -3.59665118e-02,   6.35254290e-03,   8.95094406e-03,
         1.03001133e-01,

In [38]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [44]:
## Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_glove, y_train)
preds_proba = clf.predict_proba(xvalid_glove)

In [43]:
print ("logloss: %0.3f " % log_loss(true_label, preds_proba))

logloss: 1.008 


In [39]:
## Testing XGBoost with Glove Features
clf = xgb.XGBClassifier(max_depth=5, n_estimators=1000, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
eval_set = [(xvalid_glove,y_test)]
clf.fit(xtrain_glove, y_train, eval_metric = "mlogloss", eval_set=eval_set, verbose=True)
preds_proba = clf.predict_proba(xvalid_glove)
preds = clf.predict(xvalid_glove)

[0]	validation_0-mlogloss:1.09052
[1]	validation_0-mlogloss:1.08306
[2]	validation_0-mlogloss:1.07592
[3]	validation_0-mlogloss:1.06999
[4]	validation_0-mlogloss:1.06424
[5]	validation_0-mlogloss:1.05924
[6]	validation_0-mlogloss:1.05485
[7]	validation_0-mlogloss:1.05068
[8]	validation_0-mlogloss:1.04716
[9]	validation_0-mlogloss:1.04395
[10]	validation_0-mlogloss:1.04112
[11]	validation_0-mlogloss:1.03832
[12]	validation_0-mlogloss:1.03531
[13]	validation_0-mlogloss:1.03275
[14]	validation_0-mlogloss:1.0311
[15]	validation_0-mlogloss:1.02914
[16]	validation_0-mlogloss:1.0271
[17]	validation_0-mlogloss:1.02612
[18]	validation_0-mlogloss:1.02464
[19]	validation_0-mlogloss:1.02296
[20]	validation_0-mlogloss:1.02128
[21]	validation_0-mlogloss:1.0197
[22]	validation_0-mlogloss:1.01879
[23]	validation_0-mlogloss:1.01755
[24]	validation_0-mlogloss:1.01662
[25]	validation_0-mlogloss:1.01585
[26]	validation_0-mlogloss:1.01506
[27]	validation_0-mlogloss:1.0141
[28]	validation_0-mlogloss:1.01399

KeyboardInterrupt: 