In [1]:
import pandas as pd
import numpy as np
import gensim 

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn import preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D


from tqdm import tqdm

# Fetch the NLTK resources used below, then build the English stop-word set.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
stops = set(stopwords.words('english'))

Using TensorFlow backend.
  return f(*args, **kwds)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rsilveira79/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/rsilveira79/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Load the labelled training data, report the row count and preview the last rows.
train = pd.read_csv('train.csv')
print(train.shape[0])
train.tail()

19579


Unnamed: 0,id,text,author
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP
19578,id00393,"He laid a gnarled claw on my shoulder, and it ...",HPL


In [3]:
# Load the unlabelled test data, report the row count and preview the last rows.
test = pd.read_csv('test.csv')
print(test.shape[0])
test.tail()

8392


Unnamed: 0,id,text
8387,id11749,All this is now the fitter for my purpose.
8388,id10526,I fixed myself on a wide solitude.
8389,id13477,It is easily understood that what might improv...
8390,id13761,"Be this as it may, I now began to feel the ins..."
8391,id04282,"Long winded, statistical, and drearily genealo..."


In [4]:
# Encode the author initials as integer class ids (fitted on the training labels).
label_enconder = preprocessing.LabelEncoder().fit(train['author'])
train['label_encoded'] = label_enconder.transform(train['author'])
train.head()

Unnamed: 0,id,text,author,label_encoded
0,id26305,"This process, however, afforded me no means of...",EAP,0
1,id17569,It never once occurred to me that the fumbling...,HPL,1
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,0
3,id27763,How lovely is spring As we looked from Windsor...,MWS,2
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,1


## Preprocessing function

In [5]:
def transformText(text):
    """Normalise one raw sentence into a space-separated string of stemmed tokens.

    Steps, in order: lower-case; replace non-ASCII characters with spaces;
    collapse whitespace; drop English stop words; drop tokens shorter than
    three characters; strip punctuation and digits; collapse whitespace again;
    stem with gensim's ``stem_text``.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The cleaned, stemmed sentence.
    """
    strip_ws = gensim.parsing.preprocessing.strip_multiple_whitespaces

    # Convert text to lower
    text = text.lower()
    # Removing non ASCII chars
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # Strip multiple whitespaces
    text = strip_ws(text)

    # Removing all the stopwords. The notebook-level `stops` set (built in the
    # imports cell) is reused instead of re-reading the NLTK corpus on every
    # call; this function runs once per row.
    # NOTE(review): filtering happens before punctuation is stripped, so a
    # stop word glued to punctuation (e.g. "it,") is kept — confirm intended.
    filtered_words = [word for word in text.split() if word not in stops]

    # Removing all the tokens with lesser than 3 characters. Written inline
    # rather than via gensim.corpora.textcorpus.remove_short, which is not
    # available under that name in every gensim release.
    filtered_words = [word for word in filtered_words if len(word) >= 3]

    # Preprocessed text after stop words removal
    text = " ".join(filtered_words)

    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)

    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)

    # Strip multiple whitespaces
    text = strip_ws(text)

    # Stemming
    text = gensim.parsing.preprocessing.stem_text(text)
    return text

## Creating the preprocessed-text column on the train and test sets

In [6]:
# Run every training sentence through the preprocessing function.
train['text_processed'] = train['text'].apply(transformText)
train.tail()

Unnamed: 0,id,text,author,label_encoded,text_processed
19574,id17718,"I could have fancied, while I looked at it, th...",EAP,0,could fanci look it emin landscap painter buil...
19575,id08973,The lids clenched themselves together as if in...,EAP,0,lid clench togeth spasm
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP,0,mai faut agir sai frenchman never faint outright
19577,id17513,"For an item of news like this, it strikes us i...",EAP,0,item new like thi strike coolli receiv
19578,id00393,"He laid a gnarled claw on my shoulder, and it ...",HPL,1,laid gnarl claw shoulder seem shake altogeth m...


In [7]:
# Run every test sentence through the same preprocessing function.
test['text_processed'] = test['text'].apply(transformText)
print(test.shape[0])
test.tail()

8392


Unnamed: 0,id,text,text_processed
8387,id11749,All this is now the fitter for my purpose.,fitter purpos
8388,id10526,I fixed myself on a wide solitude.,fix wide solitud
8389,id13477,It is easily understood that what might improv...,easili understood might improv close scrutin d...
8390,id13761,"Be this as it may, I now began to feel the ins...",mai began feel inspir burn hope length nurtur ...
8391,id04282,"Long winded, statistical, and drearily genealo...",long wind statist drearili genealog matter wa ...


## Train/validation split

In [8]:
# Hold out 20% of the labelled data for validation (fixed seed for repeatability).
x_train, x_valid, y_train, y_valid = train_test_split(
    train['text_processed'],
    train['label_encoded'],
    test_size=0.2,
    random_state=4,
)
true_label = np.array(y_valid)

banner = "#" * 20
n_train, n_valid = x_train.shape[0], x_valid.shape[0]
n_classes = len(y_train.unique())
print(banner + " Some stats " + banner)
print("Dataset training: {} uterances".format(n_train))
print("Dataset testing: {} uterances".format(n_valid))
print("Different classes: {}".format(n_classes))

#################### Some stats ####################
Dataset training: 15663 uterances
Dataset testing: 3916 uterances
Different classes: 3


In [9]:
## Loading Glove vectors
# Each line is "<token> <v1> <v2> ..."; build a token -> float32 vector lookup.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
# The explicit encoding avoids depending on the platform default
# (assumes the GloVe dump is UTF-8 — TODO confirm for this file).
with open('../../vectors/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

1917494it [01:28, 21562.11it/s]

Found 1917494 word vectors.





In [10]:
# Spot-check one embedding: print its dimensionality and show the first 50 values.
word = 'awesome'
vector = embeddings_index[word]
print(vector.shape)
vector[:50]

(300,)


array([-0.080353  , -0.24218   , -0.26267001, -0.33482999,  0.27803999,
       -0.14549001, -2.7026999 , -0.75669998,  0.28336   , -0.72943997,
        0.21162   , -0.52950001,  0.31413001, -0.063769  , -0.1401    ,
       -0.15177999, -0.44237   , -0.48963001, -0.109     ,  0.20123   ,
        0.56939   , -0.13128   ,  0.37606001,  0.29006001,  0.10225   ,
        0.071932  ,  0.15706   ,  0.33153999, -0.012658  ,  0.73517001,
        0.30506   ,  0.16429999, -0.021664  ,  0.29159001, -0.10311   ,
       -0.46601   ,  0.10972   , -0.26945001,  0.37584001,  0.12056   ,
       -0.087698  , -0.051568  ,  0.20632   , -0.027614  ,  0.40970999,
        0.14663   ,  0.15049   , -0.44549   , -0.17702   ,  0.11271   ], dtype=float32)

In [11]:
# Display the notebook-level English stop-word set built in the first cell.
stops

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 'd',
 'did',
 'didn',
 'do',
 'does',
 'doesn',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 'has',
 'hasn',
 'have',
 'haven',
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 'it',
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 'more',
 'most',
 'mustn',
 'my',
 'myself',
 'needn',
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 'she',
 'should',
 'shouldn',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 'the',
 'their',
 'theirs',
 'them',
 

In [12]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s, dim=300):
    """Return an L2-normalised sum of the word embeddings of sentence ``s``.

    The sentence is lower-cased and tokenised with ``word_tokenize``. Tokens
    that are stop words, are not purely alphabetic, or have no entry in
    ``embeddings_index`` are ignored.

    Parameters
    ----------
    s : object
        Sentence; converted with ``str()`` before processing.
    dim : int, optional
        Length of the zero vector returned when nothing can be embedded.
        Defaults to 300.

    Returns
    -------
    numpy.ndarray
        The normalised sum of the matching embeddings, or ``np.zeros(dim)``
        when no token has an embedding or the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except`, so unrelated
    # errors are no longer silently swallowed.
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w not in stops and w.isalpha() and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(dim)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid a divide-by-zero that would fill the result with NaN.
        return np.zeros(dim)
    return v / norm

In [13]:
## Create sentence vectors for the dataset
# One vector per sentence, with a progress bar for each split.
xtrain_glove = list(map(sent2vec, tqdm(x_train)))
xvalid_glove = list(map(sent2vec, tqdm(x_valid)))

100%|██████████| 15663/15663 [00:01<00:00, 8992.82it/s]
100%|██████████| 3916/3916 [00:00<00:00, 8888.38it/s]


In [14]:
# Convert the lists of per-sentence vectors into NumPy arrays.
xtrain_glove, xvalid_glove = (
    np.array(vectors) for vectors in (xtrain_glove, xvalid_glove)
)

In [15]:
# Sanity check: one row per training sentence, one column per embedding dimension.
xtrain_glove.shape

(15663, 300)

In [16]:
# Sanity check: one row per validation sentence, one column per embedding dimension.
xvalid_glove.shape

(3916, 300)

In [17]:
# scale the data before any neural net:
# statistics come from the training split only and are reused for validation.
scl = preprocessing.StandardScaler().fit(xtrain_glove)
xtrain_glove_scl = scl.transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [18]:
# we need to binarize the labels for the neural net
# (integer class ids -> one-hot rows, for both splits)
ytrain_enc, yvalid_enc = map(np_utils.to_categorical, (y_train, y_valid))

## Tokenize text data

In [19]:
# Fixed sequence length fed to the network.
max_len = 70

# Build the vocabulary from both the training and validation sentences.
token = text.Tokenizer(num_words=None)
token.fit_on_texts([*x_train, *x_valid])
word_index = token.word_index

# Sentences -> integer id sequences
xtrain_seq = token.texts_to_sequences(x_train)
xvalid_seq = token.texts_to_sequences(x_valid)

# zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

In [20]:
# create an embedding matrix for the words we have in the dataset
# Row i holds the GloVe vector of the word with tokenizer id i; words without
# a GloVe entry (and the unused row 0) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for vocab_word, row in tqdm(word_index.items()):
    if vocab_word in embeddings_index:
        embedding_matrix[row] = embeddings_index[vocab_word]

100%|██████████| 15565/15565 [00:00<00:00, 709681.84it/s]


In [None]:
# GRU with glove embeddings and two dense layers
model = Sequential([
    # Frozen embedding layer initialised from the GloVe matrix.
    Embedding(len(word_index) + 1,
              300,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    SpatialDropout1D(0.3),
    # Two stacked recurrent layers; the first returns the full sequence.
    GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
    GRU(300, dropout=0.3, recurrent_dropout=0.3),
    # Dense head with heavy dropout.
    Dense(1024, activation='relu'),
    Dropout(0.8),
    Dense(1024, activation='relu'),
    Dropout(0.8),
    # Three-way output.
    Dense(3),
    Activation('softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    verbose=0,
    mode='auto',
)
model.fit(
    xtrain_pad,
    y=ytrain_enc,
    batch_size=512,
    epochs=100,
    verbose=1,
    validation_data=(xvalid_pad, yvalid_enc),
    callbacks=[earlystop],
)

Train on 15663 samples, validate on 3916 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

In [None]:
# Class probabilities for the validation split. `predict` already returns the
# softmax output of this model, so the deprecated `predict_proba` alias is not needed.
preds_proba = model.predict(xvalid_pad)
print("Log-loss: {0:.3f}".format(log_loss(true_label, preds_proba)))

## Generating submissions

In [None]:
# Empty submission frame: the id followed by one probability column per author.
# The columns are given as a list; a set has no defined order and is rejected
# by recent pandas versions.
my_sub = pd.DataFrame(columns=['id', 'EAP', 'HPL', 'MWS'])
my_sub

In [None]:
# Use the preprocessed column: the tokenizer and the model were fitted on
# `text_processed`, so the raw `text` column would not match the stemmed,
# stop-word-free vocabulary they were built on.
x_test = test['text_processed']
xtest_seq = token.texts_to_sequences(x_test)
xtest_pad = sequence.pad_sequences(xtest_seq, maxlen=max_len)
# `predict` returns the softmax probabilities directly.
preds_proba = model.predict(xtest_pad)

In [None]:
# Spot-check the predicted class probabilities for the third row.
preds_proba[2]

In [None]:
# Build the submission in one shot instead of appending rows one at a time
# with `.loc`, which gets slower as the frame grows.
# Probability columns 0, 1, 2 correspond to the encoded labels EAP, HPL, MWS.
my_sub = pd.DataFrame(preds_proba, columns=['EAP', 'HPL', 'MWS'])
my_sub.insert(0, 'id', test['id'].values)

In [None]:
# Write the submission to disk without the DataFrame index.
my_sub.to_csv(path_or_buf='roberto_new_2.csv', index=False)

In [None]:
# Write the same submission frame under a second file name, again without the index.
my_sub.to_csv(path_or_buf='roberto_new.csv', index=False)