In [1]:
import pandas as pd
import numpy as np
import gensim 

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn import preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers import GlobalAveragePooling1D
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D


from tqdm import tqdm

# Fetch the NLTK resources this notebook relies on, then cache the English
# stop-word list as a set for fast membership tests.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
stops = set(stopwords.words('english'))

Using TensorFlow backend.


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Read the training CSV, report its row count and preview the last rows.
train = pd.read_csv('train.csv')
print(train.shape[0])
train.tail()

In [None]:
# Read the test CSV, report its row count and preview the last rows.
test = pd.read_csv('test.csv')
print(test.shape[0])
test.tail()

In [None]:
# Map each author string to an integer class id and store it next to the text.
label_enconder = preprocessing.LabelEncoder()
train['label_encoded'] = label_enconder.fit_transform(train['author'])
train.head()

## Preprocessing function

In [None]:
def transformText(text):
    """Normalise one raw sentence into a lower-cased, stemmed string.

    Steps, in order: lower-case, replace non-ASCII characters with a space,
    collapse whitespace, strip punctuation, strip digits, collapse whitespace
    again, and stem every token.

    Stop-word removal and short-token removal are intentionally NOT applied
    here (both were disabled in the original experiment), so the NLTK
    stop-word list is no longer rebuilt on every call.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The cleaned, stemmed sentence with tokens separated by single spaces.
    """
    # Convert text to lower
    text = text.lower()

    # Replace every non-ASCII character with a space
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # split/join collapses any run of whitespace to one space and trims the
    # ends, so no separate whitespace-stripping pass is needed at this point.
    text = " ".join(text.split())

    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)

    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)

    # Punctuation/digit removal can leave gaps; collapse them again
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)

    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)

## Creating the preprocessed-text column on the train and test sets

In [None]:
# Run the cleaning function over every training sentence.
train['text_processed'] = train['text'].apply(transformText)
train.tail()

In [None]:
# Run the same cleaning function over every test sentence.
test['text_processed'] = test['text'].apply(transformText)
print(test.shape[0])
test.tail()

## Train/validation split

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed), and keep the
# validation labels as a plain array for scoring later.
x_train, x_valid, y_train, y_valid = train_test_split(
    train['text_processed'],
    train['label_encoded'],
    test_size=0.2,
    random_state=4,
)
true_label = np.array(y_valid)

banner = "#" * 20
print(banner + " Some stats " + banner)
print("Dataset training: {} uterances".format(len(x_train)))
print("Dataset testing: {} uterances".format(len(x_valid)))
print("Different classes: {}".format(len(y_train.unique())))

In [None]:
## Loading Glove vectors
# Build a dict mapping each word to its float32 vector. Every line of the file
# is "<word> <v1> <v2> ...", split on whitespace.
embeddings_index = {}
# `with` guarantees the file is closed even if parsing fails part-way;
# the explicit encoding avoids depending on the machine's locale.
with open('../../vectors/glove.42B.300d.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Sanity check on the loaded embeddings: print the vector shape for one word
# and display its first 50 components.
word = 'awesome'
print(embeddings_index[word].shape)
embeddings_index[word][0:50]

In [None]:
# Display the English stop-word set that sent2vec filters tokens against.
stops

In [None]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    """Return a unit-length sentence vector built from word embeddings.

    The sentence is lower-cased and tokenised with ``word_tokenize``; stop
    words, non-alphabetic tokens and tokens missing from ``embeddings_index``
    are dropped. The remaining word vectors are summed and L2-normalised.

    Parameters
    ----------
    s : object
        Sentence; converted with ``str(s)`` before tokenising.

    Returns
    -------
    numpy.ndarray
        The normalised sum of the word vectors, or ``np.zeros(300)`` when no
        token has an embedding (or the summed vector has zero norm).
    """
    tokens = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except:` so that real errors
    # (e.g. a KeyboardInterrupt or a broken embeddings object) are not hidden.
    vectors = [
        embeddings_index[w]
        for w in tokens
        if w not in stops and w.isalpha() and w in embeddings_index
    ]
    if not vectors:
        return np.zeros(300)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # avoid dividing by zero and returning a vector of NaNs
        return np.zeros(300)
    return v / norm

In [None]:
## Create sentence vectors for the dataset
# One sent2vec vector per sentence, for the training and validation splits.
xtrain_glove = list(map(sent2vec, tqdm(x_train)))
xvalid_glove = list(map(sent2vec, tqdm(x_valid)))

In [None]:
# Stack the per-sentence vectors into arrays.
xtrain_glove, xvalid_glove = np.array(xtrain_glove), np.array(xvalid_glove)

In [None]:
# Show the dimensions of the training sentence-vector array.
xtrain_glove.shape

In [None]:
# Show the dimensions of the validation sentence-vector array.
xvalid_glove.shape

In [None]:
# scale the data before any neural net:
# the scaler is fitted on the training vectors only and then applied to both splits
scl = preprocessing.StandardScaler().fit(xtrain_glove)
xtrain_glove_scl = scl.transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)

In [None]:
# we need to binarize the labels for the neural net
# (integer class ids -> one-hot rows, for both splits)
ytrain_enc, yvalid_enc = (
    np_utils.to_categorical(labels) for labels in (y_train, y_valid)
)

## Tokenize text data

In [None]:
max_len = 256
token = text.Tokenizer(num_words=None)

# The vocabulary is built from the training and validation sentences together.
token.fit_on_texts(list(x_train) + list(x_valid))

# Integer-encode each split.
xtrain_seq, xvalid_seq = (
    token.texts_to_sequences(split) for split in (x_train, x_valid)
)

# zero pad the sequences
xtrain_pad, xvalid_pad = (
    sequence.pad_sequences(seqs, maxlen=max_len)
    for seqs in (xtrain_seq, xvalid_seq)
)

word_index = token.word_index

In [None]:
# create an embedding matrix for the words we have in the dataset
# Row i holds the GloVe vector of the word whose tokenizer index is i.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no pretrained vector for this word: its row stays all zeros
        continue
    embedding_matrix[i] = embedding_vector

In [None]:
# Disabled experiment: the whole model definition below sits inside a
# triple-quoted string, so none of it runs; the cell only evaluates to
# that string. It describes a two-layer GRU with two dense layers, trained with
# early stopping on val_loss.
'''
# GRU with glove embeddings and two dense layers
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Fit the model with early stopping callback
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, 
          verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])'''

In [None]:
# Bidirectional LSTM on top of frozen GloVe embeddings, followed by one
# dense hidden layer and a 3-way softmax.
input_dim = max_len
hidden = 300

model = Sequential([
    # Pretrained vectors are loaded as weights and not updated during training.
    Embedding(len(word_index) + 1,
              300,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    SpatialDropout1D(0.5),
    Bidirectional(LSTM(300, dropout=0.5, recurrent_dropout=0.5)),
    Dense(1024, activation='relu'),
    Dropout(0.7),
    Dense(3),
    Activation('softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# No early-stopping callback is passed here, so training runs for all epochs.
model.fit(
    xtrain_pad,
    y=ytrain_enc,
    batch_size=16,
    epochs=500,
    verbose=1,
    validation_data=(xvalid_pad, yvalid_enc),
)

## Some stats
### Architecture 1 
- <b> val_loss = 0.6162 @ 20 epochs </b>   
model.add(Dropout(0.5))  
model.add(LSTM(1024))  
model.add(Dropout(0.5))  
model.add(Dense(3))  
### Architecture 2  
- <b> val_loss = 0.5966 @ 10 epochs, 0.5471 @ 20 epochs, 0.5524 @ 40 epochs </b>  
model.add(Conv1D(filters=300, kernel_size=3, padding='same', activation='relu'))  
model.add(MaxPooling1D(pool_size=2))  
model.add(Dropout(0.8))  
model.add(LSTM(600))  
model.add(Dropout(0.8))  
model.add(Dense(3))  

### Architecture 3  - batch size 512
- <b> val_loss = 0.5734 @ 10 epochs, 0.5456 @ 20 epochs, (not recorded) @ 40 epochs </b>  
model.add(Conv1D(filters=150, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))  
model.add(Dropout(0.7))  
model.add(LSTM(300))  
model.add(Dropout(0.7))  
model.add(Dense(3))  
model.add(Activation('softmax'))  
model.compile(loss='categorical_crossentropy', optimizer='adam')  

### Architecture 4  - batch size 512
- <b> val_loss = 0.7962 @ 10 epochs, 0.6971 @ 20 epochs, 0.5995 @ 40 epochs </b>  
model.add(SpatialDropout1D(0.5))  
model.add(Bidirectional(LSTM(300, dropout=0.5, recurrent_dropout=0.5)))  
model.add(Dense(1024, activation='relu'))  
model.add(Dropout(0.7))  
model.add(Dense(1024, activation='relu'))  
model.add(Dropout(0.7))  
model.add(Dense(3))  
model.add(Activation('softmax'))  
model.compile(loss='categorical_crossentropy', optimizer='adam')  

In [None]:
# Score the model on the held-out validation split.
preds_proba = model.predict_proba(xvalid_pad)
valid_log_loss = log_loss(true_label, preds_proba)
print("Log-loss: {0:.3f}".format(valid_log_loss))

## Generating submissions

In [None]:
# Empty submission frame. The columns are given as an ordered list: a set has
# no defined order and is rejected as `columns` by recent pandas versions.
my_sub = pd.DataFrame(columns=['id', 'EAP', 'HPL', 'MWS'])
my_sub

In [None]:
# Predict on the test set. The tokenizer and the model were fitted on the
# cleaned/stemmed `text_processed` column, so the test sentences must go
# through the same column rather than the raw `text`.
x_test = test['text_processed']
xtest_seq = token.texts_to_sequences(x_test)
xtest_pad = sequence.pad_sequences(xtest_seq, maxlen=max_len)
preds_proba = model.predict_proba(xtest_pad)

In [None]:
# Spot-check: display the predicted class probabilities for the third test row.
preds_proba[2]

In [None]:
# Build the submission in one step instead of enlarging the frame row by row
# with .loc, which copies the frame on each insertion.
# The probability columns are taken in the order 0, 1, 2 of preds_proba.
my_sub = pd.DataFrame(preds_proba, columns=['EAP', 'HPL', 'MWS'])
my_sub.insert(0, 'id', test['id'].values)

In [None]:
# Write the submission without the index column.
my_sub.to_csv('roberto_new_2.csv', index=False)

In [None]:
# Write the same submission frame under a second file name, without the index column.
my_sub.to_csv('roberto_new.csv', index=False)