In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import xgboost as xgb
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# English stop words, stored as a set because they are only used for
# membership tests (`w in stop_words`), which are constant-time on a set.
stop_words = set(stopwords.words('english'))

In [2]:
# Read the training set, the test set and the sample submission (in that order)
train, test, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Sanity-check the shapes of the three frames, then preview the first training rows
print(train.shape,test.shape,sample.shape)
train.head()

(19579, 3) (8392, 2) (8392, 4)


Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [4]:
# The target (`author`) is a string label; LabelEncoder maps each distinct
# author to an integer class code (this is not a binary / one-hot encoding).
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.author.values)

In [5]:
# Inspect the encoded labels
print(y)

[0 1 0 ... 0 0 1]


In [6]:
# Hold out 10% of the rows for validation; the split is shuffled,
# stratified on the encoded author label and seeded for repeatability.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train.text.values,
    y,
    test_size=0.1,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [7]:
# Confirm the sizes of the training and validation splits
print(xtrain.shape,xvalid.shape)

(17621,) (1958,)


In [8]:
# Directory that holds the pickled pre-trained GloVe word vectors.
# Set the GLOVE_DIR environment variable to point at your own copy; when it is
# not set, the original author's local directory is used.
path = os.path.join(
    os.environ.get(
        'GLOVE_DIR',
        '/media/raghava/506A91BB6A919DF2/software materials/PiRuby/trans scripts/models/',
    ),
    '',  # forces a trailing separator, since file names are appended with `+`
)

def loadGloveModel(gloveFiles):
    """Merge several pickled word -> vector mappings into a single dictionary.

    :param gloveFiles: iterable of paths to pickle files; every file must
        unpickle to something ``dict.update`` accepts.
    :return: one dict with the entries of all files; when a key occurs in more
        than one file, the value from the later file wins.

    NOTE: ``pickle.load`` can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    vectors = {}
    for pickle_path in gloveFiles:
        with open(pickle_path, 'rb') as handle:
            chunk = pickle.load(handle)
        vectors.update(chunk)
    print ("Done.",len(vectors)," words loaded!")
    return vectors

# The 400,000-word GloVe vocabulary was split into four pickle files because of
# a processing issue; adapt this list if your vectors are stored differently.
gloveFiles = [
    path + chunk_name
    for chunk_name in (
        'glove100000.pickle',
        'glove200000.pickle',
        'glove300000.pickle',
        'glove400000.pickle',
    )
]

# Word -> vector dictionary merged from all four chunks
model = loadGloveModel (gloveFiles)
def sent2vec(s, dim=300):
    """Convert one sentence into a unit-length vector built from GloVe vectors.

    The text is lower-cased and tokenized with ``word_tokenize``; stop words
    and non-alphabetic tokens are dropped. The vectors of the remaining words
    that exist in the global ``model`` mapping are summed, and the sum is
    divided by its Euclidean norm.

    :param s: the sentence; it is passed through ``str`` before processing.
    :param dim: length of the all-zero vector returned when no token has an
        embedding. Defaults to 300, the value previously hard-coded here.
    :return: 1-D numpy array; all zeros when nothing could be embedded or when
        the summed vector has zero norm.
    """
    tokens = word_tokenize(str(s).lower())

    vectors = []
    for w in tokens:
        if w in stop_words or not w.isalpha():
            continue
        # Look the word up explicitly instead of hiding every exception behind
        # a bare `except:`. If `model` is no longer the word-vector dictionary
        # (the name is reused for the Keras network further down), this now
        # fails loudly instead of silently producing all-zero features.
        vec = model.get(w)
        if vec is not None:
            vectors.append(vec)

    if not vectors:
        return np.zeros(dim)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Avoid dividing by zero, which would fill the result with NaN.
        return np.zeros_like(v, dtype=float)
    return v / norm

Done. 400000  words loaded!


In [9]:
# Convert every training sentence into its GloVe sentence vector
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]

100%|██████████| 17621/17621 [00:10<00:00, 1641.21it/s]


In [10]:
# Convert every validation sentence into its GloVe sentence vector
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]

100%|██████████| 1958/1958 [00:01<00:00, 1467.80it/s]


In [11]:
# Turn the lists of sentence vectors into numpy arrays
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)
# This is a multi-class prediction problem, so models are scored with
# multi-class log loss (adapted from a popular Kaggle code snippet).
def multiclass_logloss(actual, predicted, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.

    :param actual: array-like with the true classes, either a 1-D sequence of
        integer class indices or an already one-hot encoded 2-D array
    :param predicted: array-like matrix with class predictions, one row per
        sample and one probability per class
    :param eps: probabilities are clipped to ``[eps, 1 - eps]`` before the
        logarithm is taken, so a probability of exactly 0 stays finite
    :return: the mean negative log-likelihood over all rows
    """
    # Accept lists / Series as well as ndarrays.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    # Convert 'actual' to a one-hot array if it holds class indices:
    if actual.ndim == 1:
        one_hot = np.zeros((actual.shape[0], predicted.shape[1]))
        one_hot[np.arange(actual.shape[0]), actual] = 1
        actual = one_hot

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [38]:
# Baseline: an XGBoost classifier trained on the GloVe sentence vectors
# NOTE(review): `nthread` and `silent` may be deprecated or removed in newer
# xgboost releases - confirm against the installed version.
clf = xgb.XGBClassifier(nthread=10,silent=False)
clf.fit(xtrain_glove,ytrain)
# Per-class probabilities for the validation set
predictions = clf.predict_proba(xvalid_glove)

print("logloss: %0.3f " % multiclass_logloss(yvalid,predictions))

logloss: 0.852 


In [12]:
from keras.preprocessing import sequence,text
# The Keras Tokenizer builds a word -> integer index dictionary from the texts
# it is fitted on (presumably num_words=None keeps the whole vocabulary - confirm).
token = text.Tokenizer(num_words=None)
# Length (in tokens) that every sequence is padded to
max_len = 70
# Fit on the training and validation texts together so the words of both
# splits get an index, then replace each text by its list of word indices
token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

# Zero-pad the sequences to max_len
# NOTE(review): sequences longer than max_len are presumably truncated by
# pad_sequences - confirm against the Keras documentation.
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

# word -> index mapping learned by the tokenizer
word_index = token.word_index

Using TensorFlow backend.


In [14]:
# binarize the labels for the neural net

from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
# Encode the integer labels with to_categorical; these are the targets
# passed to model.fit for the neural network
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)
# Number of distinct words known to the tokenizer
len(word_index)

25943

In [24]:
# Build the embedding matrix handed to the Keras Embedding layer: row i holds
# the GloVe vector of the word whose tokenizer index is i. Words without a
# GloVe vector keep an all-zero row.
# The matrix has one more row than there are words - presumably because the
# tokenizer indices start at 1 and index 0 is the padding value; confirm.
# NOTE(review): 300 must equal the length of the stored GloVe vectors.
# NOTE(review): `model` must still be the GloVe dictionary when this runs; the
# name is rebound to the Keras network in a later cell.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = model.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|██████████| 25943/25943 [00:00<00:00, 172999.14it/s]


In [30]:
# Quick check of the number of matrix rows and of the container types
print(len(embedding_matrix),type(word_index))
print(type(embedding_matrix))

25944 <class 'dict'>
<class 'numpy.ndarray'>


In [31]:
# A simple LSTM with glove embeddings and two dense layers
# NOTE(review): this rebinds `model`, which until now referred to the GloVe
# word-vector dictionary used by sent2vec and by the embedding-matrix cell.
# Re-running those earlier cells after this one will not see the dictionary.
model = Sequential()
# Embedding layer initialised with the GloVe matrix and kept frozen
model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))
model.add(SpatialDropout1D(0.3))
model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))

# Two fully connected blocks, each followed by dropout
model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

model.add(Dense(1024, activation='relu'))
model.add(Dropout(0.8))

# Three output units with a softmax activation
model.add(Dense(3))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train for 100 epochs with a batch size of 512, evaluating on the validation split
model.fit(xtrain_pad, y=ytrain_enc, batch_size=512, epochs=100, verbose=1, validation_data=(xvalid_pad, yvalid_enc))

Train on 17621 samples, validate on 1958 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100