In [149]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer , text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [150]:
NEGATIVE_LABEL = '__label__1'
def preprocess_pd(file, nrows):
    """Load a labelled review file into a tokenized DataFrame.

    Each input line is expected to start with a 10-character label
    (compared against ``NEGATIVE_LABEL``), followed by one separator
    character and then the review text.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``
    nrows : int or None
        Maximum number of lines to read.

    Returns
    -------
    pandas.DataFrame with columns
        ``Y``   -- 0 when the label equals ``NEGATIVE_LABEL``, otherwise 1
        ``X``   -- list of tokens produced by ``text_to_word_sequence``
        ``len`` -- number of tokens in ``X``
    """
    # NOTE(review): newer pandas releases may reject '\n' as a delimiter, and
    # read_csv's default quote handling can alter lines containing '"' --
    # confirm against the pandas version this notebook is run with.
    df = pd.read_csv(file, header=None, delimiter='\n', nrows=nrows)
    # split label and data
    df['Y'] = df[0].str[0:10]
    df['X'] = df[0].str[11:]
    # drop the unsplit column (no inplace mutation needed on a local frame)
    df = df.drop(0, axis=1)
    # make labels binary: NEGATIVE_LABEL -> 0, anything else -> 1
    df['Y'] = (df['Y'] != NEGATIVE_LABEL).astype('int64')
    # tokenize
    df['X'] = df['X'].apply(text_to_word_sequence)
    # token count per row; a single vectorised map instead of a Python-level
    # iterrows() loop, which builds a Series object for every row
    df['len'] = df['X'].map(len)

    return df

In [151]:
# Load and tokenize at most the first 100000 lines of the training file.
df = preprocess_pd('train.ft.txt', 100000)
# Largest per-row token count in the training data; used by later cells as the
# padding length for both the training and the test sequences.
MAX_SEQUENCE_LENGTH = max(df.len)

In [136]:
# all sentences flat (generator)
def data_flat_generator(data):
    """Return a generator over every token of every sentence in ``data``.

    ``data`` is an iterable of token sequences; tokens are produced lazily,
    sentence by sentence, in their original order.
    """
    return (token for sentence in data for token in sentence)

def all_texts_generator():
    """Return a generator over the entries of column ``X`` of the global ``df``."""
    return (tokens for tokens in df['X'])

def get_word_counts():
    """Count how often each token occurs across column ``X`` of the global ``df``.

    Returns a ``collections.Counter`` mapping token -> number of occurrences.
    """
    token_stream = data_flat_generator(df['X'])
    return Counter(token_stream)

In [18]:
# Token -> occurrence count over the training texts.
# NOTE(review): `count` is not referenced by any other visible cell.
count = get_word_counts()

In [27]:
# Display the tokenized training texts; each entry is a list of tokens.
df.X

0        [stuning, even, for, the, non, gamer, this, so...
1        [the, best, soundtrack, ever, to, anything, i'...
2        [amazing, this, soundtrack, is, my, favorite, ...
3        [excellent, soundtrack, i, truly, like, this, ...
4        [remember, pull, your, jaw, off, the, floor, a...
5        [an, absolute, masterpiece, i, am, quite, sure...
6        [buyer, beware, this, is, a, self, published, ...
7        [glorious, story, i, loved, whisper, of, the, ...
8        [a, five, star, book, i, just, finished, readi...
9        [whispers, of, the, wicked, saints, this, was,...
10       [the, worst, a, complete, waste, of, time, typ...
11       [great, book, this, was, a, great, book, i, ju...
12       [great, read, i, thought, this, book, was, bri...
13       [oh, please, i, guess, you, have, to, be, a, r...
14       [awful, beyond, belief, i, feel, i, have, to, ...
15       [don't, try, to, fool, us, with, fake, reviews...
16       [a, romantic, zen, baseball, comedy, when, you.

In [152]:
def prepare_test(file, nrows, word_index, MSL):
    """Build padded index sequences and labels for an evaluation file.

    The file is loaded with ``preprocess_pd``, its tokens are mapped to
    integer ids with ``apply_word2index`` using ``word_index``, and the
    resulting sequences are padded via ``pad_texts`` with ``MSL``.

    Prints the number of padded sequences and the number of labels, then
    returns ``(padded_sequences, labels)`` where ``labels`` is the ``Y``
    column of the loaded frame.
    """
    test_df = preprocess_pd(file, nrows)
    indexed = apply_word2index(test_df.X, word_index)
    padded = pad_texts(indexed, MSL)
    labels = test_df.Y
    print(len(padded), len(labels))
    return padded, labels
    
def apply_word2index(texts, word_index, oov_index=None):
    """Map every token of every text to its integer id without mutating ``word_index``.

    Parameters
    ----------
    texts : iterable of token sequences
    word_index : mapping token -> int
        May be a plain ``dict`` or a ``defaultdict``.
    oov_index : int, optional
        Id used for tokens missing from ``word_index``. When omitted, the
        value a ``defaultdict`` would have produced (``default_factory()``)
        is used, or 0 for mappings without a default factory.

    Returns
    -------
    list of lists of ids, one inner list per text, in input order.
    """
    if oov_index is None:
        factory = getattr(word_index, 'default_factory', None)
        oov_index = factory() if factory is not None else 0
    # `.get` never triggers default_factory, so unknown tokens are NOT inserted
    # into the vocabulary; `word_index[w]` on a defaultdict would add a key for
    # every unseen token and silently change len(word_index).
    lookup = word_index.get
    return [[lookup(token, oov_index) for token in sentence] for sentence in texts]
        

In [153]:
def generate_word_index(texts):
    """Fit a ``Tokenizer`` on ``texts`` and return its vocabulary mapping.

    The tokenizer's ``word_index`` is copied into a ``defaultdict(int)``, so
    subscripting the result with a word that is not in the vocabulary
    yields 0.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(texts)
    return defaultdict(int, fitted.word_index)

def pad_texts(indexed_texts, MSL):
    """Pad ``indexed_texts`` with ``pad_sequences``, using ``MSL`` as ``maxlen``."""
    return pad_sequences(indexed_texts, maxlen=MSL)

In [154]:
# Vocabulary (token -> integer id) fitted on the tokenized training texts.
word_index = generate_word_index(df.X)
# Training texts as integer id sequences, padded with MAX_SEQUENCE_LENGTH;
# this array is what the model is fitted on.
input_to_rnn = pad_texts(apply_word2index(df.X, word_index), MAX_SEQUENCE_LENGTH)

In [155]:
# Vocabulary size; the embedding matrix and the Embedding layer are both
# sized as len(word_index) + 1.
len(word_index)

121536

In [156]:
# Largest token count among the loaded training rows.
MAX_SEQUENCE_LENGTH

242

In [157]:
# Dimensionality of the word vectors; default width of the embedding matrix
# and the value passed to the model as `input_dim`.
EMB_SIZE = 100
# NOTE(review): `zeros` is not referenced by any other visible cell.
zeros = np.zeros((EMB_SIZE,))
def create_embeddings_index(file):
    """Parse a whitespace-separated word-vector text file.

    Each non-blank line is ``<word> <v1> <v2> ...``; the first field is the
    key and the remaining fields are converted to a ``float32`` array.

    Parameters
    ----------
    file : str or path-like
        Path of the UTF-8 encoded vector file.

    Returns
    -------
    dict mapping word -> 1-D ``numpy.ndarray`` of dtype ``float32``.
    A word that appears more than once keeps its last vector.
    """
    embeddings_index = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(file, encoding='UTF-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # blank line: there is no word field to index
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

In [158]:
# word -> float32 vector lookup parsed from the pretrained vector file.
# NOTE(review): the file name suggests 100-d vectors, matching EMB_SIZE -- confirm.
emb = create_embeddings_index('glove.6B.100d.txt')

In [159]:
def create_embeddings_matrix(word_index, emb_index, EMBEDDING_DIM=EMB_SIZE):
    """Build the weight matrix for an embedding layer.

    Parameters
    ----------
    word_index : mapping word -> integer row index
    emb_index : mapping word -> vector (looked up with ``.get``)
    EMBEDDING_DIM : int
        Number of columns of the matrix.

    Returns
    -------
    ``numpy.ndarray`` of shape ``(len(word_index) + 1, EMBEDDING_DIM)``.
    Row ``i`` holds the vector of the word mapped to ``i``; rows whose word
    has no entry in ``emb_index`` stay all-zero.
    """
    n_rows = len(word_index) + 1
    embedding_matrix = np.zeros((n_rows, EMBEDDING_DIM))
    for token, row in word_index.items():
        vector = emb_index.get(token)
        if vector is None:
            # no pretrained vector: the row keeps its initial zeros
            continue
        embedding_matrix[row] = vector
    return embedding_matrix


In [161]:
# Row i holds the pretrained vector of the word whose id is i (zeros if the
# word has no pretrained vector); passed to the model as its embedding weights.
embedding_matrix = create_embeddings_matrix(word_index, emb_index=emb)

In [79]:
# xtrain, xtest, ytrain, ytest = train_test_split(input_to_rnn, df.Y, test_size=0.2, random_state=42)

In [162]:
# Build the evaluation set (up to 400000 lines) with the training vocabulary
# and padding length. A copy of word_index is passed so the shared vocabulary
# cannot be modified while the test texts are indexed.
Xtest, Ytest = prepare_test('test.ft.txt', 400000, word_index.copy(), MAX_SEQUENCE_LENGTH)

400000 400000


In [164]:
def RNNCustom(word_index={}, embedding_weights=[],input_dim = 100,Hidden_Layer_Size= 64,
              timesteps=8, RLAYER=None, ACTIVATION="sigmoid",
              LOSS="binary_crossentropy",OPTIMIZER='nadam',METRICS=["accuracy"]):
    """Build and compile a single-recurrent-layer binary classifier.

    Architecture: frozen ``Embedding`` -> ``RLAYER(Hidden_Layer_Size)`` ->
    ``Dense(1)`` -> ``Activation(ACTIVATION)``.

    Parameters
    ----------
    word_index : mapping
        Only its length is used: the embedding has ``len(word_index) + 1`` rows.
    embedding_weights : array-like
        Initial embedding matrix, passed as ``weights=[embedding_weights]``.
    input_dim : int
        Width of each embedding vector (second ``Embedding`` argument).
    Hidden_Layer_Size : int
        Number of units of the recurrent layer.
    timesteps : int
        Passed to ``Embedding`` as ``input_length``.
    RLAYER : recurrent layer class, optional
        Defaults to ``LSTM`` when ``None``.
    ACTIVATION, LOSS, OPTIMIZER, METRICS
        Forwarded to the output ``Activation`` and to ``model.compile``.

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    # Import from the public `keras.layers` namespace rather than the internal
    # `keras.layers.core/embeddings/recurrent` submodules, and pull in only the
    # layers that are actually used.
    from keras.layers import Activation, Dense, Embedding, LSTM
    from keras.models import Sequential

    # The mutable defaults in the signature are only read, never modified.
    recurrent_cls = LSTM if RLAYER is None else RLAYER

    model = Sequential()
    model.add(Embedding(len(word_index) + 1,  # vocab size
                        input_dim,
                        weights=[embedding_weights],
                        input_length=timesteps,
                        trainable=False))  # embedding weights stay fixed during training
    model.add(recurrent_cls(Hidden_Layer_Size))
    model.add(Dense(1))
    model.add(Activation(ACTIVATION))
    model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)
    return model

In [165]:
# Build the classifier with the pretrained embedding matrix; all other
# hyper-parameters keep RNNCustom's defaults.
model = RNNCustom(word_index=word_index, embedding_weights=embedding_matrix, input_dim=EMB_SIZE, 
                  timesteps=MAX_SEQUENCE_LENGTH)

In [166]:
# Sanity check: the vocabulary size is the same as before the test set was prepared.
len(word_index)

121536

In [167]:
# Training hyper-parameters.
BATCH_SIZE = 64
EPOCHS = 4
# Fit on the padded training sequences and their binary labels.
model.fit(input_to_rnn, df.Y, batch_size=BATCH_SIZE, epochs=EPOCHS)
# Evaluate on the held-out test file; the displayed result has two values,
# presumably [loss, accuracy] given the compiled loss and metrics -- confirm.
model.evaluate(x=Xtest, y=Ytest)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


[0.2214288782683015, 0.9116525]