This tiny notebook illustrates a simple NLP task with a neural network model using keras

In [1]:
import numpy as np
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding


Using TensorFlow backend.


In [2]:
docs =['Well done!','Good work','Great effort','nice work','Excellent!','Weak','Poor effort!','not good','poor work','Could have doone better.']
labels= np.array([1,1,1,1,1,0,0,0,0,0])

In [4]:
# encoding docs
vocab_size = 50
encoded_docs = [one_hot(d,vocab_size) for d in docs]
print(encoded_docs)

[[18, 8], [41, 34], [17, 5], [22, 34], [4], [1], [26, 5], [19, 41], [26, 34], [32, 27, 47, 20]]


In [5]:
one_hot('Well done!',50)

[18, 8]

In [6]:
one_hot('Well',50)

[18]

In [7]:
# pad documents to a max length of 4 words
max_length =4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length,padding='post')
print(padded_docs)

[[18  8  0  0]
 [41 34  0  0]
 [17  5  0  0]
 [22 34  0  0]
 [ 4  0  0  0]
 [ 1  0  0  0]
 [26  5  0  0]
 [19 41  0  0]
 [26 34  0  0]
 [32 27 47 20]]


## Create model

In [9]:
def create_model():
    model = Sequential()
    model.add(Embedding(vocab_size,8,input_length=max_length))
    model.add(Flatten())
    model.add(Dense(1,activation='sigmoid'))
    # compile
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
    model.summary()
    model.fit(padded_docs, labels,epochs=50,verbose=0)
    return model

model = create_model()
loss, accuracy = model.evaluate(padded_docs, labels,verbose=0)
print('Accuracy: %f' % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten_2 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
Accuracy: 80.000001


## Study case: Using pre_trained Glove embedding   
One way is to use pre-trained Glove which can get from [download]('https://nlp.stanford.edu/projects/glove/'). Here we use smallest package glove.6B.zip

In [12]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Flatten, Embedding

In [13]:
docs =['Well done!','Good work','Great effort','nice work','Excellent!','Weak','Poor effort!','not good','poor work','Could have doone better.']
labels= np.array([1,1,1,1,1,0,0,0,0,0])

## Processing text

In [14]:
t =Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) +1
# encoding the documents
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

[[5, 6], [2, 1], [7, 3], [8, 1], [9], [10], [4, 3], [11, 2], [4, 1], [12, 13, 14, 15]]


In [15]:
# pad document
max_length =4
padded_docs = pad_sequences(encoded_docs, maxlen =max_length,padding='post')
print(padded_docs)

[[ 5  6  0  0]
 [ 2  1  0  0]
 [ 7  3  0  0]
 [ 8  1  0  0]
 [ 9  0  0  0]
 [10  0  0  0]
 [ 4  3  0  0]
 [11  2  0  0]
 [ 4  1  0  0]
 [12 13 14 15]]


## load pretrainded

In [20]:
embeddings_index= dict()
f =open('/home/tri/Downloads/MLdatasets/glove.6B.100d.txt',mode='rt',encoding='utf-8')
for line in f:
    values =line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embeddings_index[word] =coefs
f.close()
print('Loaded %s word vectors' % len(embeddings_index))

Loaded 400000 word vectors


In [21]:
embeddings_index

{'fey': array([-0.55002999, -0.1327    , -0.025465  , -0.79474002, -0.08995   ,
         0.26159999, -0.058364  ,  0.73098999,  0.3786    , -0.2904    ,
        -0.22504   ,  0.20589   , -0.027185  , -0.21832   ,  0.28801   ,
        -0.52539003,  0.057123  , -0.28760001,  0.20366   ,  1.17569995,
         0.3339    , -0.28226   , -0.24835999,  0.90999001,  0.42262   ,
         0.18316001,  0.40401   ,  0.47207999, -0.16365001, -0.091326  ,
        -0.22606   ,  0.31224   ,  0.35675001, -0.24992   ,  0.038945  ,
        -0.46786001,  0.2902    ,  0.39409   , -0.63576001, -0.62624002,
        -0.38431999,  0.36482999, -0.63823003,  0.54654002, -0.65831   ,
         0.30429   , -1.26629996, -0.67474997,  0.73622   , -0.49871999,
        -0.60782999, -0.59322   ,  0.63384002,  0.71824002, -0.38042   ,
        -0.43052   ,  0.40779999,  0.010772  , -1.21140003,  0.10012   ,
         0.1214    ,  0.46393001, -0.21067999,  0.57112998,  0.81358999,
        -0.17617001,  0.65995002, -0.1370200

## Create a weight matrix for word

In [23]:
embedding_matrix = np.zeros((vocab_size,100))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [25]:
def create_model():
    model = Sequential()
    e = Embedding(vocab_size,100, weights=[embedding_matrix],input_length =4, trainable = False)
    model.add(e)
    model.add(Flatten())
    model.add(Dense(1,activation='sigmoid'))
    # compile
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
    # summary
    model.summary()
    # Fit 
    model.fit(padded_docs,labels, epochs=50,verbose=0)
    return model

model = create_model()
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0) 
print("Accuracy %f" % (accuracy*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 4, 100)            1600      
_________________________________________________________________
flatten_3 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 401       
Total params: 2,001
Trainable params: 401
Non-trainable params: 1,600
_________________________________________________________________
Accuracy 100.000000
