# AI For Business
## Module : Natural Language Processing - End to end Example

### Lesson 04
### Use Keras Based Network for understanding concepts in LSTM

## Key Steps and Learnings Expected
1.   Setup and Data Acquisition
2.   Load a Clean Dataset
3.   Text Processing and Analysis (Sentences, Tokens and Stemming)
4.   Text Representation - Encoding
5.   Text Representation - Bag of words
6.   Text Representation - Bag of N-Grams
7.   Text Representation - TFIDF
8.   Word Embeddings - Word2Vec, Glove.
9.   Visualize Embeddings






### S1B : Load Libraries and Sample data

In [2]:
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np

np.random.seed(100)

data = ['xyzaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz', 'pqraaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaapqr']

test_data = ['xyzaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaxyz','pqraaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaapqr']

In [47]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [33]:
# Vectorize the Data using Sklearn's LabelEncoder and OneHotEncoder (One-hot vectors for every character)
enc = LabelEncoder()
alphabet = np.array(list(set([c for w in data for c in w])))  #Unique set of characters
enc.fit(alphabet)
int_enc=enc.fit_transform(alphabet)
onehot_encoder = OneHotEncoder(sparse_output=False)
int_enc=int_enc.reshape(len(int_enc), 1)
onehot_encoded = onehot_encoder.fit_transform(int_enc)

In [9]:
alphabet, int_enc  #Encoding : Integer Encoding example

(array(['r', 'x', 'a', 'q', 'z', 'p', 'y'], dtype='<U1'),
 array([[3],
        [4],
        [0],
        [2],
        [6],
        [1],
        [5]]))

In [5]:
onehot_encoded  #Encoding : Vector / Array or Binary

array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.]])

In [10]:
# Create our Training Data
X_train=[]
y_train=[]

for w in data:
    for i in range(len(w)-1):
        X_train.extend(onehot_encoder.transform([enc.transform([w[i]])]))
        y_train.extend(onehot_encoder.transform([enc.transform([w[i+1]])]))

In [None]:
# Create our Test Data
X_test=[]
y_test=[]

for w in test_data:
    for i in range(len(w)-1):
        test_enc = onehot_encoder.transform([enc.transform([w[i]])])
        X_test.extend(test_enc)
        y_test.extend(onehot_encoder.transform([enc.transform([w[i+1]])]))

len(X_test), X_test[0:10]

In [20]:
sample_size=512
sample_len=len(X_train)

X_train = np.array([X_train*sample_size]).reshape(sample_size,sample_len,len(alphabet))
y_train = np.array([y_train*sample_size]).reshape(sample_size,sample_len,len(alphabet))

test_len=len(X_test)
X_test= np.array([X_test]).reshape(1,test_len,len(alphabet))
y_test= np.array([y_test]).reshape(1,test_len,len(alphabet))

In [None]:
len(X_test), X_test[0:10]

In [25]:
model=Sequential()
model.add(LSTM(input_dim =len(alphabet), units = 100, return_sequences = True))
#model.add(LSTM(input_dim =len(alphabet), output_dim = 100, return_sequences = True))

# NOTE: TimeDistributed is a special layer for passing weight information through Time - sequence
model.add(TimeDistributed(Dense(units = len(alphabet),activation  =  "sigmoid")))
model.compile(loss="binary_crossentropy",metrics=["accuracy"],optimizer = "adam")

In [28]:
n=1
while True:
        score = model.evaluate(X_test, y_test, batch_size=32)
        print ("[Iteration %d] score=%f"%(n,score[1]))
        if score[1] == 1.0:
            break
        n+=1
        model.fit(x= X_train, y=y_train, batch_size = 32, epochs = 1, )

[Iteration 1] score=0.784314
[Iteration 2] score=0.901961
[Iteration 3] score=0.901961
[Iteration 4] score=0.901961
[Iteration 5] score=0.901961
[Iteration 6] score=0.901961
[Iteration 7] score=0.901961
[Iteration 8] score=0.901961
[Iteration 9] score=0.901961
[Iteration 10] score=0.901961
[Iteration 11] score=0.901961
[Iteration 12] score=0.901961
[Iteration 13] score=0.901961
[Iteration 14] score=0.901961
[Iteration 15] score=0.901961
[Iteration 16] score=0.911765
[Iteration 17] score=0.921569
[Iteration 18] score=0.921569
[Iteration 19] score=0.931373
[Iteration 20] score=0.931373
[Iteration 21] score=0.931373
[Iteration 22] score=0.941176
[Iteration 23] score=0.941176
[Iteration 24] score=0.941176
[Iteration 25] score=0.941176
[Iteration 26] score=0.950980
[Iteration 27] score=0.960784
[Iteration 28] score=0.941176
[Iteration 29] score=0.941176
[Iteration 30] score=0.970588
[Iteration 31] score=0.980392
[Iteration 32] score=0.980392
[Iteration 33] score=0.980392
[Iteration 34] scor

In [38]:
m = np.array([6])
enc.inverse_transform(m)

array(['z'], dtype='<U1')

In [59]:
preds=model.predict(X_test)[0]
tmpstr = " "
for p in preds:
    m=np.argmax(p).reshape(-1, 1)
    m = str(enc.inverse_transform(m,))
    tmpstr = tmpstr + m
    #print(m)
print(tmpstr)

 ['y']['z']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['x']['y']['z']['q']['r']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['a']['p']['q']['r']


In [32]:
print(model.evaluate(X_test,y_test,batch_size=32))

[0.021867329254746437, 1.0]
