<a href="https://colab.research.google.com/github/shabanakausar/shabanakausar/blob/main/Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
#--------Word Embedding Techniques Using the Embedding Layer in Keras

In [88]:
#--------Word embeddings provide a dense representation of words and their relative meanings.
#--------They are an improvement over the sparse representations used in simpler bag-of-words
#--------models. Word embeddings can be learned from text data and reused across projects.
#--------They can also be learned as part of fitting a neural network on text data.

In [89]:
from tensorflow.keras.preprocessing.text import one_hot

In [90]:
# Toy corpus: seven short sentences used to demonstrate integer encoding
# and embedding lookups.
sent = [
    "the glass of milk",
    "the glass of juice",
    "the cup of tea",
    "i am a good boy",
    "i am a good developer",
    "understand the meaning of words",
    "your words are good",
]

In [91]:
# Echo the corpus to check its contents.
sent

['the glass of milk',
 'the glass of juice',
 'the cup of tea',
 'i am a good boy',
 'i am a good developer',
 'understand the meaning of words',
 'your words are good']

In [92]:
# Vocabulary size: passed to one_hot as the size of the index space and used
# as the Embedding layer's vocabulary size.
voc_size = 10000

In [93]:
# Encode each sentence as a list of integer word indices, one index per word,
# drawn from a vocabulary of size voc_size.
# NOTE(review): one_hot is hash-based, so the indices may differ between
# kernel sessions and distinct words could collide — confirm against the
# Keras documentation.
onehot_repr = [one_hot(sentence, voc_size) for sentence in sent]
print(onehot_repr)

[[60, 3213, 3851, 6718], [60, 3213, 3851, 3852], [60, 8113, 3851, 4837], [6053, 3263, 3259, 9049, 7646], [6053, 3263, 3259, 9049, 6573], [5740, 60, 8253, 3851, 4286], [4917, 4286, 1732, 9049]]


In [94]:
#-----------------------------------Word Embedding Representation

In [95]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [96]:
import numpy as np 

In [97]:
# Left-pad every encoded sentence with zeros so that all rows have exactly
# sent_length entries.
sent_length = 8
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')
print(embedded_docs)

[[   0    0    0    0   60 3213 3851 6718]
 [   0    0    0    0   60 3213 3851 3852]
 [   0    0    0    0   60 8113 3851 4837]
 [   0    0    0 6053 3263 3259 9049 7646]
 [   0    0    0 6053 3263 3259 9049 6573]
 [   0    0    0 5740   60 8253 3851 4286]
 [   0    0    0    0 4917 4286 1732 9049]]


In [98]:
# Embedding vector size for the demo model (matches the 10 used when the
# Embedding layer is built).
dim = 10

In [99]:
# Single-layer model that maps every word index of a padded sentence to a
# dim-dimensional vector. The layer is built from voc_size, dim and
# sent_length rather than repeated literals, so editing those settings
# actually changes the model.
model = Sequential()
model.add(Embedding(voc_size, dim, input_length=sent_length))
# Compiled with the Adam optimizer and MSE loss; no training is run on this
# model in the cells that follow, it is only used for forward passes.
model.compile(optimizer='adam', loss='mse')

In [100]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 8, 10)             100000    
                                                                 
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [101]:
# The model has not been fitted, so this prints the untrained embedding
# vectors looked up for every word index in embedded_docs.
print(model.predict(embedded_docs))

[[[-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495
    0.03321674  0.04650969 -0.04511593 -0.03486379 -0.02792074]
  [-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495
    0.03321674  0.04650969 -0.04511593 -0.03486379 -0.02792074]
  [-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495
    0.03321674  0.04650969 -0.04511593 -0.03486379 -0.02792074]
  [-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495
    0.03321674  0.04650969 -0.04511593 -0.03486379 -0.02792074]
  [ 0.00836601  0.00814414 -0.01712896  0.01734373  0.02277407
   -0.0367194  -0.02883312  0.0459929   0.00873052  0.04626245]
  [-0.01811395 -0.02689333 -0.02674539  0.00061687  0.04675669
    0.02317642  0.03153678 -0.01485841  0.03527195  0.02218441]
  [-0.0436357   0.03267442 -0.00299769  0.02690214  0.02873241
   -0.01455457  0.02239582 -0.00111525  0.02026743  0.01659402]
  [-0.02855709  0.03532175 -0.04431939  0.04544285 -0.00679753
   -0.01910367 -0.0335505   0.02254362  0.000294

In [102]:
# First padded sentence ('the glass of milk'): four padding zeros followed by
# its four word indices.
embedded_docs[0]

array([   0,    0,    0,    0,   60, 3213, 3851, 6718], dtype=int32)

In [103]:
# Embedding vectors for the first sentence only: one row per position.
# The identical leading rows all correspond to the padding index 0.
print(model.predict(embedded_docs)[0])

[[-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495  0.03321674
   0.04650969 -0.04511593 -0.03486379 -0.02792074]
 [-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495  0.03321674
   0.04650969 -0.04511593 -0.03486379 -0.02792074]
 [-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495  0.03321674
   0.04650969 -0.04511593 -0.03486379 -0.02792074]
 [-0.01053866  0.00779831 -0.03539757 -0.04246053  0.02607495  0.03321674
   0.04650969 -0.04511593 -0.03486379 -0.02792074]
 [ 0.00836601  0.00814414 -0.01712896  0.01734373  0.02277407 -0.0367194
  -0.02883312  0.0459929   0.00873052  0.04626245]
 [-0.01811395 -0.02689333 -0.02674539  0.00061687  0.04675669  0.02317642
   0.03153678 -0.01485841  0.03527195  0.02218441]
 [-0.0436357   0.03267442 -0.00299769  0.02690214  0.02873241 -0.01455457
   0.02239582 -0.00111525  0.02026743  0.01659402]
 [-0.02855709  0.03532175 -0.04431939  0.04544285 -0.00679753 -0.01910367
  -0.0335505   0.02254362  0.00029442  0.04536163]]


In [104]:
#--------Loading the IMDB data for use with an Embedding layer

In [105]:
import tensorflow as tf
from keras.datasets import imdb
from keras import preprocessing


In [106]:
# Vocabulary cap: number of words to consider as features.
max_features = 10_000

# Number of word indices kept per review (among the max_features most common
# words); sequences are padded or truncated to this length further down.
maxlen = 20

In [107]:
# Load the IMDB data as lists of integers, with the vocabulary capped at
# max_features words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [108]:
# Before padding, x_train is a 1-D array holding one integer list per sample.
x_train.shape

(25000,)

In [109]:
# One label per training sample.
y_train.shape

(25000,)

In [110]:
# Convert the lists of integers into 2-D integer arrays of shape
# (samples, maxlen).
# NOTE(review): with pad_sequences' default settings, longer sequences appear
# to be truncated from the front (keeping the last maxlen indices) — confirm
# against the Keras docs if the opening words of a review are what matters.
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)

In [111]:
#--------------------Using an Embedding layer and classifier on the IMDB data

In [112]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

In [113]:
# Start a fresh model for the IMDB classifier; this rebinds `model`, replacing
# the embedding-only demo model built earlier.
model = Sequential()

In [114]:
# Embed each of the maxlen word indices as an 8-dimensional vector; after this
# layer the activations have shape (samples, maxlen, 8).
# input_length fixes the sequence length so the embedded inputs can be
# flattened later. The vocabulary size reuses max_features so it always matches
# the num_words cap applied when the IMDB data was loaded.
model.add(Embedding(max_features, 8, input_length=maxlen))

In [115]:
# Flatten the 3D tensor of embeddings, shape (samples, maxlen, 8), into a 2D
# tensor of shape (samples, maxlen * 8) so a Dense layer can consume it.
model.add(Flatten())

In [116]:
# Classifier head: a single sigmoid unit on top of the flattened embeddings,
# trained with binary cross-entropy and tracked with accuracy ('acc').
model.add(Dense(units=1, activation='sigmoid'))
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 20, 8)             80000     
                                                                 
 flatten_1 (Flatten)         (None, 160)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 161       
                                                                 
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [117]:
# Train for 10 epochs in mini-batches of 32, holding out 20% of the training
# samples for validation; the returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
