In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.layers import Embedding , Dense , Lambda
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential

In [5]:
with open("CBOW.txt" , "r") as file:
  text = file.read()

In [6]:
sentences = text.split('.')
sentences = [s.lower() for s in sentences]
sentences

['the speed of transmission is an important point of difference between the two viruses',
 ' influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than covid-19 virus',
 ' the serial interval for covid-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days',
 ' this means that influenza can spread faster than covid-19',
 ' \n\nfurther, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza',
 ' in contrast, while we are learning that there are people who can shed covid-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission',
 ' \n\nthe reproductive number – the number of secondary infections generated from one infected individual – 

In [10]:
#tokenizing the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

[[1, 38, 2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 43, 23], [3, 44, 11, 24, 45, 46, 47, 1, 14, 25, 48, 10, 26, 2, 27, 12, 11, 24, 15, 16, 1, 14, 13, 49, 50, 17, 4, 5, 6], [1, 15, 16, 7, 4, 5, 6, 9, 51, 10, 18, 19, 52, 20, 28, 7, 3, 6, 1, 15, 16, 9, 29, 20], [30, 53, 31, 3, 32, 54, 55, 17, 4, 5], [56, 8, 33, 1, 57, 29, 19, 20, 2, 58, 59, 60, 61, 62, 8, 63, 2, 1, 6, 64, 1, 26, 2, 27, 21, 9, 11, 34, 35, 2, 8, 7, 3], [33, 65, 28, 66, 22, 67, 31, 68, 22, 69, 70, 32, 71, 4, 5, 6, 72, 73, 74, 75, 10, 76, 77, 78, 79, 30, 80, 81, 82, 10, 18, 11, 34, 35, 2, 8], [1, 83, 36, 21, 1, 36, 2, 84, 85, 86, 25, 87, 88, 89, 21, 9, 90, 10, 18, 13, 37, 12, 37], [19, 7, 4, 5, 6, 91, 17, 7, 3], [92, 93, 7, 94, 4, 5, 12, 3, 23, 22, 95, 96, 12, 14, 97, 98, 99, 100, 101, 102], []]


In [13]:
#make indexes
index_to_word = tokenizer.index_word
word_to_index = tokenizer.word_index
print(index_to_word)

{1: 'the', 2: 'of', 3: 'influenza', 4: 'covid', 5: '19', 6: 'virus', 7: 'for', 8: 'transmission', 9: 'is', 10: 'to', 11: 'a', 12: 'and', 13: 'between', 14: 'time', 15: 'serial', 16: 'interval', 17: 'than', 18: 'be', 19: '5', 20: 'days', 21: '–', 22: 'are', 23: 'viruses', 24: 'shorter', 25: 'from', 26: 'appearance', 27: 'symptoms', 28: 'while', 29: '3', 30: 'this', 31: 'that', 32: 'can', 33: 'in', 34: 'major', 35: 'driver', 36: 'number', 37: '2', 38: 'speed', 39: 'an', 40: 'important', 41: 'point', 42: 'difference', 43: 'two', 44: 'has', 45: 'median', 46: 'incubation', 47: 'period', 48: 'infection', 49: 'successive', 50: 'cases', 51: 'estimated', 52: '6', 53: 'means', 54: 'spread', 55: 'faster', 56: 'further', 57: 'first', 58: 'illness', 59: 'or', 60: 'potentially', 61: 'pre', 62: 'symptomatic', 63: '–transmission', 64: 'before', 65: 'contrast', 66: 'we', 67: 'learning', 68: 'there', 69: 'people', 70: 'who', 71: 'shed', 72: '24', 73: '48', 74: 'hours', 75: 'prior', 76: 'symptom', 77: 'o

In [35]:
#generate traing data || creating context and target

vocab_size = len(word_to_index) + 1
emb_size = 100
context_size = 2;
contexts=[]
targets = []
for sequence in sequences:
  for i in range(context_size , len(sequence)-context_size):
    context = [sequence[i-2],sequence[i-1],sequence[i+1],sequence[i+2]]
    target = sequence[i]
    contexts.append(context)
    targets.append(target)

print(targets,"\n")
print(contexts)

[2, 8, 9, 39, 40, 41, 2, 42, 13, 1, 11, 24, 45, 46, 47, 1, 14, 25, 48, 10, 26, 2, 27, 12, 11, 24, 15, 16, 1, 14, 13, 49, 50, 17, 4, 16, 7, 4, 5, 6, 9, 51, 10, 18, 19, 52, 20, 28, 7, 3, 6, 1, 15, 16, 9, 31, 3, 32, 54, 55, 17, 33, 1, 57, 29, 19, 20, 2, 58, 59, 60, 61, 62, 8, 63, 2, 1, 6, 64, 1, 26, 2, 27, 21, 9, 11, 34, 35, 2, 8, 28, 66, 22, 67, 31, 68, 22, 69, 70, 32, 71, 4, 5, 6, 72, 73, 74, 75, 10, 76, 77, 78, 79, 30, 80, 81, 82, 10, 18, 11, 34, 35, 36, 21, 1, 36, 2, 84, 85, 86, 25, 87, 88, 89, 21, 9, 90, 10, 18, 13, 37, 4, 5, 6, 91, 17, 7, 94, 4, 5, 12, 3, 23, 22, 95, 96, 12, 14, 97, 98, 99, 100] 

[[1, 38, 8, 9], [38, 2, 9, 39], [2, 8, 39, 40], [8, 9, 40, 41], [9, 39, 41, 2], [39, 40, 2, 42], [40, 41, 42, 13], [41, 2, 13, 1], [2, 42, 1, 43], [42, 13, 43, 23], [3, 44, 24, 45], [44, 11, 45, 46], [11, 24, 46, 47], [24, 45, 47, 1], [45, 46, 1, 14], [46, 47, 14, 25], [47, 1, 25, 48], [1, 14, 48, 10], [14, 25, 10, 26], [25, 48, 26, 2], [48, 10, 2, 27], [10, 26, 27, 12], [26, 2, 12, 11], [

In [36]:
#printing features with target
for i in range(5):
  word = []
  target = index_to_word.get(targets[i])
  for j in contexts[i]:
    word.append(index_to_word.get(j))
  print(word, "=>" , target)

['the', 'speed', 'transmission', 'is'] => of
['speed', 'of', 'is', 'an'] => transmission
['of', 'transmission', 'an', 'important'] => is
['transmission', 'is', 'important', 'point'] => an
['is', 'an', 'point', 'of'] => important


In [37]:
#creating training and testing data
x = np.array(contexts)
y = np.array(targets)
print("shape of x = ",x.shape)
print("shape of y = ",y.shape)

shape of x =  (162, 4)
shape of y =  (162,)


In [43]:
#defining the model

model=Sequential([
    Embedding(input_dim=vocab_size,output_dim=emb_size,input_length=context_size*2),
    Lambda(lambda x:tf.reduce_mean(x,axis=1)),
    Dense(256,activation='relu'),
    Dense(512,activation='relu'),
    Dense(vocab_size,activation='softmax')
])

In [44]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])


In [45]:
history=model.fit(x,y,epochs=20)


Epoch 1/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.0106 - loss: 4.6345
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.0616 - loss: 4.6100 
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0645 - loss: 4.5776
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0428 - loss: 4.5251 
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0436 - loss: 4.4267 
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0555 - loss: 4.2357 
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0559 - loss: 4.0908 
Epoch 8/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0802 - loss: 3.9327 
Epoch 9/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [47]:
#output

test_word = [index_to_word[index] for index in contexts[0]]
input_data = np.expand_dims(contexts[0],axis =0)
print(input_data)

pred = model.predict(input_data)
predicted_index = np.argmax(pred[0])

print("Context words == " , test_word)
print("target word == " , index_to_word[predicted_index])

[[ 1 38  8  9]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Context words ==  ['the', 'speed', 'transmission', 'is']
target word ==  of
