## Problem Statement 6

### Implement the Continuous Bag of Words (CBOW) Model for the given (textual document 1) using the below steps: 
    a. Data preparation 
    b. Generate training data 
    c. Train model
    d. Output

In [2]:
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.src.utils import np_utils
from keras import backend as K

from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

from sklearn.metrics.pairwise import euclidean_distances

import numpy as np
import pandas as pd

In [35]:
file=open("CBOW.txt",'w')

In [36]:
file.write(
    """
    The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission â€“transmission of the virus before the appearance of symptoms â€“ is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number â€“ the number of secondary infections generated from one infected individual â€“ is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.
    """
)

1209

In [37]:
file.close()

In [39]:
file=open("CBOW.txt",'r')
content=file.read()

In [41]:
print(content)


    The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission â€“transmission of the virus before the appearance of symptoms â€“ is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number â€“ the number of secondary infections generated from one infected individual â€“ is understood to 

In [42]:
data = """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. 

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission â€“transmission of the virus before the appearance of symptoms â€“ is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. 

The reproductive number â€“ the number of secondary infections generated from one infected individual â€“ is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult."""

In [43]:
dl_data=data.split()

# Data preparation

In [48]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(dl_data)

words2id = tokenizer.word_index
words2id['PAD']=0

id2words={v:k for k,v in words2id.items()}

In [49]:
wids = [[words2id[w]for w in text.text_to_word_sequence(doc)] for doc in dl_data]

In [51]:
vocab_size=len(words2id)
embed_size=100
window_size=2

In [54]:
print(vocab_size)
print("Vocabulary: ",list(words2id.items())[:10])

103
Vocabulary:  [('the', 1), ('of', 2), ('influenza', 3), ('covid', 4), ('19', 5), ('virus', 6), ('for', 7), ('transmission', 8), ('is', 9), ('to', 10)]


# Generate training data

In [64]:
def pairwise(corpus, window_size, vocab_size):
    context_length = window_size*2

    for words in corpus:
        sentence_length=len(words)

        for index, word in enumerate(words):
            context_words=[]
            label_word=[]
            start=index-window_size
            end=index + window_size + 1

            context_words.append([ words[i] for i in range(start, end) if 0 <= i < sentence_length and i!=index])
            label_word.append(word)

            x = pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield(x,y)


# Train Model

In [65]:
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length = window_size*2))
cbow.add(Lambda(lambda x:K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))

cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print(cbow.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 4, 100)            10300     
                                                                 
 lambda_3 (Lambda)           (None, 100)               0         
                                                                 
 dense_3 (Dense)             (None, 103)               10403     
                                                                 
Total params: 20703 (80.87 KB)
Trainable params: 20703 (80.87 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [66]:
for epoch in range(1,6):
    loss=0
    for x,y in pairwise(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        loss += cbow.train_on_batch(x,y)
    print("Epoch: ",epoch, "Loss: ", loss)
    print()

Epoch:  1 Loss:  915.4748382568359

Epoch:  2 Loss:  894.0665366649628

Epoch:  3 Loss:  879.0914084911346

Epoch:  4 Loss:  872.744843006134

Epoch:  5 Loss:  868.3536374568939



In [72]:
weights=cbow.get_weights()[0]
weights=weights[1:]
print(weights.shape)

(102, 100)


In [77]:
distance_matrix = euclidean_distances(weights)

inwords=input()

similar_words = {search_term: [id2words[idx] for idx in distance_matrix[words2id[search_term]-1].argsort()[0:6]]
                              for search_term in {inwords}}

similar_words

 covid


{'covid': ['influenza', 'higher', 'major', 'important', 'this', 'understood']}