In [54]:
with open('corona.txt','r') as file:
    corpus = file.read()


The speed of transmission is an important point of difference between the two viruses.
Influenza has a shorter median incubation period (the time from infection to appearance of symptoms)
and a shorter serial interval (the time between successive cases) than COVID-19 virus.
The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days.
This means that influenza can spread faster than COVID-19.

Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –
transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza.
In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset,
at present, this does not appear to be a major driver of transmission.

The reproductive number – the number of secondary infections generated from one infected individual –
is understood to be between 2 

In [55]:
import tensorflow as tf
from tensorflow.keras.layers import Dense,Dropout,GlobalAveragePooling1D,Embedding
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics.pairwise import cosine_similarity 
import numpy as np

In [56]:
sentences = corpus.split('.')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = len(tokenizer.word_index) + 1

In [57]:
print(sentences)

['\nThe speed of transmission is an important point of difference between the two viruses', '\nInfluenza has a shorter median incubation period (the time from infection to appearance of symptoms)\nand a shorter serial interval (the time between successive cases) than COVID-19 virus', '\nThe serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days', '\nThis means that influenza can spread faster than COVID-19', '\n\nFurther, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –\ntransmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza', '\nIn contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset,\nat present, this does not appear to be a major driver of transmission', '\n\nThe reproductive number – the number of secondary infections generated from one infected individual 

In [60]:
window_size = 3
tokenized_sentences = tokenizer.texts_to_sequences(sentences)
data, labels = [],[]
for sentence in tokenized_sentences:
    for i,target_word in enumerate(sentence):
        context = [
            sentence[j] for j in range(i - window_size,i + window_size+1)
            if j!=i and 0 <= j < len(sentence) 
        ]
        data.append(context)
        labels.append(target_word)
        
data = pad_sequences(data)
labels = np.array(labels)

In [61]:
model = Sequential()
model.add(Embedding(input_dim = total_words, output_dim = 50, input_length = window_size*2))
model.add(GlobalAveragePooling1D())
model.add(Dense(total_words, activation = 'softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 6, 50)             5100      
                                                                 
 global_average_pooling1d (G  (None, 50)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 102)               5202      
                                                                 
Total params: 10,302
Trainable params: 10,302
Non-trainable params: 0
_________________________________________________________________


2023-11-16 18:11:35.461537: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [62]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(data, labels, epochs=200, verbose=1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


<keras.callbacks.History at 0x7f79d22720e0>

In [63]:
word_embeddings = model.layers[0].get_weights()[0]

In [65]:
from sklearn.metrics.pairwise import cosine_similarity

target_word = 'covid'
target_embedding = word_embeddings[tokenizer.word_index[target_word]]

similarities = cosine_similarity(target_embedding.reshape(1, -1), word_embeddings)[0]
most_similar_indices = similarities.argsort()[-5:][::-1]
    
most_similar_words = [word for word, idx in tokenizer.word_index.items() if idx in most_similar_indices]

print(f"Most similar words to '{target_word}': {most_similar_words}")

Most similar words to 'covid': ['covid', 'for', 'than', 'faster', 'higher']
