In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
#a1. Direct Text Given then uncomment below line and Comment entire a2 till b (for PS:15)

# corpus = ["The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19. Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission. The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult."]

# a2. Read the corpus from a file and comment entire a1 till a2 (for PS:6 7 8)
corpus = []
with open("input.txt", "r") as f:
    for line in f:
        corpus.append(line.strip())


In [3]:
# b. Generate training data (CBOW representation)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(corpus)
X, y = [], []

for seq in sequences:
    for i, target_word in enumerate(seq):
        for j in range(max(0, i - 2), min(i + 3, len(seq))):
            if i != j:
                X.append([target_word, seq[j]])
                y.append(1)  # Positive context
                X.append([target_word, np.random.choice(list(word_index.values()))])
                y.append(0)

X = np.array(X)
y = np.array(y)

In [4]:
# c. Train model
model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=1, input_length=2),
    GlobalAveragePooling1D(),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X, y, epochs=100, verbose=0)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2, 1)              103       
                                                                 
 global_average_pooling1d (G  (None, 1)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 2         
                                                                 
Total params: 105
Trainable params: 105
Non-trainable params: 0
_________________________________________________________________


In [5]:
# d. Output
word_embeddings = model.layers[0].get_weights()[0]
for word, index in word_index.items():
    print(f"{word}: {word_embeddings[index]}")

the: [-0.37010488]
of: [-0.48188525]
influenza: [-0.46069774]
covid: [-0.45239773]
19: [-0.27636144]
virus: [-0.32765952]
for: [-0.48407477]
transmission: [-0.2200723]
is: [-0.40631753]
to: [-0.5447663]
a: [-0.35021117]
and: [-0.1916189]
between: [-0.01801419]
time: [-0.35303912]
serial: [-0.23205838]
interval: [-0.06594086]
than: [-0.04720962]
be: [0.00748012]
5: [-0.15831295]
days: [-0.2052238]
â€“: [-0.16303705]
are: [-0.4490381]
viruses: [-0.09630861]
shorter: [-0.11520135]
from: [0.03738713]
appearance: [-0.09285238]
symptoms: [-0.00954651]
while: [0.02118974]
3: [-0.20101418]
this: [-0.16127083]
that: [0.04334296]
can: [-0.11270068]
in: [-0.03438266]
major: [-0.05847953]
driver: [0.04912895]
number: [0.10532448]
2: [-0.14720155]
speed: [-0.0617995]
an: [0.21000293]
important: [0.385957]
point: [0.17243873]
difference: [-0.22974]
two: [0.05333519]
has: [-0.2054785]
median: [0.16456467]
incubation: [0.43703637]
period: [0.4987716]
infection: [0.1893826]
successive: [0.30351338]
cas

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

target_word = 'viruses'
target_embedding = word_embeddings[tokenizer.word_index[target_word]]

similarities = cosine_similarity(target_embedding.reshape(1, -1), word_embeddings)[0]
most_similar_indices = similarities.argsort()[-5:][::-1]
    
most_similar_words = [word for word, idx in tokenizer.word_index.items() if idx in most_similar_indices]

print(f"Most similar words to '{target_word}': {most_similar_words}")

Most similar words to 'viruses': ['5', 'days', 'are', 'onset']
