In [290]:
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.preprocessing.text import Tokenizer

# Sample text data: a short passage comparing COVID-19 and influenza transmission.
data = """The speed of transmission is an important point of difference between the two viruses. Influenza has a shorter median incubation period (the time from infection to appearance of symptoms) and a shorter serial interval (the time between successive cases) than COVID-19 virus. The serial interval for COVID-19 virus is estimated to be 5-6 days, while for influenza virus, the serial interval is 3 days. This means that influenza can spread faster than COVID-19.Further, transmission in the first 3-5 days of illness, or potentially pre-symptomatic transmission –transmission of the virus before the appearance of symptoms – is a major driver of transmission for influenza. In contrast, while we are learning that there are people who can shed COVID-19 virus 24-48 hours prior to symptom onset, at present, this does not appear to be a major driver of transmission.The reproductive number – the number of secondary infections generated from one infected individual – is understood to be between 2 and 2.5 for COVID-19 virus, higher than for influenza. However, estimates for both COVID-19 and influenza viruses are very context and time-specific, making direct comparisons more difficult.  """

# Data preparation: split the passage on '.' and normalise every fragment to
# lowercase alphanumeric words separated by single spaces.
# NOTE: splitting on '.' also cuts decimals such as "2.5" into two fragments.
sentences = data.split('.')
clean_sent = []
for sentence in sentences:
    # Replace every run of non-alphanumeric characters with one space.
    sentence = re.sub(r'[^A-Za-z0-9]+', ' ', sentence)
    # Drop one-character tokens ("a", "3", ...). The previous pattern was
    # written with forward slashes ('/b/w/b') and therefore never matched.
    sentence = re.sub(r'\b\w\b', ' ', sentence)
    # Collapse the spaces left behind by the substitutions, then lowercase.
    sentence = ' '.join(sentence.split()).lower()
    # Skip fragments that are empty once cleaned (e.g. the whitespace tail
    # after the final '.'), so no empty sentence reaches the tokenizer.
    if not sentence:
        continue
    clean_sent.append(sentence)

# Tokenization: learn the vocabulary from the cleaned sentences and encode each
# sentence as a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sent)
sequences = tokenizer.texts_to_sequences(clean_sent)

# Word <-> id lookup tables. Take them straight from the fitted tokenizer
# rather than re-deriving them by lining up str.split() output with the encoded
# sequences, which would silently misalign if the two tokenisations ever
# disagreed. Copies are made so later edits cannot mutate the tokenizer state.
word_to_index = dict(tokenizer.word_index)
index_to_word = dict(tokenizer.index_word)

# Define model parameters
# +1 because Keras Tokenizer ids start at 1 and id 0 is reserved.
vocab_size = len(tokenizer.word_index) + 1
emb_size = 10      # dimensionality of the learned word vectors
context_size = 2   # number of words taken on EACH side of the target word

# Generate training data (contexts and targets).
# For every position that has `context_size` neighbours on both sides, the
# context is the surrounding 2 * context_size word ids and the target is the
# centre word id. The window is derived from `context_size` so changing the
# parameter above is enough to change the window width.
contexts = []
targets = []
for sequence in sequences:
    for i in range(context_size, len(sequence) - context_size):
        target = sequence[i]
        context = sequence[i - context_size:i] + sequence[i + 1:i + context_size + 1]
        contexts.append(context)
        targets.append(target)

# Convert contexts and targets to numpy arrays.
# An explicit integer dtype and reshape keep X two-dimensional
# (n_samples, 2 * context_size) even when no sentence is long enough to
# produce a training pair.
X = np.array(contexts, dtype=int).reshape(-1, 2 * context_size)
Y = np.array(targets, dtype=int)

# Build the CBOW model: embed every context word, average the embeddings into
# a single vector, then classify which vocabulary word is the target.
model = Sequential([
    # Integer word ids with an unspecified sequence length. This replaces the
    # Embedding `input_length` argument (deprecated in Keras 3) and lets the
    # model accept contexts whose length differs from 2 * context_size, as the
    # prediction code does after dropping out-of-vocabulary words.
    tf.keras.Input(shape=(None,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=emb_size),
    # Mean over the sequence axis; same result as
    # Lambda(lambda x: tf.reduce_mean(x, axis=1)) but a built-in layer, so the
    # model stays serialisable without a Python lambda.
    tf.keras.layers.GlobalAveragePooling1D(),
    Dense(256, activation='relu'),
    Dense(512, activation='relu'),
    # One probability per vocabulary id.
    Dense(vocab_size, activation='softmax')
])

# Compile the model. Targets are integer ids (not one-hot), hence the sparse
# categorical cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the network on the (context, target) pairs for 80 epochs
history = model.fit(X, Y, epochs=80)

# Show the metrics recorded during training, heading first
print("Training History:", history.history, sep="\n")


# Context phrases to run through the trained model
test_sentences = [
    "speed of transmission influenza",
    "serial interval is sdsd",
    "presymptomatic transmission major driver",
    "secondary infections generated individual"
]

for sent in test_sentences:
    test_words = sent.split(" ")

    # Keep the ids of known words only; out-of-vocabulary words are dropped
    x_test = [word_to_index[word] for word in test_words if word in word_to_index]

    # Nothing left to feed the model: report it and move on
    if not x_test:
        print(f"Skipping test sentence: '{sent}' as no words were found in vocabulary.")
        continue

    # Wrap in an outer list so the array has a leading batch axis of size 1
    x_test = np.array([x_test])

    # Highest-scoring vocabulary id for the single sample in the batch
    pred = np.argmax(model.predict(x_test)[0])
    print("Prediction for", test_words, ":", index_to_word.get(pred), "\n")


Epoch 1/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 2s 10ms/step - accuracy: 0.0158 - loss: 4.6143
Epoch 2/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.0882 - loss: 4.6010
Epoch 3/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 8ms/step - accuracy: 0.0850 - loss: 4.5816
Epoch 4/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.0767 - loss: 4.5500
Epoch 5/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.0703 - loss: 4.4915
Epoch 6/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.1105 - loss: 4.3811
Epoch 7/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 6ms/step - accuracy: 0.0758 - loss: 4.2709
Epoch 8/80
5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step - accuracy: 0.0807 - loss: 4.2206
Epoch 9/80
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m