In [2]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
# Fetch the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer data and the 'stopwords' corpus.
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

# Location of the raw article text, relative to the working directory.
file_path = 'Europe_Covid.txt'

# Read the whole file into memory as one UTF-8 string.
with open(file_path, mode='r', encoding='utf-8') as corpus_file:
    europe_covid_text = corpus_file.read()

def preprocess_text(text):
    """Lower-case, strip punctuation, tokenize, and drop English stopwords.

    Args:
        text: Raw input string.

    Returns:
        A list of lower-cased tokens, in their original order, with every
        character other than ASCII letters, digits and whitespace removed
        and with NLTK's English stopwords filtered out.
    """
    text = text.lower()
    # Keep only ASCII letters, digits and whitespace (e.g. "covid-19" -> "covid19").
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    # Load the stopword list once and use a set: the original re-read the
    # list for every token and did a linear scan through it each time.
    english_stopwords = set(stopwords.words('english'))
    filtered_words = [word for word in tokens if word not in english_stopwords]
    return filtered_words

# Clean and tokenize the full article with stopwords removed.
preprocessed_data = preprocess_text(text=europe_covid_text)

# Preview the first ten tokens.
preprocessed_data[0:10]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['european',
 'countries',
 'reporting',
 'record',
 'numbers',
 'covid19',
 'cases',
 'continent',
 'prepares',
 'pandemic']

In [3]:
def simple_preprocess_text(text):
    """Lower-case, strip punctuation, and tokenize, keeping stopwords.

    Args:
        text: Raw input string.

    Returns:
        A list of lower-cased tokens, in their original order, with every
        character other than ASCII letters, digits and whitespace removed.
        Stopwords are kept.
    """
    text = text.lower()
    # Keep only ASCII letters, digits and whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # The original returned `tokens` without ever assigning it here, which
    # raises NameError on a fresh kernel (or silently returns a stale global
    # that ignores `text`). Tokenize the cleaned text explicitly.
    tokens = word_tokenize(text)
    return tokens

# Clean and tokenize the full article, keeping stopwords.
simple_preprocessed_data = simple_preprocess_text(text=europe_covid_text)

# Preview the first ten tokens.
simple_preprocessed_data[0:10]


['european',
 'countries',
 'are',
 'reporting',
 'record',
 'numbers',
 'of',
 'covid19',
 'cases',
 'as']

In [7]:
# Build the Word2Vec training corpus: one token list per sentence.
# Splitting on '.' leaves empty or whitespace-only fragments (for example
# after the final full stop), so fragments that yield no tokens are dropped
# instead of being fed to the model as empty sentences.
w2v_input_data = [
    sentence_tokens
    for sentence_tokens in (
        simple_preprocess_text(sentence) for sentence in europe_covid_text.split('.')
    )
    if sentence_tokens
]

# Configure the model, build its vocabulary from the corpus, then train for 30 epochs.
w2v_model = Word2Vec(min_count=1, window=5, vector_size=100, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20, workers=1)
w2v_model.build_vocab(w2v_input_data)
w2v_model.train(w2v_input_data, total_examples=w2v_model.corpus_count, epochs=30)


def _most_similar_or_message(model, word, topn=5):
    """Return the `topn` nearest neighbours of `word`, or a message string.

    Args:
        model: A trained Word2Vec model.
        word: The query word.
        topn: Number of neighbours to request.

    Returns:
        The result of `model.wv.most_similar(word, topn=topn)` when `word`
        is in the model's vocabulary, otherwise the string
        "Word '<word>' not in vocabulary".
    """
    if word in model.wv:
        return model.wv.most_similar(word, topn=topn)
    return f"Word '{word}' not in vocabulary"


similar_words_government = _most_similar_or_message(w2v_model, 'government')
similar_words_curfew = _most_similar_or_message(w2v_model, 'curfew')



In [6]:
# Show the top-5 most-similar result for 'government' (or the
# not-in-vocabulary message string) computed in the training cell.
similar_words_government


[('estimates', 0.2841702401638031),
 ('speech', 0.2495124340057373),
 ('have', 0.24307982623577118),
 ('from', 0.24264861643314362),
 ('isolate', 0.2306818664073944)]

In [5]:
# Show the top-5 most-similar result for 'curfew' (or the
# not-in-vocabulary message string) computed in the training cell.
similar_words_curfew


[('pandemic', 0.3649963140487671),
 ('since', 0.332381933927536),
 ('appeal', 0.3171386122703552),
 ('officially', 0.3149731755256653),
 ('their', 0.3129030168056488)]