In [1]:
pip install gensim nltk

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
covid_texts = [
    "COVID-19 vaccines are effective in preventing severe illness.",
    "Wearing masks and social distancing helps slow the spread.",
    "The virus primarily spreads through respiratory droplets.",
    "Many countries have implemented lockdowns to reduce transmission.",
    "Vaccination campaigns are ongoing worldwide."
]

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word not in stop_words and word not in string.punctuation]

corpus = [preprocess(text) for text in covid_texts]
print("Preprocessed Corpus:\n", corpus)


Preprocessed Corpus:
 [['covid-19', 'vaccines', 'effective', 'preventing', 'severe', 'illness'], ['wearing', 'masks', 'social', 'distancing', 'helps', 'slow', 'spread'], ['virus', 'primarily', 'spreads', 'respiratory', 'droplets'], ['many', 'countries', 'implemented', 'lockdowns', 'reduce', 'transmission'], ['vaccination', 'campaigns', 'ongoing', 'worldwide']]


[nltk_data] Downloading package punkt to C:\Users\attar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\attar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from gensim.models import FastText

# Train FastText model
fasttext_model = FastText(sentences=corpus, vector_size=100, window=3, min_count=1, sg=1, epochs=10)

# Example: Get vector for "covid"
vector_covid_fasttext = fasttext_model.wv['covid']
print("\nFastText vector for 'covid':\n", vector_covid_fasttext)

# Save vectors
fasttext_model.wv.save_word2vec_format("covid_fasttext_vectors.txt")



FastText vector for 'covid':
 [-5.2537007e-04  7.5024628e-04 -1.8182585e-03 -1.5143050e-03
  1.6915161e-03  2.3521522e-04 -1.2964833e-03 -9.9575135e-04
  5.8155687e-04 -1.1400769e-03  2.8334472e-03  9.8211516e-04
 -1.7707513e-03 -2.3138227e-03 -2.5633004e-04 -1.2200292e-04
  7.9091772e-04 -1.2553094e-03 -6.9756212e-04 -1.3173289e-03
 -2.0263945e-03 -2.3070307e-04 -2.0273863e-03 -2.3071766e-03
 -6.0498167e-04  2.5093090e-04  3.9354860e-04 -1.7460535e-03
  1.5518399e-03  9.3254313e-04 -1.3613185e-03 -1.0563096e-03
 -2.3867062e-03  7.0288993e-04  1.2490264e-03  3.0626150e-04
  1.1278578e-03  2.6279795e-03 -1.7221106e-03  2.1671483e-04
 -5.4587156e-04 -4.1127900e-04  2.2859398e-04 -1.6629194e-04
 -2.3908501e-03 -2.9326328e-03  6.8384391e-04 -1.3333720e-03
  4.2666003e-04  4.1887915e-04 -9.4591596e-05 -6.6043396e-04
 -4.8879683e-03 -4.5349385e-04 -1.3059790e-03  2.9200173e-03
 -6.3996231e-05 -3.3272286e-03 -1.2057881e-03  2.9371184e-04
 -4.5855716e-04 -8.2507805e-04  2.3339657e-04 -1.55662

In [5]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

glove_input = 'glove.6B.100d.txt'  # Download and place it in your directory
word2vec_output = 'glove.6B.100d.word2vec.txt'

glove2word2vec(glove_input, word2vec_output)
glove_model = KeyedVectors.load_word2vec_format(word2vec_output, binary=False)

# Example: Get vector for "vaccine"
if 'vaccine' in glove_model:
    vector_glove = glove_model['vaccine']
    print("\nGloVe vector for 'vaccine':\n", vector_glove)

# Save vectors for COVID corpus words
covid_vocab = set(word for doc in corpus for word in doc)
with open("covid_glove_vectors.txt", "w") as f:
    for word in covid_vocab:
        if word in glove_model:
            vec = glove_model[word]
            vec_str = ' '.join(map(str, vec))
            f.write(f"{word} {vec_str}\n")


  glove2word2vec(glove_input, word2vec_output)


FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.100d.txt'