# Experiment 3: Word2Vec Training on Medical Corpus with Bigram Detection

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

# Import libraries

In [None]:
import string

import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from sklearn.decomposition import PCA

# Read medical corpus from an external file

In [None]:
# Location of the corpus file (one sentence per line).
# NOTE(review): the path assumes Google Drive is mounted at /content/drive — confirm
# before running outside Colab.
CORPUS_PATH = "/content/drive/MyDrive/Colab Notebooks/medical_corpus.txt"

# Decode explicitly as UTF-8 so the result does not depend on the locale's
# default encoding; blank or whitespace-only lines are skipped.
with open(CORPUS_PATH, "r", encoding="utf-8") as file:
    corpus = [stripped for stripped in (line.strip() for line in file) if stripped]

# Refined stopword list

In [None]:
stopwords = {"the", "a", "is", "for", "with", "to", "of", "and", "in", "can", "are"}


def tokenize(sentence):
    """Turn one corpus sentence into a list of cleaned tokens.

    The sentence is lowercased and split on whitespace. Punctuation is trimmed
    from both ends of every token so that "diabetes." and "diabetes" map to the
    same vocabulary entry; punctuation inside a token (e.g. a hyphen) is kept.
    Tokens that end up empty or that are stopwords are dropped.

    Args:
        sentence: Raw sentence text.

    Returns:
        List of cleaned tokens in their original order.
    """
    trimmed = (raw.strip(string.punctuation) for raw in sentence.lower().split())
    # Filter after trimming so that punctuation-wrapped stopwords ("the,") are removed too.
    return [token for token in trimmed if token and token not in stopwords]


# Tokenization with punctuation trimming and stopword removal
tokenized_sentences = [tokenize(sentence) for sentence in corpus]

# Detect bigrams to capture word combinations (e.g., "blood sugar", "high pressure")

In [None]:
# Learn which adjacent token pairs should be treated as a single phrase.
bigram = Phrases(
    tokenized_sentences,
    min_count=2,   # count cut-off passed to Phrases
    threshold=5,   # score cut-off passed to Phrases
)

# Wrap the trained detector in the lighter Phraser object used for transformation.
bigram_phraser = Phraser(bigram)

# NOTE(review): this overwrites `tokenized_sentences`, so re-running this cell
# without re-running the tokenization cell applies phrase merging a second time.
tokenized_sentences = [bigram_phraser[tokens] for tokens in tokenized_sentences]

# Train a skip-gram Word2Vec model with hierarchical softmax

In [None]:
# Skip-gram (sg=1) trained with hierarchical softmax (hs=1); negative=0 switches
# negative sampling off. A fixed seed together with a single worker thread makes
# training repeatable from run to run (multi-threaded training introduces ordering
# jitter). Per the gensim documentation, reproducibility across interpreter
# launches additionally requires setting PYTHONHASHSEED.
model = Word2Vec(
    tokenized_sentences,
    vector_size=150,
    window=5,
    min_count=2,
    epochs=300,
    sg=1,
    hs=1,
    negative=0,
    seed=42,
    workers=1,
)

# Display similar words

In [None]:
def top_similar(word, topn=5):
    """Return the `topn` nearest neighbours of `word` in the trained model.

    Args:
        word: Query token; it must be present in the model's vocabulary,
            otherwise the underlying `most_similar` lookup fails.
        topn: Number of neighbours to return.

    Returns:
        List of (token, similarity) pairs with the similarity rounded to two
        decimals for display.
    """
    return [
        (token, round(score, 2))
        for token, score in model.wv.most_similar(word, topn=topn)
    ]


diabetes_similar = top_similar("diabetes")
print("Words similar to 'diabetes':", diabetes_similar)
hypertension_similar = top_similar("hypertension")
print("Words similar to 'hypertension':", hypertension_similar)

Words similar to 'diabetes': [('lifestyle', 0.12), ('disease.', 0.11), ('diet', 0.08), ('sugar', 0.07), ('diabetes.', 0.06)]
Words similar to 'hypertension': [('disease.', 0.09), ('risk', 0.05), ('high', 0.03), ('diet', 0.01), ('blood', 0.01)]
