In [11]:
import gensim
from gensim import corpora
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
# Assume you have some medical documents
documents = ["The patient was diagnosed with influenza.", "The doctor prescribed antibiotics for the infection.",
             "The MRI scan showed a tumor in the brain.", "The patient has a history of heart disease."]
documents

['The patient was diagnosed with influenza.',
 'The doctor prescribed antibiotics for the infection.',
 'The MRI scan showed a tumor in the brain.',
 'The patient has a history of heart disease.']

In [13]:
# Preprocess the documents
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
texts = [[lemmatizer.lemmatize(word) for word in word_tokenize(document.lower()) if word not in stop_words and word not in string.punctuation and word.isalnum()] for document in documents]

# Create a dictionary from the texts
dictionary = corpora.Dictionary(texts)

# Create a corpus from the dictionary
corpus = [dictionary.doc2bow(text) for text in texts]

corpus

[[(0, 1), (1, 1), (2, 1)],
 [(3, 1), (4, 1), (5, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1)],
 [(2, 1), (12, 1), (13, 1), (14, 1)]]

In [14]:
# Train the LDA model
lda_model = gensim.models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

# Print the topics
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.091*"showed" + 0.091*"tumor" + 0.091*"brain" + 0.091*"scan"')
(1, '0.172*"patient" + 0.103*"disease" + 0.103*"history" + 0.103*"heart"')


In [15]:
# Calculate the coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.42227729739736464


In [16]:
# Assume you have some patient records
records = [
    "Patient has been experiencing severe chest pain for the past week. EKG shows abnormalities that suggest possible heart disease.",
    "Patient reports frequent headaches and dizziness. Blood tests and MRI scans suggest possible anemia.",
    "Patient has history of chronic kidney disease. Recent tests show high levels of creatinine.",
    "Patient complains of persistent cough and difficulty breathing. Chest X-ray suggests pneumonia."
]

# Preprocess the records
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
texts = [[lemmatizer.lemmatize(word) for word in word_tokenize(record.lower()) if word not in stop_words and word not in string.punctuation and word.isalnum()] for record in records]

# Create a dictionary from the texts
dictionary = corpora.Dictionary(texts)

# Create a corpus from the dictionary
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model
lda_model = gensim.models.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

# Print the topics
topics = lda_model.print_topics(num_words=4)
for topic in topics:
    print(topic)

# Calculate the coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


(0, '0.060*"patient" + 0.060*"chest" + 0.036*"show" + 0.036*"disease"')
(1, '0.060*"patient" + 0.060*"test" + 0.036*"suggest" + 0.036*"possible"')

Coherence Score:  0.2419932228277904


### Fine-tune the LDA model
Fine-tuning an LDA model involves adjusting various parameters to improve the model's performance. One of the most important parameters to adjust is the number of topics. Other parameters include alpha and beta, which control the distribution of topics across documents and the distribution of words across topics, respectively.

In [17]:
# Define function to calculate coherence score
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15, alpha='auto', eta='auto')
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Use function to calculate coherence scores for different numbers of topics
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=texts, start=2, limit=10, step=1)

# Print the coherence scores
for m, cv in zip(range(2, 10, 1), coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.2351
Num Topics = 3  has Coherence Value of 0.2265
Num Topics = 4  has Coherence Value of 0.3018
Num Topics = 5  has Coherence Value of 0.3284
Num Topics = 6  has Coherence Value of 0.2902
Num Topics = 7  has Coherence Value of 0.3
Num Topics = 8  has Coherence Value of 0.2969
Num Topics = 9  has Coherence Value of 0.2904
