In [22]:
import pandas as pd
import numpy as np
import os

## 1 - Load the preprocessed data

In [5]:
processed_file_path = os.path.join('..', 'data', 'processed')
clustering_model_name = 'KMeans'
processed_file_name = f'train_{clustering_model_name}.csv'

train_df = pd.read_csv(os.path.join(processed_file_path, processed_file_name), low_memory=False)
train_df.head()

Unnamed: 0,file,VMONTH,VYEAR,VDAYR,YEAR,AGE,SEX,ETHNIC,RACE,USETOBAC,...,OTHPROV,MHP,NODISP,REFOTHMD,RETAPPT,OTHDISP,ERADMHOS,cluster,CombinedText,ProcessedText
0,opd2006.csv,December,2006.0,Friday,2006.0,55.0,Male,Not Hispanic or Latino,White Only,Not current,...,No,,One or more dispositions marked,No,No,No,No,9,"55_year_old Male Acute problem Injury, other a...",55_year_old male acute problem injury unspecif...
1,opd2006.csv,November,2006.0,Thursday,2006.0,66.0,Male,Not Hispanic or Latino,White Only,Not current,...,No,,One or more dispositions marked,No,No,No,No,0,66_year_old Male Acute problem Cough Chronic o...,66_year_old male acute problem cough chronic o...
2,opd2006.csv,November,2006.0,Wednesday,2006.0,71.0,Female,Not Hispanic or Latino,White Only,Not current,...,No,,One or more dispositions marked,Yes,No,No,No,7,71_year_old Female Acute problem General ill f...,71_year_old female acute problem general ill f...
3,opd2006.csv,November,2006.0,Tuesday,2006.0,1.0,Female,Not Hispanic or Latino,White Only,Not current,...,No,,One or more dispositions marked,No,No,No,No,2,1_year_old Female Acute problem Other and unsp...,1_year_old female acute problem unspecified sy...
4,opd2006.csv,November,2006.0,Monday,2006.0,21.0,Female,Not Hispanic or Latino,White Only,Current,...,No,,One or more dispositions marked,No,No,No,No,6,21_year_old Female TOBACCO user Acute problem ...,21_year_old female tobacco user acute problem ...


## 2 - Topic modeling

In [17]:
import gensim
from gensim import corpora

random_seed = 42

In [13]:
# Generate bigrams
def generate_bigrams(row):
    tokens = row.split()
    bigram_model = gensim.models.Phrases(tokens, min_count=5, threshold=100)
    tokens = [token for token in bigram_model[tokens]]
    return tokens


print(generate_bigrams(train_df['ProcessedText'].iloc[0]))
preprocessed_docs = train_df['ProcessedText'].apply(generate_bigrams)

['55_year_old', 'male', 'acute', 'problem', 'injury', 'unspecified', 'sho', 'diabetes', 'hyperlipidemia', 'obesity', 'currently', 'enrol', 'disease', 'management', 'program', 'shoulder', 'upper', 'arm', 'injury']


In [15]:
# Build dictionary and corpus
dictionary = corpora.Dictionary(preprocessed_docs)
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_docs]

In [59]:
# Train LDA model
n_topics = 10

lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=n_topics,
    random_state=random_seed,
    chunksize=100,
    passes=10,
)



### 2.1 - Visualize the topics

In [None]:
import pyLDAvis.gensim
import pyLDAvis

pyLDAvis.enable_notebook()

In [64]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
vis

## 3 - Get the topic distribution for each cluster

In [79]:
# Get the topic distribution for each document 
def get_avg_topic_distribution(cluster_data, topic_distribution, num_topics):
    cluster_topics = np.zeros((len(cluster_data.index), num_topics))

    # Update the distribution with the actual values
    for i, doc_index in enumerate(cluster_data.index.tolist()):
        for topic, prob in topic_distribution[doc_index]:
            cluster_topics[i, topic] = prob

    avg_topic_dist = np.mean(cluster_topics, axis=0)
    return avg_topic_dist


topic_distribution = lda_model.get_document_topics(corpus, minimum_probability=0.0)
clusters = train_df.groupby('cluster').ProcessedText

cluster_topics = []
for cluster, data in clusters:
    avg_topic_dist = get_avg_topic_distribution(data, topic_distribution, lda_model.num_topics)
    cluster_topics.append([avg_topic_dist.tolist()])

cluster_topics_df = pd.DataFrame(cluster_topics, columns=['avg_topic_distribution'])

cluster_topics_df

Unnamed: 0,avg_topic_distribution
0,"[0.04351350657834368, 0.08953896260369705, 0.0..."
1,"[0.5727948899534653, 0.036105655229040134, 0.0..."
2,"[0.23451522664035143, 0.0441779699728237, 0.01..."
3,"[0.049951425122777816, 0.032879602266279916, 0..."
4,"[0.0592445444466012, 0.11645161067399888, 0.07..."
5,"[0.08448538153522785, 0.1287194910859702, 0.13..."
6,"[0.09001653952557288, 0.0805647702011459, 0.08..."
7,"[0.04975695171163949, 0.0958670370226721, 0.07..."
8,"[0.274437224878984, 0.041424076669917445, 0.01..."
9,"[0.10058591111697571, 0.0794785032956668, 0.10..."


### 3.2 - Visualize the distribution of topics within each cluster

In [80]:
import altair as alt

In [105]:
# Reshape the dataframe to have a row for each cluster and topic
cluster_topics_heatmap_df = cluster_topics_df.avg_topic_distribution.apply(pd.Series).reset_index().rename(columns={'index': 'cluster'}).melt(id_vars='cluster', var_name='topic', value_name='probability')

heatmap = alt.Chart(cluster_topics_heatmap_df).mark_rect().encode(
    x='topic:O',
    y='cluster:O',
    color='probability:Q'
).properties(
    title='Average Topic Distribution for Each Cluster',
    width=400,
    height=400
)

heatmap