In [1]:
"""
W2V VSM Full Pipeline
Testing pipeline that generates vector embeddings & cosine similarities for radiation oncology clinical practice documents 
Created on Thursday Apr 5 19:17:45 2025
@authors: Samuel Luk, Shaotai Hu
""";

In [1]:
### packages & custom functions ###
import os, random, re, math, pandas as pd, numpy as np, matplotlib.pyplot as plt, gensim, warnings
from gensim.models import Word2Vec

from sim_functions_final import cos_sim, eud_dis, man_dis
from vsm_functions_final import shuffle, build_corpus, tokenize, w2v_train, bal_ran_subsamp, total_words, vector_append, vector_gen, skl_tfidf

# Silence every warning category for the whole session. This also hides
# deprecation notices raised by the imported libraries, so comment it out
# when debugging unexpected results.
warnings.filterwarnings("ignore")
# Print numpy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
### load data ###
output_dir = "###"
path_1, path_2 = "###", "###"

# Read both exports, then replace every space in "column" with a hyphen;
# the pipeline later looks each label up as a single vocabulary term.
data1_f, data2_f = (pd.read_csv(csv_path) for csv_path in (path_1, path_2))
for frame in (data1_f, data2_f):
    frame["column"] = frame["column"].str.replace(" ", "-", regex=False)

# Distinct labels found in "column" across both files.
temp_full_data = pd.concat([data1_f, data2_f], ignore_index=True)
column_subjects = list(temp_full_data["column"].unique())

In [3]:
### testing weight & aggregation functions ###
# Takes tokenized corpus
def count_term(data, node, term):
    """Return how many entries of ``data[node]`` compare equal to ``term``.

    Parameters
    ----------
    data : mapping or DataFrame
        Container indexed by ``node``; ``data[node]`` must be iterable.
    node : hashable
        Key / column whose entries are scanned.
    term : object
        Value every entry is compared against with ``==``.

    Returns
    -------
    int
        Number of matching entries (0 when there are none).
    """
    return sum(1 for entry in data[node] if entry == term)

# aggregated cos_similarity for a subject in column
def aggregate_loc(data, w2v_results, nodes=None):
    """Aggregate per-term scores into one frequency-weighted score per column.

    For every column in ``nodes`` each distinct value (compared as a string) is
    weighted by its relative frequency ``count / len(data)`` and multiplied by
    its score in ``w2v_results``; the weighted scores are summed per column and
    the column totals are then averaged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to aggregate. It is not modified.
    w2v_results : mapping
        Maps a term (string) to its numeric score.
    nodes : iterable of column names, optional
        Columns to aggregate. When omitted, the module-level ``nodes_in`` is
        used with its first entry skipped (the historical behaviour).

    Returns
    -------
    tuple
        ``(final_cs, weighted_nodes)`` where ``weighted_nodes`` maps each column
        to its weighted score and ``final_cs`` is the mean of those scores.

    Raises
    ------
    KeyError
        If a term occurring in ``data`` has no entry in ``w2v_results``.
    """
    if nodes is None:
        nodes = nodes_in[1:]
    nodes = list(nodes)

    n_rows = len(data)
    weighted_nodes = {}
    for node in nodes:
        # Stringify a local copy so the caller's frame keeps its dtypes.
        terms = data[node].astype(str)
        # One pass over the column instead of one full scan per distinct term.
        counts = terms.value_counts(sort=False)
        node_score = 0
        for term, count in counts.items():
            if term not in w2v_results:
                raise KeyError(
                    f"no similarity available for term {term!r} in column {node!r}"
                )
            node_score += (count / n_rows) * w2v_results[term]
        weighted_nodes[node] = node_score

    final_cs = np.sum(list(weighted_nodes.values())) / len(nodes)
    return final_cs, weighted_nodes

In [None]:
### all_loc pipeline ###
# For every subject, a reference Word2Vec model trained on the tokens of both
# data sets fixes the vocabulary. One model per data set is then re-trained
# n_runs times on shuffled tokens, and the two embeddings of every vocabulary
# term are compared with cosine similarity, Euclidean and Manhattan distance.

n_runs = 10  # shuffled re-trainings per subject

# hyperparameters are randomized for confidentiality
w2v_params = dict(workers = 1000,
                  seed = 12345,
                  vector_size = 1000,
                  window = 1000,
                  min_count = 1000,
                  epochs = 1000,
                  sg = 0,
                  cbow_mean = 1)

# per-subject result frames; concatenated once after the loop
subject_frames = []
# 0 is model loc cs and 1 is weighted cs
weighted_df = pd.DataFrame(columns = column_subjects, index = [0, 1])

for subject in column_subjects:

    data1 = data1_f[data1_f["column"] == subject]
    data2 = data2_f[data2_f["column"] == subject]

    # used in weight calculation
    all_data = pd.concat([data1, data2], ignore_index=True)
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use whichever exists
    elementwise = all_data.map if hasattr(all_data, "map") else all_data.applymap
    all_lower = elementwise(lambda x: x.lower() if isinstance(x, str) else x)
    all_lower["subject1"] = "###_" + all_lower["subject1"].astype(str)
    #...

    toks1 = tokenize(build_corpus(data1, nodes_in))
    toks2 = tokenize(build_corpus(data2, nodes_in))
    toks_base12 = toks1+toks2

    # reference model on the combined tokens; only its vocabulary is used below
    w2v_mall = Word2Vec(workers = 1, seed = 12345, min_count = 1000, vector_size = 1000)
    w2v_corall = w2v_train(w2v_mall, toks_base12)
    vocab = w2v_corall.wv.index_to_key
    cbw_results = {term: [] for term in vocab}
    eud_results = {term: [] for term in vocab}
    man_results = {term: [] for term in vocab}

    for run in range(n_runs):
        w2v_m1 = Word2Vec(**w2v_params)
        w2v_m2 = Word2Vec(**w2v_params)

        #to account for known w2v output randomness
        w2v_cor1 = w2v_train(w2v_m1, shuffle(toks1))
        w2v_cor2 = w2v_train(w2v_m2, shuffle(toks2))

        mutual_ts = set(w2v_cor1.wv.index_to_key) & set(w2v_cor2.wv.index_to_key)
        # stores the iterations of cos_sim models
        for term in vocab:
            if term in mutual_ts:
                vec1, vec2 = w2v_cor1.wv[term], w2v_cor2.wv[term]
                cbw_results[term].append(cos_sim(vec1, vec2))
                eud_results[term].append(eud_dis(vec1, vec2))
                man_results[term].append(man_dis(vec1, vec2))
            else:
                # NOTE(review): a term missing from either vocabulary is scored 0 for
                # every measure. For the two distances 0 reads as "identical" and for
                # cosine it pulls the mean down - confirm this fill value is intended.
                cbw_results[term].append(0)
                eud_results[term].append(0)
                man_results[term].append(0)

    ### weights ###
    mean_results = {key: np.mean(value) for key, value in cbw_results.items()}
    # the distance means are read by the result summary cell further down
    eud_mean = {key: np.mean(value) for key, value in eud_results.items()}
    man_mean = {key: np.mean(value) for key, value in man_results.items()}

    single_cs = np.round(float(mean_results[subject.lower()]), 4)
    weighted_cs = np.round(aggregate_loc(all_lower, mean_results)[0], 4)

    weighted_df[subject] = [single_cs, weighted_cs]

    # rename the subject loc column to cbw_loc to represent the single w2v value
    cbw_results["cbw_loc"] = cbw_results.pop(subject.lower())

    # turns into df and adds the subject loc in every row
    cbwdf = pd.DataFrame(cbw_results)
    cbwdf.insert(0, "subject", f"{subject}")
    subject_frames.append(cbwdf)

# builds the overall df in one concat (an empty frame when there were no subjects)
df_all = pd.concat(subject_frames, ignore_index = True) if subject_frames else pd.DataFrame()

# moves the index of cbw_loc to 1
cols = df_all.columns.tolist()
if "cbw_loc" in cols:
    cols.insert(1, cols.pop(cols.index("cbw_loc")))
df_final = df_all[cols]

In [7]:
### test distribution saving as csv ###
# Despite its name, output_folder holds the path of the CSV file itself.
output_folder = os.path.join(output_dir, "distribution.csv") # update
# Values are rounded to six decimals before writing; the row index is kept.
test_df = df_final.round(decimals=6)
test_df.to_csv(path_or_buf=output_folder, index=True)

In [18]:
### Result Summary Table with TF and TF-IDF ###
# Rows are the vector space models, columns the similarity / distance measures.
# This cell reads data1, data2, nodes_in, corpora, corpus1, mean_results,
# eud_mean and man_mean from the kernel; it does not create any of them.

vsms = ["TF", "sklearn_TF-IDF", "w2v"]
sim_funcs = ["cos_sim", "eud_dis", "man_dis"]

df = pd.DataFrame(index = vsms, columns = sim_funcs)

# Build each pair of document vectors once instead of once per measure.
# NOTE(review): assumes vector_gen and skl_tfidf are deterministic for fixed inputs.
tf_vecs = vector_gen(data1, data2, nodes_in)
tfidf_vecs = skl_tfidf(corpora, corpus1)

for measure_name, measure in zip(sim_funcs, (cos_sim, eud_dis, man_dis)):
    # .loc writes into df itself; chained df[col][row] = ... may assign to a
    # temporary copy and leave df unchanged.
    df.loc["TF", measure_name] = np.round(measure(tf_vecs[0], tf_vecs[1]), 4)
    df.loc["sklearn_TF-IDF", measure_name] = np.round(measure(tfidf_vecs[0], tfidf_vecs[1]), 4)

df.loc["w2v", "cos_sim"] = mean_results["subject"]
df.loc["w2v", "eud_dis"] = eud_mean["subject"]
df.loc["w2v", "man_dis"] = man_mean["subject"]

df

Unnamed: 0,cos_sim,eud_dis,man_dis
TF,0.9267,0.3829,1.704
sklearn_TF-IDF,0.8441,0.5585,2.893
w2v,0.899321,0.445571,4.389835


In [None]:
### dictionary plotting functions ###
def plotting_dict_line(dict):
    """Draw one line chart per dictionary entry, stacked vertically.

    Parameters
    ----------
    dict : mapping
        Maps a label (interpolated into the subplot title) to a sequence of
        values plotted against their position. The parameter name shadows the
        builtin and is kept only so existing calls keep working.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. An empty mapping draws nothing.
    """
    series_by_key = dict
    n_plots = len(series_by_key)
    if n_plots == 0:
        # plt.subplots rejects zero rows, so there is nothing to draw.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single entry;
    # otherwise a lone Axes object is returned and cannot be indexed.
    fig, axes = plt.subplots(n_plots, 1, figsize=(10, n_plots * 5), squeeze=False)
    for ax, (key, values) in zip(axes[:, 0], series_by_key.items()):
        ax.plot(values, marker='o', linestyle='-', color='purple')
        ax.set_title(f"subject cos_sims at {key} words in corpus")
        ax.set_xlabel("permutation count")
        ax.set_ylabel("cos_sims")
        # reference lines through the origin
        ax.axhline(0, color='black',linewidth=1)
        ax.axvline(0, color='black',linewidth=1)
    plt.tight_layout()
    plt.show()

def plotting_dict_bar(dict):
    """Draw one bar chart per dictionary entry, stacked vertically.

    Parameters
    ----------
    dict : mapping
        Maps a label (interpolated into the subplot title) to a sequence of
        values; each value becomes one bar at its position. The parameter name
        shadows the builtin and is kept only so existing calls keep working.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. An empty mapping draws nothing.
    """
    series_by_key = dict
    n_plots = len(series_by_key)
    if n_plots == 0:
        # plt.subplots rejects zero rows, so there is nothing to draw.
        return

    # squeeze=False keeps `axes` two-dimensional even for a single entry;
    # otherwise a lone Axes object is returned and cannot be indexed.
    fig, axes = plt.subplots(n_plots, 1, figsize=(10, n_plots * 5), squeeze=False)
    for ax, (key, values) in zip(axes[:, 0], series_by_key.items()):
        ax.bar(range(len(values)), values, color='black')
        ax.set_title(f"cos_sims values at permutation count: {key}")
        ax.set_xlabel("index")
        ax.set_ylabel("cos_sim")
        ax.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

#plotting_dict_line(cos_sims)