In [1]:
"""
VSM Output Catcher & Reconstruction Code
Created on Tuesday May 20 21:33:14 2025
@authors: Samuel luk, Shaotai Hu
For data reconstruction & processing and analysis of packaged_final_w2v (differential privacy applied) outputs
""";
 
import os, random, re, math, pickle, pandas as pd, numpy as np, matplotlib.pyplot as plt, gensim, warnings
from gensim.models import Word2Vec

from sim_functions_final import cos_sim, eud_dis, man_dis
from vsm_functions_final import shuffle, build_corpus, tokenize, w2v_train, bal_ran_subsamp, total_words, vector_append, vector_gen, skl_tfidf

warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)

In [None]:
### Cossim catcher & Bridge code ###

# Each input is a pandas DataFrame that was saved as a pickle file.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so these
# paths should only ever point at files from a trusted source.
output_catch1 = "###.pkl"
output_catch2 = "###.pkl"

with open(output_catch1, mode="rb") as f:
    loaded_df1 = pickle.load(f)
with open(output_catch2, mode="rb") as f2:
    loaded_df2 = pickle.load(f2)

# Keep only the columns present in both frames, in sorted order.
shared_labels = set(loaded_df1.columns).intersection(loaded_df2.columns)
mutual_cols = np.sort(list(shared_labels))
df1_new = loaded_df1[mutual_cols]
df2_new = loaded_df2[mutual_cols]

# Rebuild the cosine similarities cell by cell. Cells are paired by position, so the
# two frames are expected to list their rows in the same order.
#   * ndarray in both frames -> cos_sim of the two arrays
#   * string in the first    -> copied through unchanged
#   * anything else          -> NaN
final_df = pd.DataFrame(columns=df1_new.columns, index=df1_new.index)
n_rows, n_cols = len(final_df.index), len(final_df.columns)
for row_pos in range(n_rows):
    for col_pos in range(n_cols):
        left = df1_new.iloc[row_pos, col_pos]
        if isinstance(left, str):
            cell = left
        elif isinstance(left, np.ndarray):
            # The second frame is only consulted when the first holds an array.
            right = df2_new.iloc[row_pos, col_pos]
            cell = cos_sim(left, right) if isinstance(right, np.ndarray) else np.nan
        else:
            cell = np.nan
        final_df.iloc[row_pos, col_pos] = cell

# Move the "subject" column to the front.
subject_col = final_df.pop("subject")
final_df.insert(0, "subject", subject_col)
all_subjects = final_df["subject"].unique()

# One frame per subject (location), listed in the same order as all_subjects.
grouped = {name: frame for name, frame in final_df.groupby("subject")}
loc_dfs = [grouped[name] for name in all_subjects]
all_subjects

In [None]:
# Build one frame per location: the "cbw_loc" column is renamed to the lower-cased
# subject name, the "subject" column is dropped, and the first two columns swap places.
# NOTE(review): if the lower-cased subject name matches an existing column label, the
# rename yields duplicate labels and the label-based reorder below will repeat those
# columns — confirm whether that can happen with the real data.
final_dfs = []
for loc_df in loc_dfs:
    subject = loc_df["subject"].iloc[0].lower()
    renamed = loc_df.rename(columns={"cbw_loc": subject}).drop(columns=["subject"])
    labels = renamed.columns.tolist()
    reordered = [labels[1], labels[0]] + labels[2:]
    final_dfs.append(renamed[reordered])

# final_dfs is a list of all the dataframes containing the calculated cossims
final_dfs[0]

In [4]:
# Weights catcher & Bridge code
# The frames in final_dfs are listed in the same order as the subjects in
# all_subjects, so weights can be matched to frames by position.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so these
# paths should only ever point at files from a trusted source.
output_weight1 = "###.pkl"
output_weight2 = "###.pkl"

with open(output_weight1, mode="rb") as w1:
    loaded_weights1 = pickle.load(w1)
with open(output_weight2, mode="rb") as w2:
    loaded_weights2 = pickle.load(w2)

# Merge the two loaded weight dictionaries (summing the entries they share) so the combined weights can be applied to the cosine similarities from above
def sum_nested_dicts(dict1, dict2):
    """Merge two (possibly nested) dictionaries, adding values stored under shared keys.

    Keys found in only one input are carried over unchanged (by reference, not
    copied). For a key found in both inputs, two dict values are merged recursively;
    any other pair of values is combined with ``+``.

    Args:
        dict1: First dictionary; its keys come first in the result.
        dict2: Second dictionary; keys missing from ``dict1`` are appended afterwards.

    Returns:
        A new dictionary holding the merged contents.
    """
    merged = {}
    for key, left in dict1.items():
        if key not in dict2:
            merged[key] = left
            continue
        right = dict2[key]
        if isinstance(left, dict) and isinstance(right, dict):
            merged[key] = sum_nested_dicts(left, right)
        else:
            merged[key] = left + right
    # Append whatever exists only in the second dictionary.
    merged.update((key, right) for key, right in dict2.items() if key not in dict1)
    return merged

all_weights = sum_nested_dicts(loaded_weights1, loaded_weights2)

# Apply the weights to every subject in final_dfs to generate weighted cos_sims.
# For each subject, the column means of its frame (NaN replaced by 0.0) are scaled by
# weight / total_per_node, summed per node, averaged over nodes and rounded to 4 places.
# NOTE(review): `nodes_in` is not defined in any cell visible here — it must already
# exist in the kernel before this cell runs; confirm where it comes from.
weighted_per_loc = {}
for idx, subject in enumerate(all_subjects):
    mean_cbw = {
        label: (value if not math.isnan(value) else 0.0)
        for label, value in final_dfs[idx].mean().to_dict().items()
    }
    weighted_nodes = {}
    for node in nodes_in:
        node_weights = all_weights[subject][node]
        contributions = [
            weight / all_weights[subject]["total_per_node"] * mean_cbw[element]
            for element, weight in node_weights.items()
            if element in mean_cbw
        ]
        # A node with no weight entries at all is left out of the average.
        if node_weights:
            weighted_nodes[node] = np.sum(contributions)
    weighted_per_loc[subject] = np.round(np.mean(list(weighted_nodes.values())), 4)
weighted_per_loc = {name.lower(): score for name, score in weighted_per_loc.items()}
weighted_per_loc

{'breast': 0.7245,
 'central-nervous-system-(cns)': 0.8671,
 'gastrointestinal-(gi)': 0.8619,
 'genitourinary-(gu)': 0.8462,
 'gynecologic-(gyn)': 0.8994,
 'head-and-neck': 0.7748,
 'hem-lymph': 0.6693,
 'lung': 0.9001,
 'metastasis': 0.7385,
 'musculoskeletal': 0.2091,
 'other': 0.8312,
 'sarcoma': 0.6295,
 'skin': 0.2363,
 'thoracic': 0.0205}

In [5]:
# Subject whose score is read from the first column by position instead of being
# looked up by its column label. The previous code singled it out with the hard-coded
# index 10, which is where 'other' sits in the recorded run's subject order; matching
# on the name keeps the special case attached to the right subject if that order changes.
# NOTE(review): presumably the label lookup is unreliable for this subject (e.g. a
# clash with an existing column label) — confirm the reason with the authors.
POSITIONAL_SUBJECT = "other"

# Unweighted score per subject: the mean of the subject's own column (NaN -> 0.0),
# rounded to 4 decimal places.
model_per_loc = {}
for i, raw_subject in enumerate(all_subjects):
    subject = raw_subject.lower()
    if subject != POSITIONAL_SUBJECT:
        mean_cbw = final_dfs[i].mean().to_dict()
        mean_cbw = {key: 0.0 if math.isnan(value) else value for key, value in mean_cbw.items()}
        model_per_loc[subject] = np.round(mean_cbw[subject], 4)
    else:
        # Positional read of the first column for the special-cased subject.
        model_per_loc[subject] = np.round(np.mean(final_dfs[i].iloc[:, 0]), 4)
model_per_loc

{'breast': 0.784,
 'central-nervous-system-(cns)': 0.9059,
 'gastrointestinal-(gi)': 0.8942,
 'genitourinary-(gu)': 0.887,
 'gynecologic-(gyn)': 0.9285,
 'head-and-neck': 0.9002,
 'hem-lymph': 0.8475,
 'lung': 0.9351,
 'metastasis': 0.7485,
 'musculoskeletal': 0.4981,
 'other': 0.9116,
 'sarcoma': 0.8844,
 'skin': 0.5439,
 'thoracic': 0.0214}