In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from tqdm import tqdm
from openai import OpenAI
import re

In [2]:
def get_openai_embedding(text, model="text-embedding-ada-002"):
    client = OpenAI()
    
    response = client.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

In [3]:
model = SentenceTransformer('all-MiniLM-L12-v2')



In [4]:
df = pd.read_excel("CDR_Scale_GroundTruth_v1.xlsx", sheet_name="Sheet1")
df

Unnamed: 0,characteristic,none,questionable,mild,moderate,severe
0,memeory,No memory loss or slight; inconsistent forgetf...,Consistent slight forgetfulness; partial recol...,Moderate memory loss: more marked for recent e...,"Severe memory loss, only highly learned materi...","Severe memory loss, only fragments remain"
1,orientation,Fully oriented,Fully oriented but with slight difficulty with...,Moderate difficulty with time relationships; o...,Severe difficulty with time relationships; usu...,Oriented to person only
2,judgment_ps,Solves everyday problems and handles business ...,"Slight impairment in solving problems, similar...","Moderate difficulty in handling problems, simi...","Severely impaired in handling problems, simila...",Unable to make judgments or solve problems
3,community_affairs,"Independent function as usual in job, shopping...",Slight impairment in these activities,Unable to function independently at these acti...,No pretense of independent function outside th...,Appears too ill to be taken to functions outsi...
4,home_hobbies,"Life at home, hobbies and intellectual interes...","Life at home, hobbies and intellectual interes...",Mild but definite impairment of functions at h...,Only simple chores preserved; very restricted ...,No significant function in the home
5,personal_care,Fully capable of self-care,,Needs prompting,"Requires assistance in dressing, hygiene and k...",Requires much help with personal care; frequen...


In [5]:
def clean_text(text):
    text = text.strip()
    # Remove any log-like patterns within the text
    text = re.sub(r'\d{2}-\w{3}-\d{4} \d{2}:\d{2}', '', text)
    text = re.sub(r'\(.*?\):', '', text)
    text = re.sub(r'(PCP asked question|PCP Closed Request|Specialist requested referral|PCP will make referral):', '', text)
    text = re.sub(r'\n+', '\n', text).strip()
    
    remove_words = ['log', 'summary', 'entry']
    
    # List of months to be removed
    months = [
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec'
    ]
    
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation+string.digits))
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.update(remove_words)
    stop_words.update(months)
    
    cleaned_text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return cleaned_text

In [20]:
df_dementia = pd.read_excel("FinalDementia2021Taxonomy_4DEC2023.xlsx", sheet_name="Data")
df_non_dementia = pd.read_excel("Urban_NonLTC_Controlx2_15DEC2023.xlsx", sheet_name="100 Urban Non LTC Control")

df_dementia['Log Details'] = df_dementia['Log Details'].apply(clean_text)
df_non_dementia['Log Detail'] = df_non_dementia['Log Detail'].apply(clean_text)

dementia_data = df_dementia['Log Details'].values
non_dementia_data = df_non_dementia['Log Detail'].values

log_sentences = np.concatenate((dementia_data, non_dementia_data))

In [21]:
# df_data = pd.read_excel("FinalDementia_11APR2023_v2.xlsx", sheet_name="Final")

# pcp_pattern = re.compile(r'\(PCP Entry\)(.*?)(?=\(\w+ Entry\)|$)', re.DOTALL)
# specialist_pattern = re.compile(r'\(Specialist Entry\)(.*?)(?=\(\w+ Entry\)|$)', re.DOTALL)

# df_data['Log Details'] = df_data['Log Details'].apply(clean_text)

# log_sentences = df_data['Log Details'].values

# df_data.head(1)

In [22]:
embeded_groundtruth_df = df.copy()
for key in list(df)[1:]:
    embeddings = model.encode(df[key])
    embeded_groundtruth_df[key] = list(embeddings)
#     embeddings = [get_openai_embedding(sentence) for sentence in tqdm(list(df[key]))]
#     embeded_groundtruth_df[key] = embeddings

In [23]:
log_embedded = []
for log in tqdm(log_sentences):
    log_embedded.append(model.encode(log))

100%|████████████████████████████████████████████████████████████████████████████████| 399/399 [01:13<00:00,  5.43it/s]


In [24]:
# log_embedded = [get_openai_embedding(sentence) for sentence in tqdm(log_sentences)]
# log_embedded = np.array(log_embedded)

In [25]:
np.array(log_embedded).shape

(399, 384)

In [26]:
dfs = []
for i in range(len(embeded_groundtruth_df)):
    similarity_matrix = cosine_similarity(log_embedded, list(embeded_groundtruth_df.iloc[i].values[1:]))
    similarity_df = pd.DataFrame(similarity_matrix, columns=list(embeded_groundtruth_df)[1:])
    similarity_df.name = embeded_groundtruth_df['characteristic'][i]
    dfs.append(similarity_df)

In [27]:
similarity_dict = {dfs[i].name: dfs[i] for i in range(len(dfs))}

In [28]:
similarity_dict['memeory']

Unnamed: 0,none,questionable,mild,moderate,severe
0,0.231692,0.232067,0.267280,0.208380,0.268492
1,0.310054,0.323458,0.365298,0.267093,0.311592
2,0.206398,0.228724,0.292239,0.238650,0.245839
3,0.223662,0.253085,0.255795,0.253555,0.239205
4,0.306237,0.332709,0.322311,0.248359,0.286268
...,...,...,...,...,...
394,0.133109,0.214480,0.157127,0.111421,0.184328
395,0.142054,0.160120,0.129030,0.115139,0.145894
396,0.185684,0.225664,0.206808,0.136151,0.161459
397,0.070677,0.151810,0.171770,0.101820,0.088843


In [29]:
similarity_dict['judgment_ps']

Unnamed: 0,none,questionable,mild,moderate,severe
0,0.119027,0.219856,0.108487,0.182466,0.158091
1,0.071262,0.272307,0.169302,0.240618,0.205377
2,0.076137,0.198419,0.142446,0.255068,0.171202
3,0.108403,0.205392,0.151611,0.215074,0.179643
4,0.189856,0.360799,0.276058,0.355393,0.327048
...,...,...,...,...,...
394,0.046303,0.189245,0.116484,0.177373,0.220608
395,0.051211,0.146405,0.153542,0.173394,0.200704
396,0.135055,0.260636,0.176980,0.209392,0.240542
397,0.191370,0.184586,0.160696,0.195603,0.197257


In [30]:
s = similarity_dict['memeory']
for d in list(similarity_dict.keys())[1:]:
    s += similarity_dict[d]
s

Unnamed: 0,none,questionable,mild,moderate,severe
0,1.000300,1.193995,1.363698,1.301001,1.538752
1,0.871490,1.355378,1.545500,1.334830,1.475806
2,0.864492,1.057894,1.384265,1.211607,1.315381
3,0.888277,1.042246,1.329002,1.268276,1.387553
4,1.135773,1.653487,1.897804,1.698422,1.762238
...,...,...,...,...,...
394,0.407657,0.760036,1.059067,0.892166,1.288739
395,0.483556,0.771975,1.014699,0.726466,1.070336
396,0.930603,1.213085,1.305958,0.970057,1.400593
397,0.907356,0.865870,1.347426,1.226587,1.276247


In [19]:
# df_transformed = s.copy()
# max_values = df_transformed.max(axis=1).values.reshape(-1, 1)
# df_transformed = (df_transformed.values == max_values).astype(int)
# df_transformed = pd.DataFrame(df_transformed, index=s.index, columns=s.columns)
# df_transformed

In [26]:
s['severe'].to_csv("max_sim_df.csv")

In [27]:
s['none'].to_csv("max_sim_df_none.csv")