In [None]:
import os, sys
import warnings

# Silence every warning unless the interpreter was started with explicit -W options.
# NOTE(review): this also hides pandas warnings raised by the data-wrangling cells below.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

import pandas as pd
from comorbidipy import comorbidity

from gensim.models.keyedvectors import KeyedVectors
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [None]:
# Suffix appended to the text, TF-IDF and embedding file names used in this notebook.
cutoff = ' (cutoff 5000)'

In [None]:
# Load the raw per-document text and relabel the 'Unnamed: 0' column as PatientID.
raw_text_path = 'Unprocessed Text (120)' + cutoff
text = pd.read_csv(raw_text_path, header=0).rename(columns={'Unnamed: 0': 'PatientID'})

# Character count of every document.
text['length'] = text['text'].map(len)
text


In [None]:
# Longest document in the corpus, in characters.
z = text['length'].max()
nlp = spacy.load('en_core_web_md')
# Set spaCy's text-length limit to the longest document in the corpus.
# NOTE(review): this also lowers the limit when z is below spaCy's default — confirm that is acceptable.
nlp.max_length = z

def initial_clean(text):
    """Normalise one document into a space-separated string of lemmas.

    Steps, in order: tokenise with spaCy and lower-case every token, drop
    spaCy's English stop words, drop every token that is not purely
    alphabetic (punctuation, numbers and mixed tokens), then lemmatise what
    is left.

    Relies on the module-level ``nlp`` pipeline.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Lemmas of the surviving tokens, joined by single spaces.
    """
    stop_words = spacy.lang.en.stop_words.STOP_WORDS

    # Only token boundaries are needed for this pass, so run the tokenizer
    # alone rather than the whole pipeline.
    # NOTE(review): assumes no pipeline component re-tokenises the doc — confirm
    # if custom components are ever added to `nlp`.
    lowered = ' '.join(token.text.lower() for token in nlp.make_doc(text))

    # split() discards whitespace-only tokens; the remaining tokens are then
    # filtered against the stop list and the alphabetic-only rule.
    kept = [
        token for token in lowered.split()
        if token not in stop_words and token.isalpha()
    ]

    # Lemmas come from pipeline components, so this pass runs the full pipeline.
    return ' '.join(token.lemma_ for token in nlp(' '.join(kept)))

# Overwrite the text column with its cleaned form, then persist the frame.
# NOTE(review): 'length' is not recomputed before this save, so the CSV carries the
# lengths measured before cleaning — confirm that is intended.
text['text']= text['text'].apply(initial_clean)
text.to_csv('Processed Text (120)'+ cutoff)


In [None]:
# Refresh the character counts from the current contents of the text column.
text['length'] = text['text'].apply(len)
text

In [None]:
# Fit TF-IDF on the documents; min_df=2 drops terms that appear in fewer than two documents.
vectorizer = TfidfVectorizer(min_df=2)
X = vectorizer.fit_transform(text['text'])
X.shape

In [None]:
# Dense documents-by-terms TF-IDF table with one column per vocabulary term, saved to CSV.
tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.to_csv('TFIDF (120)' + cutoff)

def get_top_words(document_index, n):
    """Return the ``n`` highest-scoring TF-IDF terms of one document.

    Reads the module-level ``tfidf_df`` (rows are documents, columns are
    terms).

    Parameters
    ----------
    document_index : int
        Positional row index into ``tfidf_df``.
    n : int
        Number of terms to return. Zero or a negative value yields an empty
        list; a value above the vocabulary size yields every term.

    Returns
    -------
    list of str
        Terms ordered from highest to lowest score.

    Raises
    ------
    IndexError
        If ``document_index`` is out of range for ``tfidf_df``.
    """
    if n <= 0:
        # A bare [-n:] slice with n == 0 would select the whole vocabulary.
        return []
    # Sort a plain array so the result is positional indices, not a labelled Series.
    doc_tfidf_scores = tfidf_df.iloc[document_index].to_numpy()
    top_indices = np.argsort(doc_tfidf_scores)[-n:][::-1]
    # The column labels are the feature names and are aligned with the scores,
    # so the vocabulary does not have to be rebuilt for every returned word.
    return tfidf_df.columns[top_indices].tolist()

In [None]:
# Spot check: the ten highest-weighted terms of the document at row position 100.
get_top_words(100,10)

In [None]:
# Load the aggregate table, using the first CSV column as the index.
ISA = pd.read_csv('Included Aggregate Data (120)',index_col=0)
ISA

In [None]:
# Arterial sites grouped by the coarser territory label they collapse into.
sites_by_territory = {
    'MCA': ['M1', 'M2', 'M3', 'M4'],
    'PCA': ['P1', 'P2', 'P3'],
    'ICA': [
        'C-ICA', 'T-ICA', 'pre-terminal I-ICA',
        'ICA-M1 tandem', 'ICA-M2 tandem', 'ICA->=M3 tandem',
    ],
    'ACA': ['A1', 'A2'],
    'Basilar/top vertebral': ['top-basilar', 'VA'],
}

# Invert the grouping into the site -> territory lookup consumed by Series.map.
stroke_mapping = {
    site: territory
    for territory, sites in sites_by_territory.items()
    for site in sites
}

# Sites that are missing from the lookup come out as NaN.
ISA['Occlusion Location'] = ISA['Primary Arterial Site'].map(stroke_mapping)
ISA

In [None]:
# Demographics and medical-history spreadsheets; the first row of each sheet is the header.
Demographics = pd.read_excel('CTP_demographics_first_151_patients_3-14-23.xlsx',header=0)
History = pd.read_excel('CTP_patient_med_hx_w_UserEnteredDates_first_151_patients_3-28-23.xlsx',header=0)

In [None]:
# One-hot encode the categorical demographic fields.
D = pd.get_dummies(
    Demographics,
    columns=['Sex', 'PreferredLanguage', 'English_NonEnglish', 'FirstRace', 'SecondRace', 'PostalCode'],
)
D['BirthDate'] = pd.to_datetime(D['BirthDate'])
# MRNs are compared as strings from here on.
# NOTE(review): assumes ISA['PatientID'] is already string-typed — confirm.
D['PrimaryMrn'] = D['PrimaryMrn'].apply(str)
D = D[D['PrimaryMrn'].isin(ISA['PatientID'])]

# Inner join: only patients present in both tables survive.
M2 = pd.merge(ISA, D, left_on='PatientID', right_on='PrimaryMrn').drop(['PrimaryMrn'], axis=1)

# Rows whose date key is -1 are removed before the key is parsed as %Y%m%d.
# .copy() gives H its own data, so the column assignments below are not
# chained assignments on a slice of History.
H = History[History['UserEnteredDateKey'] != -1].copy()
H['Date'] = pd.to_datetime(H['UserEnteredDateKey'].apply(str), format='%Y%m%d')
H['PrimaryMrn'] = H['PrimaryMrn'].apply(str)


In [None]:
# Columns of H that identify the patient or date the entry rather than hold a
# diagnosis code. Without this exclusion the MRN, the date key and the parsed
# date of every history row were emitted as "codes".
# NOTE(review): any other non-code column of History should be listed here too.
non_code_columns = ['PrimaryMrn', 'UserEnteredDateKey', 'Date']

# One (patient, scan time) pair per distinct scan.
scan_times = M2[['PatientID', 'ScanTime']].drop_duplicates()

IDs = []
codes = []
for patient_mrn, patient_scan in zip(scan_times['PatientID'].values, scan_times['ScanTime'].values):
    scan_timestamp = pd.to_datetime(patient_scan)
    patient_rows = H[H['PrimaryMrn'] == patient_mrn]
    # Keep only history dated strictly before the scan.
    good_rows = patient_rows[patient_rows['Date'] < scan_timestamp]
    code_rows = good_rows.drop(columns=non_code_columns, errors='ignore')
    for _, row in code_rows.iterrows():
        # Every non-null cell left in the row becomes one (id, code) record.
        for code in row[pd.notna(row)].values:
            IDs.append(patient_mrn)
            codes.append(str(code))

# Long-format (id, code) table handed to comorbidity().
ICD = pd.DataFrame({'id': IDs, 'code': codes})
COMO = comorbidity(df=ICD, age=None, variant='quan', weighting='vw', score='elixhauser')
COMO

In [None]:
# Persist the comorbidity table.
# NOTE(review): this file name says (80) while the other outputs say (120) — confirm the label.
COMO.to_csv('Comorbidities (80)')

In [None]:
# Attach each patient's comorbidity score; the left join keeps patients that
# have no row in COMO (their score is NaN).
F = pd.merge(
    M2,
    COMO[['comorbidity_score', 'id']],
    how='left',
    left_on='PatientID',
    right_on='id',
).drop(['id'], axis=1)

# Unit lengths used to turn timedeltas into floats. ONE_YEAR equals the
# 3.154e16 ns divisor used previously, so valid rows keep the same values.
ONE_YEAR = pd.Timedelta(seconds=31_540_000)
ONE_HOUR = pd.Timedelta(hours=1)

# Dividing by a Timedelta replaces the cast to integer nanoseconds: a missing
# timestamp now yields NaN instead of breaking the integer conversion.
F['AgeAsofToday'] = (F['ScanTime'].apply(pd.to_datetime) - F['BirthDate']) / ONE_YEAR
F['LKWDiff'] = F['LKWDiff'].apply(pd.to_timedelta) / ONE_HOUR

# The age is measured at scan time, so rename it accordingly and drop the
# columns that are no longer needed.
F = F.rename(columns={'AgeAsofToday': 'AgeAsofScan'}).drop(['ScanTime', 'BirthDate', 'IsCurrent'], axis=1)

In [None]:
# Display the merged table.
F

In [None]:
# Persist the merged table (aggregate data plus demographics and comorbidity score).
F.to_csv('IAD With Hist and Demos (120)')

In [None]:
# Display the TF-IDF table.
tfidf_df

In [None]:
# Location of the saved KeyedVectors. Set the WORD2VEC_MODEL_PATH environment
# variable to run this notebook on another machine; the fallback is the
# path that was previously hard-coded.
word2vec_path = os.environ.get(
    'WORD2VEC_MODEL_PATH',
    '/Users/shaunkohli/Desktop/Kummer Project/word2vec_model.kv',
)
# mmap='r' asks gensim to memory-map the stored arrays read-only.
word2vec_model = KeyedVectors.load(word2vec_path, mmap='r')

In [None]:
# One embedding row per TF-IDF vocabulary term, in column order. The width is
# read from the model rather than hard-coded as 200.
embedding_dim = word2vec_model.vector_size
word_Vecs = np.zeros([tfidf_df.shape[1], embedding_dim])
for i, word in enumerate(tfidf_df.columns):
    # Terms unknown to the model keep the all-zero row they were initialised with.
    if word in word2vec_model.key_to_index:
        word_Vecs[i] = word2vec_model[word]


In [None]:
# Plain ndarray copy of the TF-IDF table (documents x terms).
tfidf_array = np.array(tfidf_df)


In [None]:
# TF-IDF-weighted sum of word vectors per document: (docs x terms) @ (terms x dims).
weighted_vecs = tfidf_array @ word_Vecs
weighted_vecs.shape

In [None]:
# Wrap the document vectors in a DataFrame (default integer row and column labels) and save them.
# NOTE(review): this rebinds weighted_vecs from an ndarray to a DataFrame.
weighted_vecs = pd.DataFrame(weighted_vecs)
weighted_vecs.to_csv('BioWord2Vecs (120)' +cutoff)