In [None]:
import os, sys
import warnings

# Suppress every warning category unless warning options were supplied to the
# interpreter (sys.warnoptions is then non-empty and is left in charge).
if len(sys.warnoptions) == 0:
    warnings.simplefilter(action="ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Load the main table; the first CSV column becomes the index and the first
# row supplies the column names. The bare name below displays the frame.
DATA = pd.read_csv(
    'IAD With Hist and Demos (120)',
    header=0,
    index_col=0,
)
DATA

In [None]:
# Print each column label of DATA on its own line.
for column_name in DATA.columns.tolist():
    print(column_name)

In [None]:
from scipy import stats

def numeric_data(data, alpha=0.05, ddof=0):
    """Print a Shapiro-Wilk normality check and matching summary statistics.

    Missing values are dropped first. If the Shapiro-Wilk p-value is greater
    than ``alpha`` the mean and standard deviation are printed; otherwise the
    median and the 25th/75th percentiles are printed.

    Parameters
    ----------
    data : array-like
        Numeric values (list, NumPy array, pandas Series, ...). Entries that
        ``pd.isna`` flags as missing are ignored.
    alpha : float, default 0.05
        Significance level for the Shapiro-Wilk test.
    ddof : int, default 0
        Delta degrees of freedom passed to ``np.std``. The default keeps the
        population standard deviation; pass ``ddof=1`` for the sample
        standard deviation.

    Returns
    -------
    None
        Results are printed only.
    """
    # Coerce to an ndarray so plain lists work, then drop missing entries.
    values = np.atleast_1d(np.asarray(data))
    values = values[~pd.isna(values)].astype(float)

    n_obs = values.size
    if n_obs == 0:
        print("No non-missing values to summarize")
        return

    if n_obs < 3:
        # Shapiro-Wilk is not defined for fewer than 3 observations, so fall
        # through to the distribution-free summary without testing.
        print(f"Shapiro-Wilk Test skipped: needs at least 3 values, got {n_obs}")
    else:
        stat, p_value = stats.shapiro(values)
        print('Shapiro-Wilk Test: Statistics=%.2f, p=%.2f' % (stat, p_value))

        if p_value > alpha:
            print("Data is normally distributed (fail to reject H0)")
            mean = np.mean(values)
            std_dev = np.std(values, ddof=ddof)
            print(f"Mean: {mean:.2f}, Standard Deviation: {std_dev:.2f}")
            return

        print("Data is not normally distributed (reject H0)")

    median = np.median(values)
    lqr = np.percentile(values, 25)
    hqr = np.percentile(values, 75)
    print(f"Median: {median:.2f}, Interquartile Range (IQR): [{lqr:.2f}-{hqr:.2f}]")



In [None]:
# Summarize the 'AgeAsofScan' column: Shapiro-Wilk check, then mean/SD or median/IQR.
numeric_data(DATA['AgeAsofScan'].values)


In [None]:
# For each sex / race column, print the column sum and that sum as a fraction
# of the rows in DATA. One loop replaces six copy-pasted cells; the printed
# lines are unchanged.
# NOTE(review): presumably these are 0/1 indicator columns, so the sum is a
# patient count - confirm against the data dictionary.
for var in (
    'Sex_Female',
    'Sex_Male',
    'SecondRace_Asian',
    'SecondRace_Black or African-American',
    'SecondRace_White',
    'SecondRace_Unknown/Not reported',
):
    print(var + f": {DATA[var].sum()} ({DATA[var].sum()/DATA.shape[0]:.3f})")

In [None]:
# Summarize the 'comorbidity_score' column (passed as a Series; missing values are dropped inside numeric_data).
numeric_data(DATA['comorbidity_score'])

In [None]:
# Load the comorbidity table; the first CSV column becomes the index and the
# first row supplies the column names. The bare name below displays the frame.
comorbidities = pd.read_csv(
    'Comorbidities (80)',
    header=0,
    index_col=0,
)
comorbidities

In [None]:
# For every column except 'id', print the column name, its sum, and the sum
# divided by the number of rows in the comorbidities table.
for column in comorbidities:
    if column != 'id':
        column_total = comorbidities[column].sum()
        print(column)
        print(column_total)
        print(column_total/comorbidities.shape[0])

In [None]:
# One-hot encode 'Occlusion Location'; get_dummies names the resulting columns
# 'Occlusion Location_<value>'. The bare name below displays the frame.
OS_DATA = pd.get_dummies(data=DATA.loc[:, ['Occlusion Location']])
OS_DATA

In [None]:
# Count and proportion of rows for each occlusion location, in the original
# reporting order. One loop replaces five copy-pasted cells.
for var in (
    'Occlusion Location_ICA',
    'Occlusion Location_ACA',
    'Occlusion Location_MCA',
    'Occlusion Location_Basilar/top vertebral',
    'Occlusion Location_PCA',
):
    # get_dummies only creates a column for values that occur in the data, so
    # a location with no patients is reported as 0 instead of raising KeyError.
    n_rows = OS_DATA[var].sum() if var in OS_DATA.columns else 0
    print(var + f": {n_rows} ({n_rows/OS_DATA.shape[0]:.3f})")

In [None]:
# Load NIHSS rows and keep only those whose MRN appears in DATA['PatientID'].
NIHSS = pd.read_csv('NIHSS data (127)',index_col=0, header=0)
NIHSS['PrimaryMrn'] = NIHSS['PrimaryMrn'].map(str)
# Compare text with text: PrimaryMrn was just cast to str, so cast PatientID as
# well. If read_csv parsed PatientID as a number, isin() would otherwise match
# nothing and silently produce an empty frame.
NIHSS = NIHSS[NIHSS['PrimaryMrn'].isin(DATA['PatientID'].map(str))]
NIHSS

In [None]:
# Summarize the 'FlowsheetNIHSS_Score' column of the filtered NIHSS table.
numeric_data(NIHSS['FlowsheetNIHSS_Score'].values)

In [None]:
# Summarize the 'LKWDiff' column of DATA.
# NOTE(review): units of LKWDiff are not visible in this notebook - confirm before reporting.
numeric_data(DATA['LKWDiff'].values)

In [None]:
# Load the aggregate table and compute FSDiff: minutes elapsed between
# FirstScanTime and ScanTime for each row.
IAD = pd.read_csv('Included Aggregate Data (120)')
IAD['FSDiff'] = pd.to_datetime(IAD['ScanTime']) - pd.to_datetime(IAD['FirstScanTime'])
# total_seconds()/60 gives exact minutes. It replaces casting the timedelta to
# an integer nanosecond count and scaling by a rounded constant, which depends
# on the platform's int width and cannot represent missing times (NaT); those
# now become NaN, which numeric_data drops.
IAD['FSDiff'] = IAD['FSDiff'].dt.total_seconds() / 60

In [None]:
# Summarize the 'FSDiff' column computed from ScanTime - FirstScanTime.
numeric_data(IAD['FSDiff'].values)

In [None]:
# Location of the clinical-notes workbook. Set the CTP_NOTES_PATH environment
# variable to run this notebook on another machine; the original absolute path
# is kept as the fallback so existing runs are unaffected.
NOTES_PATH = os.environ.get(
    'CTP_NOTES_PATH',
    '/Users/shaunkohli/Desktop/Kummer Project/Most Recent Note Model (9-28-23)/CTP_ClarityNotes_N=151_v3.xlsx',
)
Notes = pd.read_excel(NOTES_PATH)
# NOTE(review): the display below renders note text and MRNs into the saved
# notebook output - clear outputs before sharing.
Notes

In [None]:
# Count the scans in IAD that have at least one 'Brief Op Note' entered after
# the scan time, then print that count and its complement with proportions.

# Compare MRNs as text: PatientID may come out of read_csv as a number, in
# which case comparing it directly with text MRNs would never match.
note_mrns = Notes['PAT_MRN_ID'].astype(str)

count = 0
for patient_id, scantime in IAD[['PatientID','ScanTime']].values:
    # Parse explicitly instead of relying on string-to-datetime coercion
    # inside the comparison below.
    scantime = pd.to_datetime(scantime)
    # .copy() so the new column is not assigned onto a slice of Notes.
    sub_Notes = Notes[note_mrns == str(patient_id)].copy()
    sub_Notes['Entry DateTime'] = sub_Notes['ENTRY_INST_LOCAL_DTTM'].apply(pd.to_datetime)
    entered_after_scan = scantime < sub_Notes['Entry DateTime']
    is_brief_op_note = sub_Notes['CNCT_NOTE_TYPE_NAME'] == 'Brief Op Note'
    if (entered_after_scan & is_brief_op_note).any():
        count += 1

# The loop runs over IAD rows, so IAD's row count is the matching denominator.
total = IAD.shape[0]
print(count)
print(count/total)
print(total-count)
print((total-count)/total)

In [None]:
# Summarize the '>6s' column of DATA.
# NOTE(review): presumably a perfusion volume at this threshold - confirm meaning and units.
numeric_data(DATA['>6s'])

In [None]:
# Summarize the '<30%' column of DATA.
# NOTE(review): presumably a perfusion volume at this threshold - confirm meaning and units.
numeric_data(DATA['<30%'])

In [None]:
# Print the sum of the 'P:C(30)>=1.8' column and that sum as a fraction of DATA's rows.
# NOTE(review): presumably a 0/1 indicator, making the sum a patient count - confirm.
var = 'P:C(30)>=1.8'
print(var + f": {DATA[var].sum()} ({DATA[var].sum()/DATA.shape[0]:.3f})")


In [None]:
def collect_note_context(scans, notes, cutoff, lookback=pd.Timedelta('1w')):
    """Select, per scan, the pre-scan notes needed to reach `cutoff` characters.

    For each row of `scans`, the patient's notes entered strictly before the
    scan and less than `lookback` before it are gathered. Rows sharing a
    NOTE_ID are joined into one text, duplicate texts are dropped, and notes
    are taken in reverse row order until their cumulative character count
    reaches `cutoff` (or the notes run out).

    Parameters
    ----------
    scans : DataFrame
        Must contain 'PatientID' and 'ScanTime'.
    notes : DataFrame
        Must contain 'PAT_MRN_ID', 'ENTRY_INST_LOCAL_DTTM', 'NOTE_ID' and
        'NOTE_TEXT'.
    cutoff : int
        Number of characters to accumulate before stopping.
    lookback : Timedelta, default one week
        Maximum age of a note relative to the scan time.

    Returns
    -------
    dict
        'num_notes', 'word_counts', 'char_counts': lists with one entry per
        scan that had at least one note in the window.
        'time_difs': list of timedelta Series (scan time minus note entry
        time), one per such scan.
    """
    num_notes, word_counts, char_counts, time_difs = [], [], [], []

    for patient_id, scan_time in scans[['PatientID', 'ScanTime']].values:
        scan_time = pd.to_datetime(scan_time)

        # .copy() so the new column is not assigned onto a slice of `notes`.
        candidates = notes[notes['PAT_MRN_ID'] == str(patient_id)].copy()
        candidates['Entry DateTime'] = candidates['ENTRY_INST_LOCAL_DTTM'].apply(pd.to_datetime)

        # Keep notes entered before the scan and within the look-back window.
        candidates = candidates[scan_time > candidates['Entry DateTime']]
        candidates = candidates[scan_time - candidates['Entry DateTime'] < lookback]

        # Join the rows of each note into a single text keyed by NOTE_ID.
        merged_text = candidates.groupby('NOTE_ID')['NOTE_TEXT'].agg(' '.join)
        if merged_text.empty:
            continue

        # After the merge the per-row text is NOTE_TEXT_x and the joined text
        # is NOTE_TEXT_y; keep one row per distinct joined text.
        # NOTE(review): reversing row order assumes `notes` is stored oldest
        # first, so that the most recent note comes first afterwards - confirm,
        # or sort by 'Entry DateTime' explicitly.
        per_note = (
            pd.merge(candidates, merged_text, on='NOTE_ID', how='left')
            .drop('NOTE_TEXT_x', axis=1)
            .drop_duplicates('NOTE_TEXT_y')
            .iloc[::-1]
            .copy()
        )
        per_note['line length'] = per_note['NOTE_TEXT_y'].apply(len)

        reached = (per_note['line length'].cumsum() >= cutoff).to_numpy()
        if reached.any():
            # Include the note that first pushes the running total past cutoff.
            n_used = int(reached.argmax()) + 1
        else:
            # Threshold never reached: every available note is used. The count
            # is exactly the number of notes, not one more.
            n_used = len(per_note)

        used = per_note.iloc[:n_used]
        num_notes.append(n_used)
        word_counts.append(used['NOTE_TEXT_y'].apply(lambda text: len(text.split())).sum())
        char_counts.append(used['line length'].sum())
        time_difs.append(scan_time - used['Entry DateTime'])

    return {
        'num_notes': num_notes,
        'word_counts': word_counts,
        'char_counts': char_counts,
        'time_difs': time_difs,
    }


def _print_quartiles(title, values):
    """Print `title` followed by the 25th, 50th and 75th percentiles of `values`."""
    print(title)
    for q in (25, 50, 75):
        print(np.percentile(values, q))


def report_note_context(scans, notes, cutoff):
    """Print quartiles and draw histograms of the note context for one cutoff."""
    summary = collect_note_context(scans, notes, cutoff)
    if not summary['num_notes']:
        # pd.concat raises on an empty list, so report the situation instead.
        print('No notes found in the look-back window for any scan')
        return

    minutes = pd.concat(summary['time_difs']).dt.total_seconds()/60
    _print_quartiles('Median minutes between included notes and scan time [IQR]', minutes)

    for values, title in (
        (summary['word_counts'], 'Median word count in included text [IQR]'),
        (summary['char_counts'], 'Median char count in included text [IQR]'),
        (summary['num_notes'], 'Median num notes count in included text [IQR]'),
    ):
        plt.hist(values, 30)
        plt.show()
        print()
        _print_quartiles(title, values)


# Same report for each character cutoff (previously three copy-pasted cells).
for cutoff in (500, 1000, 5000):
    print(f'=== cutoff = {cutoff} characters ===')
    report_note_context(IAD, Notes, cutoff)