## Scott 10K Allied health dataset

### Aim: Obtain demographic, clinical and neuroimaging data (T1 file paths, MWC1T1 file paths) for the scans

### First order of business: create the .txt file of all ADNI T1 patients I want in my table

In [1]:
%%bash 

# Collect the path of every compressed NIfTI file under the ADNI tree,
# one path per line, into a single text file.
nifti_root=/rds/general/project/scott_data_adni/live/ADNI/ADNI_NIFTI/
paths_file=/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_t1_paths.txt

# The truncating redirect empties any previous list before find writes to it.
find "$nifti_root" -type f -name "*.nii.gz" > "$paths_file"

# Report how many paths were written.
wc -l < "$paths_file"

Process was interrupted.


CalledProcessError: Command 'b'\nmydir=/rds/general/project/scott_data_adni/live/ADNI/ADNI_NIFTI/\noutput_file=/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_t1_paths.txt\n>"$output_file"\nfind "$mydir" -type f -name "*.nii.gz" >> "$output_file"\n\nwc -l<"${output_file}"\n'' died with <Signals.SIGINT: 2>.

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta

# Build the scan table `df`: one row per T1 file listed in the paths file.
#
# Each file name (without its '.nii.gz' extension) is split on '_':
#   * fields 0-2 joined with '_' form PTID, and field 2 on its own is RID
#   * field 4 is read as a 'YYYY-MM-DD' scan date
# Names with fewer fields, or a date that does not split into exactly three
# parts, keep their row but get NaN / NaT for the date columns.
t1_paths = '/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_t1_paths.txt'
scan_columns = ["PTID", "RID", "DATA_KEY", "T1_YEAR", "T1_MON", "T1_DAY", "VISDATE"]

scan_records = []
with open(t1_paths, 'r') as paths:
    for path in paths:
        file_name = path.rstrip('\n').split('/')[-1]
        if not file_name:
            # Blank line in the paths file: nothing to describe.
            continue

        # Remove the extension as a substring. str.strip('.nii.gz\n') treats its
        # argument as a *set of characters* and would also eat leading/trailing
        # '.', 'n', 'i', 'g', 'z' belonging to the name itself, so DATA_KEY would
        # no longer match the keys built from the same paths when T1_PATH is added.
        data_key = file_name.replace('.nii.gz', '')
        name_fields = data_key.split('_')

        patient_id = '_'.join(name_fields[:3])
        rid = patient_id.split('_')[-1]

        try:
            t1_year, t1_mon, t1_day = name_fields[4].split('-')
            visdate = f"{t1_year}-{t1_mon}-{t1_day}"
        except (IndexError, ValueError):
            t1_year = t1_mon = t1_day = visdate = np.nan

        scan_records.append({
            "PTID": patient_id,
            "RID": rid,
            "DATA_KEY": data_key,
            "T1_YEAR": t1_year,
            "T1_MON": t1_mon,
            "T1_DAY": t1_day,
            "VISDATE": visdate,
        })

df = pd.DataFrame(scan_records, columns=scan_columns)

# Date parts arrive as strings; anything non-numeric becomes NaN.
for date_part in ['T1_YEAR', 'T1_MON', 'T1_DAY']:
    df[date_part] = pd.to_numeric(df[date_part], errors='coerce')

# Unparseable dates become NaT rather than raising.
df['VISDATE'] = pd.to_datetime(df['VISDATE'], errors='coerce')

# Four-week window either side of the scan date.
df['EXAMDATE_4WKS_LATER'] = df['VISDATE'] + timedelta(weeks=4)
df['EXAMDATE_4WKS_B4'] = df['VISDATE'] - timedelta(weeks=4)

df

### Time to add T1 paths

In [None]:
# Attach the full T1 file path to each scan row by looking up DATA_KEY
# (the file name without its '.nii.gz' extension).
t1_paths = '/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_t1_paths.txt'

with open(t1_paths, 'r') as paths:
    t1_lines = [line.strip('\n') for line in paths.readlines()]

# If two paths share a file name, the one listed later wins.
key_to_path = {
    line.split('/')[-1].replace('.nii.gz', ''): line
    for line in t1_lines
}

df['T1_PATH'] = df['DATA_KEY'].map(key_to_path)
df

### Now for mwc1t1 paths

In [None]:
%%bash

# Collect the path of every file whose name starts with "mwc1t1" under the
# scott_10k_b2c output tree, one path per line.
b2c_root=/rds/general/project/c3nl_scott_students/live/data/sankeith/scott_10k_b2c/
paths_file=/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/successful_scott_10k_mwc1t1_paths.txt

# The truncating redirect empties any previous list before find writes to it.
find "$b2c_root" -type f -name "mwc1t1*" > "$paths_file"

# Report how many paths were written.
wc -l < "$paths_file"

In [None]:
# Attach the mwc1t1 file path to each scan row. The lookup key is the name of
# the directory that directly contains the mwc1t1 file, matched against DATA_KEY.
mwc1t1_paths = '/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/successful_scott_10k_mwc1t1_paths.txt'

with open(mwc1t1_paths, 'r') as paths:
    mwc1t1_lines = [line.strip('\n') for line in paths.readlines()]

# If one directory holds several mwc1t1 files, the one listed later wins.
key_to_path = {line.split('/')[-2]: line for line in mwc1t1_lines}

df['MWC1T1_PATH'] = df['DATA_KEY'].map(key_to_path)
df

#### How many unique patients are there (as measured by PTID)

In [None]:
# Number of distinct PTID values in the scan table.
pd.unique(df['PTID']).size

In [None]:
# Save the scan table (without the index column) so the MATLAB demographics
# step can read it.
interim_csv = '/rds/general/project/c3nl_scott_students/ephemeral/sankeith/interim_scott_10k_alliedhealth.csv'
df.to_csv(interim_csv, index=False)

### Make a .m file of Greg's code, and run that. This gets us patient demographics (Gender, Date of Birth and Education)

In [None]:
%%file /rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_add_demographics_info.m

% Join demographics onto the interim scan table and write the result as a CSV.
% For every row of T, the columns listed in `vars` are copied from the row of
% the allied-health CSV that has the same PTID and the VISDATE closest to the
% scan's VISDATE.

% Load the interim scan table written by the notebook.
T=readtable('/rds/general/project/c3nl_scott_students/ephemeral/sankeith/interim_scott_10k_alliedhealth.csv');
% save() with no variable list writes every variable currently in the workspace (T included).
save('/rds/general/project/c3nl_scott_students/ephemeral/sankeith/interim_scott_10k_alliedhealth.mat');

% Reload the .mat file as a struct so the table is reachable as data.T, then preview it.
data = load('/rds/general/project/c3nl_scott_students/ephemeral/sankeith/interim_scott_10k_alliedhealth.mat');
head(data.T) %% shows data = struct with fields, and T = [15733 x 11]
disp(data)

%% This has most of Greg's code. I'm not going to vectorise it

% NOTE(review): basepath is not referenced again in this script, and the
% missing semicolon makes MATLAB echo its value.
basepath = '/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_extra_adni_info/'

% One entry per allied-health CSV:
%   {csv path, columns to copy into T, fill value per column, converter per column ([] = use the column as read)}
vars = { ...
    {'/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_extra_adni_info/PTDEMOG_10Sep2025.csv', {'PTGENDER', 'PTDOB', 'PTEDUCAT'}, {NaN, NaT, NaN}, {[], @(x) datetime(x, 'InputFormat', 'MM/yyyy'),[] }}};

for v=1:numel(vars) %%opens the relevant CSV, and makes sure that VISDATE is a column. If not, then it duplicates EXAMDATE and calls it VISDATE.
    Talli = readtable(vars{v}{1});
    % Convert PTID to string so it can be compared with == in the per-row loop below.
    Talli.PTID = string(Talli.PTID);
    if(~ismember('VISDATE', Talli.Properties.VariableNames))
        Talli.VISDATE = Talli.EXAMDATE;
    end
   
    fprintf('joining file %s, rows=%d\n', vars{v}{1}, height(Talli));
    
    varNames = vars{v}{2}; %%selects the appropriate columns from the relevant CSV
    % Create the new columns in T up front, filled with their "missing" value,
    % so rows without a matching PTID keep that value.
    if(~isempty(vars{v}{3}))
        fprintf('(setting types)\n');
        for vf=1:numel(vars{v}{3})
             T{:,varNames{vf}} = repmat(vars{v}{3}{vf}, height(T), 1); %% padding missing values with NaNs or NaTs
        end
    else
        T{:,varNames} = nan(height(T),numel(varNames)); %% no fill values were supplied for this file: every new column defaults to NaN
    end
    
    if(~isempty(vars{v}{4})) %%setting the appropriate date format (e.g., datetime)
        fprintf('(converting)\n');
        for vf=1:numel(vars{v}{4})
            % An empty converter slot means the column is used as read.
            if(~isempty(vars{v}{4}{vf}))
                Talli.(vars{v}{2}{vf}) = vars{v}{4}{vf}(Talli.(vars{v}{2}{vf}));
            end
        end
        fprintf('(done)\n')
    end

    % Row-by-row nearest-visit lookup. No maximum date gap is applied here:
    % the visit closest to the scan date is used however far away it is.
    for s=1:height(T)
        Tsub = Talli(Talli.PTID == T.PTID(s),:);
        if(~isempty(Tsub))
            % Absolute distance in days between each visit and this scan.
            Tsub.datediffs = abs(days( (Tsub.VISDATE - T.VISDATE(s))));
            Tsub = sortrows(Tsub, 'datediffs', 'asc'); % order so that the one nearest the imaging appears 1st
            T(s, varNames) = Tsub(1,varNames);
        end
    end
end

fprintf('Width: %d Height: %d\n', width(T), height(T))
writetable(T, '/rds/general/project/c3nl_scott_students/ephemeral/sankeith/gregbased_scott10k_alliedhealth.csv')

% Confirm that the output file exists after writing.
if isfile('/rds/general/project/c3nl_scott_students/ephemeral/sankeith/gregbased_scott10k_alliedhealth.csv')
    disp('Demographics data joined')
end

In [None]:
%%file /rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_add_demographics_info.sh
#!/bin/bash
# Runs the demographics-join MATLAB script non-interactively, then removes the
# generated helper files (this launcher and the .m script) and MATLAB debris.
#
# The shebang must be the very first line of the written file, so there is no
# blank line between the %%file magic and it.

module load tools/prod
module --ignore_cache load MATLAB2024/b > /dev/null 2>&1

# -batch runs without desktop or splash screen and makes MATLAB exit with a
# non-zero status when the script errors, instead of stopping at the prompt
# (which is what -r "...; exit" does when run() throws before reaching exit).
matlab -batch "run('/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_add_demographics_info.m')"
matlab_status=$?

# Clean up regardless of whether MATLAB succeeded.
rm /rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_add_demographics_info.*
rm -f ~/java* ~/*crash*dump*

# Surface a MATLAB failure to the calling notebook cell.
exit "$matlab_status"

In [None]:
%%bash

# Make the generated .m and .sh helper files executable, then run the launcher.
script_base=/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_add_demographics_info

chmod -Rf 775 "${script_base}".*

"${script_base}.sh"

### Load in time-sensitive data

In [2]:
import pandas as pd
import os
import numpy as np
import datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import fnmatch
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Reload the table produced by the MATLAB demographics step, restore the date
# columns (CSV stores them as text) and derive the age at scan in years.
df = pd.read_csv('/rds/general/project/c3nl_scott_students/ephemeral/sankeith/gregbased_scott10k_alliedhealth.csv', low_memory=False)

for date_col in ['VISDATE', 'EXAMDATE_4WKS_LATER', 'EXAMDATE_4WKS_B4', 'PTDOB']:
    df[date_col] = pd.to_datetime(df[date_col])

# Mean Julian year length. Dividing by 365 ignores leap days and overstates
# age by roughly one day for every four years lived.
DAYS_PER_YEAR = 365.25
df['PTAGE'] = (df['VISDATE'] - df['PTDOB']) / timedelta(days=DAYS_PER_YEAR)

# Row count, column count, number of distinct participants.
print(len(df))
print(len(df.columns))
print(len(df['PTID'].unique()))

15733
15
2635


### Merge APOE Genotype

In [4]:
# Add each participant's APOE genotype to every one of their scans.
apoe_df = (
    pd.read_csv('/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_extra_adni_info/APOERES_28Mar2025.csv', low_memory=False)[['PTID', 'GENOTYPE']]
    .drop_duplicates(subset=['PTID', 'GENOTYPE'])
)
print(apoe_df)

# De-duplicating on (PTID, GENOTYPE) still leaves two rows for a participant
# recorded with two different genotypes, and a left merge would then silently
# duplicate that participant's scans. validate='many_to_one' raises a
# MergeError instead, so the scan count cannot change unnoticed.
df = df.merge(apoe_df, how='left', on=['PTID'], validate='many_to_one')
print(len(df))

            PTID GENOTYPE
0     011_S_0002      3/3
1     011_S_0003      3/4
2     022_S_0004      3/3
3     011_S_0005      3/3
4     022_S_0007      3/4
...          ...      ...
2755  341_S_7018      3/3
2756  035_S_7019      3/3
2757  052_S_7027      4/4
2758  941_S_7041      3/4
2759  035_S_7049      3/3

[2760 rows x 2 columns]
15733


### Prepping ADAS13 DataFrame and MRI FIELD STRENGTH DataFrame

In [5]:
# Write the derived, date-stamped allied-health tables consumed by the merge
# cells further down: ADAS13, MRI field strength, amyloid and p217 tau.
EXTRA_INFO_DIR = '/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_extra_adni_info'

# Compute the stamp once so every file written here carries the same date even
# if the cell runs across midnight. Keeping it out of the f-strings also avoids
# nested same-type quotes, which are a SyntaxError before Python 3.12.
date_stamp = datetime.datetime.now().strftime('%d%b%Y')

### Prepping ADAS13 DataFrame ###
# The ADNI1 file stores the score as TOTALMOD and the date as EXAMDATE; rename
# both so the two ADAS sources can be stacked.
adas_adni1_df = (
    pd.read_csv(f'{EXTRA_INFO_DIR}/ADASSCORES_01Apr2025.csv', low_memory=False)[['PTID', 'EXAMDATE', 'TOTALMOD']]
    .rename(columns={'TOTALMOD': 'TOTAL13', 'EXAMDATE': 'VISDATE'})
)

adas_adnigo23_df = pd.read_csv(f'{EXTRA_INFO_DIR}/ADAS_ADNIGO23_31Mar2025.csv', low_memory=False)
adas_adnigo23_df = adas_adnigo23_df[['PTID', 'VISDATE', 'TOTAL13']]

adas_df = pd.concat([adas_adni1_df, adas_adnigo23_df], ignore_index=True)

print(len(adas_df))

adas_df.to_csv(f'{EXTRA_INFO_DIR}/ADAS13_{date_stamp}.csv', index=False)

### Prepping MRI FIELD STRENGTH DataFrame ###

mri_columns = ['PTID', 'EXAMDATE', 'FIELD_STRENGTH']
mrimeta_df = pd.read_csv(f'{EXTRA_INFO_DIR}/MRIMETA_11Sep2025.csv', low_memory=False)[mri_columns]
mrimeta_3t_df = pd.read_csv(f'{EXTRA_INFO_DIR}/MRI3META_11Sep2025.csv', low_memory=False)[mri_columns]
mri_df = pd.concat([mrimeta_df, mrimeta_3t_df], ignore_index=True)
print(len(mri_df))
mri_df.to_csv(f'{EXTRA_INFO_DIR}/MRIFLDSTRNGTH_{date_stamp}.csv', index=False)

### Prepping Amyloid DataFrame - I only use the batemanlab info which includes EXAMDATE (the other batemanlab DataFrame only has VISCODE, so can't join with Scott 10K) ###

amyloid_columns = [
    'RID', 'EXAMDATE', 'Abeta_4240_Standardized', 'Intertcept_Standardization', 'Slope_Standardization',
    'Sample_volume', 'Sample_volume_UNITS', 'Abeta_42_conc', 'Abeta_42_conc_UNITS', 'Abeta_42_N14N15',
    'Abeta_42_N15_ISTD_amount', 'Abeta_42_N15_ISTD_amount_UNITS', 'Abeta_40_conc', 'Abeta_40_conc_UNITS',
    'Abeta_40_N14N15', 'Abeta_40_N15_ISTD_amount', 'Abeta_40_N15_ISTD_amount_UNITS', 'Abeta_4240',
]
amyloid_df = pd.read_csv(f'{EXTRA_INFO_DIR}/batemanlab_20221118_31Mar2025.csv', low_memory=False)[amyloid_columns]
print(len(amyloid_df))
amyloid_df.to_csv(f'{EXTRA_INFO_DIR}/AMYLOID_{date_stamp}.csv', index=False)

### Prepping p217 tau DataFrame ###

# Prefix the generic column names so they stay identifiable after merging.
tau_df = (
    pd.read_csv(f'{EXTRA_INFO_DIR}/JANSSEN_PLASMA_P217_TAU_01Apr2025.csv', low_memory=False)[['PTID', 'EXAMDATE', 'DILUTION_CORRECTED_CONC', 'CV']]
    .rename(columns={'DILUTION_CORRECTED_CONC': 'P217_DILUTION_CORRECTED_CONC', 'CV': 'P217_CV'})
)
tau_df.to_csv(f'{EXTRA_INFO_DIR}/P217_{date_stamp}.csv', index=False)


11618
13933
742


### Merging time-sensitive visit information (based on Greg's code):
1. Load an allied health file
2. Ensure the allied health file has an exam-date column (`VISDATE` or `EXAMDATE` is renamed to `EXAMDATE_<file>`)
3. Do a left join on PTID (left = original scan table, right = allied health file)
4. Extract rows only with a T1 file path, so we have full rows
5. Calculate abs(allied health exam date - MRI visit date) and store it in a `DATEDIFFS_<file>` column
6. Group by `T1_PATH` and sort by the date difference, ascending
7. Keep the row with the smallest date difference, and blank its values if that difference is more than 28 days. If there is a tie, keep the row with the fewest missing values (and, failing that, the first such row)
8. Conduct a final length of rows check - MUST BE 15733
9. Rinse and repeat

In [None]:
# Attach each time-sensitive allied-health measure to the scan table `df`.
#
# For every source file: left-join on PTID, keep for each scan (T1_PATH) the
# visit closest in time, blank the joined values when that visit is more than
# 28 days from the scan (or absent), and collapse to exactly one row per scan.
# `df` is replaced by the enriched table after each file, so this cell must be
# run once on a freshly loaded `df`.
EXTRA_INFO_DIR = '/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_extra_adni_info'
OUTPUT_DIR = '/rds/general/project/c3nl_scott_students/ephemeral/sankeith'
MAX_VISIT_GAP = pd.Timedelta(28, "D")

# The derived tables (MRIFLDSTRNGTH, ADAS13, P217) carry the date they were
# written in their file name, so they must have been created today.
date_stamp = datetime.datetime.now().strftime('%d%b%Y')

# (file name, columns to load, value written to each data column when no visit is close enough)
allied_sources = [
    (f'MRIFLDSTRNGTH_{date_stamp}.csv', ['PTID', 'EXAMDATE', 'FIELD_STRENGTH'], {'FIELD_STRENGTH': pd.NA}),
    ('DXSUM_10Sep2025.csv', ['PTID', 'EXAMDATE', 'PHASE', 'DIAGNOSIS'], {'PHASE': pd.NA, 'DIAGNOSIS': np.nan}),
    ('MMSE_28Mar2025.csv', ['PTID', 'VISDATE', 'MMSCORE'], {'MMSCORE': np.nan}),
    ('MOCA_30Mar2025.csv', ['PTID', 'VISDATE', 'MOCA'], {'MOCA': np.nan}),
    ('NEUROBAT_28Mar2025.csv', ['PTID', 'VISDATE', 'LIMMTOTAL', 'CLOCKSCOR', 'LDELTOTAL', 'LDELCUE', 'ANART'], {'LIMMTOTAL': np.nan, 'CLOCKSCOR': np.nan, 'LDELTOTAL': np.nan, 'LDELCUE': np.nan, 'ANART': np.nan}),
    ('CDR_28Mar2025.csv', ['PTID', 'VISDATE', 'CDGLOBAL'], {'CDGLOBAL': np.nan}),
    ('FAQ_28Mar2025.csv', ['PTID', 'VISDATE', 'FAQTOTAL'], {'FAQTOTAL': np.nan}),
    ('NPIQ_28Mar2025.csv', ['PTID', 'VISDATE', 'NPISCORE'], {'NPISCORE': np.nan}),
    ('GDSCALE_28Mar2025.csv', ['PTID', 'VISDATE', 'GDTOTAL'], {'GDTOTAL': np.nan}),
    ('MODHACH_01Apr2025.csv', ['PTID', 'VISDATE', 'HMSCORE'], {'HMSCORE': np.nan}),
    (f'ADAS13_{date_stamp}.csv', ['PTID', 'VISDATE', 'TOTAL13'], {'TOTAL13': np.nan}),
    (f'P217_{date_stamp}.csv', ['PTID', 'EXAMDATE', 'P217_DILUTION_CORRECTED_CONC', 'P217_CV'], {'P217_DILUTION_CORRECTED_CONC': np.nan, 'P217_CV': np.nan}),
]


def _keep_fewest_missing_per_scan(frame):
    """Return one row per T1_PATH: the row with the fewest missing values.

    Ties are broken by the order in which the rows appear in ``frame``. The
    result is sorted by T1_PATH and has a fresh 0..n-1 index. This selects the
    same rows as a per-group ``groupby('T1_PATH').apply`` but without calling
    Python once per scan and without the deprecated ``include_groups=True``.
    """
    ranked = frame.assign(
        _n_missing=frame.isna().sum(axis=1).to_numpy(),
        _row_order=np.arange(len(frame)),  # explicit tie-break: original order
    )
    ranked = ranked.sort_values(['T1_PATH', '_n_missing', '_row_order'])
    ranked = ranked.drop_duplicates(subset='T1_PATH', keep='first')
    return ranked.drop(columns=['_n_missing', '_row_order']).reset_index(drop=True)


for file_name, columns, null_fill in allied_sources:
    # Column suffix = file name up to its first underscore (e.g. 'MMSE').
    column_suffix = file_name.split('_')[0]
    exam_col = f'EXAMDATE_{column_suffix}'
    diff_col = f'DATEDIFFS_{column_suffix}'

    #Load an allied health file
    Talli = pd.read_csv(f'{EXTRA_INFO_DIR}/{file_name}', low_memory=False)[columns].copy()
    Talli = Talli.drop(columns='RID', errors='ignore')
    print(f"Length of {column_suffix}: {len(Talli)}")

    #Ensure allied health file has an exam date: whichever of VISDATE / EXAMDATE
    #the file provides becomes EXAMDATE_<suffix>, parsed as datetime.
    source_date_col = 'VISDATE' if 'VISDATE' in Talli.columns else 'EXAMDATE'
    Talli = Talli.rename(columns={source_date_col: exam_col})
    Talli[exam_col] = pd.to_datetime(Talli[exam_col], errors='coerce')

    # Re-running this cell on an already enriched `df` would make merge()
    # rename the clashing columns with _x/_y and fail later with an obscure
    # KeyError; stop early with an explanation instead.
    clashing = (set(Talli.columns) - {'PTID'}) & set(df.columns)
    if clashing:
        raise ValueError(
            f"df already contains {sorted(clashing)}; reload df from the demographics CSV before re-running this cell."
        )

    #Do a left type join
    Tsub = pd.merge(df, Talli, on='PTID', how='left')
    print(f"Length of the freshly joined dataframe: {len(Tsub)}")

    #Extract rows only with a T1 file path, so we have full rows
    Tsub = Tsub.dropna(subset='T1_PATH')
    print(f"After dropping rows without T1 paths: {len(Tsub)}")

    #Absolute gap between the allied-health visit and the MRI visit
    Tsub[diff_col] = (Tsub[exam_col] - Tsub['VISDATE']).abs()

    #Sort by the gap, then keep for each scan only the row(s) at the smallest gap
    #(or every row of the scan when no gap could be computed at all).
    Tsub = Tsub.sort_values(diff_col, ascending=True)

    print(f"Grouping rows by 'T1_PATH'. Within groups, organising rows so smallest DATE_DIFFS_{column_suffix} are at the top") 
    min_diffs = Tsub.groupby('T1_PATH')[diff_col].transform('min')
    at_minimum = Tsub[diff_col] == min_diffs
    no_gap_at_all = min_diffs.isna() & Tsub[diff_col].isna()
    Tsub = Tsub[at_minimum | no_gap_at_all].reset_index(drop=True)

    print(f"Finding rows where *EITHER* DATE_DIFFS_{column_suffix} > 28 days *OR* there is no EXAMDATE_{column_suffix} column. If that's the case, replace the relevant values with NaNs, NaTs and pd.NAs") 
    mask = (Tsub[diff_col] > MAX_VISIT_GAP) | Tsub[exam_col].isna()

    Tsub.loc[mask, diff_col] = pd.NaT
    Tsub.loc[mask, exam_col] = pd.NaT

    for value_col, fill_value in null_fill.items():
        Tsub.loc[mask, value_col] = fill_value
    print(f'Length of database before dropping exactly identical rows = {len(Tsub)}. Tsub is of type {type(Tsub)}')
    Tsub = Tsub.drop_duplicates(keep='first')
    print(f'Length of database after dropping rows which are exactly identical = {len(Tsub)}.')

    dupe_mask = Tsub.duplicated(subset='T1_PATH', keep=False)
    Tsub_dupes = Tsub.loc[dupe_mask].copy()
    n_dupe_scans = Tsub_dupes['T1_PATH'].nunique()
    print(f'There are {len(Tsub_dupes)} rows with identical T1 MRI file paths, from {n_dupe_scans} different MRI scans')

    print("Grouping rows by T1_PATH, finding the rows with the fewest missing values. If it's a tie between two rows, the first one for each T1_PATH group is kept")
    Tsub = _keep_fewest_missing_per_scan(Tsub)

    # Final sanity check
    dup_cols = [c for c in ['T1_PATH', 'MWC1T1_PATH'] if c in Tsub.columns]
    print(f"Final row count: {len(Tsub)}")
    print(f"Remaining duplicates: {Tsub.duplicated(subset=dup_cols).sum()}")

    # For each T1_PATH, see how many rows were kept
    rows_per_scan = Tsub.groupby('T1_PATH').size()
    print(rows_per_scan.value_counts())

    #Store the gap as a plain number of days instead of a Timedelta
    Tsub[diff_col] = Tsub[diff_col].dt.days

    print(Tsub[diff_col].describe())
    print('')

    # Checkpoint after every source file, then carry the enriched table forward.
    Tsub.to_csv(f'{OUTPUT_DIR}/final_gregbased_scott10k_alliedhealth.csv', index=False)
    df = Tsub

Tsub.to_csv(f'{OUTPUT_DIR}/scott10k_alliedhealth.csv', index=False)


Length of MRIFLDSTRNGTH: 13933
Length of the freshly joined dataframe: 118218
After dropping rows without T1 paths: 118218
Grouping rows by 'T1_PATH'. Within groups, organising rows so smallest DATE_DIFFS_MRIFLDSTRNGTH are at the top
Finding rows where *EITHER* DATE_DIFFS_MRIFLDSTRNGTH > 28 days *OR* there is no EXAMDATE_MRIFLDSTRNGTH column. If that's the case, replace the relevant values with NaNs, NaTs and pd.NAs
Length of database before dropping exactly identical rows = 16804. Tsub is of type <class 'pandas.core.frame.DataFrame'>
Length of database after dropping rows which are exactly identical = 16796.
There are 2126 rows with identical T1 MRI file paths, from 1063 different MRI scans
Grouping rows by T1_PATH, finding the rows with the fewest missing values. If it's a tie between two rows, the first one for each T1_PATH group is kept
Final row count: 15733
Remaining duplicates: 0
1    15733
Name: count, dtype: int64
count    15640.000000
mean         0.037148
std          0.6974

### Add Amyloid data (use RID to join, and only one of the amyloid spreadsheets since the other doesn't have EXAMDATE)

In [None]:
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta

# Attach plasma amyloid measures to the scan table.
#
# Same nearest-visit procedure as for the other allied-health files, except
# that the join key is RID: left-join, keep for each scan (T1_PATH) the sample
# closest in time, blank the joined values when it is more than 28 days from
# the scan (or absent), and collapse to exactly one row per scan.
EXTRA_INFO_DIR = '/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping/scott_10k_extra_adni_info'
OUTPUT_DIR = '/rds/general/project/c3nl_scott_students/ephemeral/sankeith'
MAX_VISIT_GAP = pd.Timedelta(28, "D")

# low_memory=False (as in every other read in this notebook) so each column is
# typed from the whole file rather than chunk by chunk.
df = pd.read_csv(f'{OUTPUT_DIR}/scott10k_alliedhealth.csv', low_memory=False)
df['VISDATE'] = pd.to_datetime(df['VISDATE'], errors='coerce')

# The AMYLOID table carries the date it was written in its file name, so it
# must have been created today.
date_stamp = datetime.datetime.now().strftime('%d%b%Y')

# (file name, columns to load, value written to each data column when no sample is close enough)
amyloid_sources = [
    (f'AMYLOID_{date_stamp}.csv',
     ['RID', 'EXAMDATE', 'Abeta_4240_Standardized', 'Intertcept_Standardization', 'Slope_Standardization',
      'Sample_volume', 'Sample_volume_UNITS', 'Abeta_42_conc', 'Abeta_42_conc_UNITS', 'Abeta_42_N14N15',
      'Abeta_42_N15_ISTD_amount', 'Abeta_42_N15_ISTD_amount_UNITS', 'Abeta_40_conc', 'Abeta_40_conc_UNITS',
      'Abeta_40_N14N15', 'Abeta_40_N15_ISTD_amount', 'Abeta_40_N15_ISTD_amount_UNITS', 'Abeta_4240'],
     {'Abeta_4240_Standardized': np.nan,
      'Intertcept_Standardization': np.nan,
      'Slope_Standardization': np.nan,
      'Sample_volume': np.nan,
      'Sample_volume_UNITS': pd.NA,
      'Abeta_42_conc': np.nan,
      'Abeta_42_conc_UNITS': pd.NA,
      'Abeta_42_N14N15': np.nan,
      'Abeta_42_N15_ISTD_amount': np.nan,
      'Abeta_42_N15_ISTD_amount_UNITS': pd.NA,
      'Abeta_40_conc': np.nan,
      'Abeta_40_conc_UNITS': pd.NA,
      'Abeta_40_N14N15': np.nan,
      'Abeta_40_N15_ISTD_amount': np.nan,
      'Abeta_40_N15_ISTD_amount_UNITS': pd.NA,
      'Abeta_4240': np.nan}),
]


def _keep_fewest_missing_per_scan(frame):
    """Return one row per T1_PATH: the row with the fewest missing values.

    Ties are broken by the order in which the rows appear in ``frame``. The
    result is sorted by T1_PATH and has a fresh 0..n-1 index. This selects the
    same rows as a per-group ``groupby('T1_PATH').apply`` but without calling
    Python once per scan and without the deprecated ``include_groups=True``.
    """
    ranked = frame.assign(
        _n_missing=frame.isna().sum(axis=1).to_numpy(),
        _row_order=np.arange(len(frame)),  # explicit tie-break: original order
    )
    ranked = ranked.sort_values(['T1_PATH', '_n_missing', '_row_order'])
    ranked = ranked.drop_duplicates(subset='T1_PATH', keep='first')
    return ranked.drop(columns=['_n_missing', '_row_order']).reset_index(drop=True)


for file_name, columns, null_fill in amyloid_sources:
    # Column suffix = file name up to its first underscore ('AMYLOID').
    column_suffix = file_name.split('_')[0]
    exam_col = f'EXAMDATE_{column_suffix}'
    diff_col = f'DATEDIFFS_{column_suffix}'

    #Load an allied health file
    Talli = pd.read_csv(f'{EXTRA_INFO_DIR}/{file_name}', low_memory=False)[columns].copy()

    #Ensure allied health file has an exam date, parsed as datetime
    Talli = Talli.rename(columns={'EXAMDATE': exam_col})
    Talli[exam_col] = pd.to_datetime(Talli[exam_col], errors='coerce')

    # If the loaded table already carries these columns, merge() would rename
    # the clashing ones with _x/_y and the code below would fail with an
    # obscure KeyError; stop early with an explanation instead.
    clashing = (set(Talli.columns) - {'RID'}) & set(df.columns)
    if clashing:
        raise ValueError(
            f"df already contains {sorted(clashing)}; the input CSV must not include amyloid columns yet."
        )

    #Do a left type join
    Tsub = pd.merge(df, Talli, on='RID', how='left')
    print(f"Length of the freshly joined dataframe: {len(Tsub)}")

    #Extract rows only with a T1 file path, so we have full rows
    Tsub = Tsub.dropna(subset='T1_PATH')
    print(f"After dropping rows without T1 paths: {len(Tsub)}")

    #Absolute gap between the amyloid sample date and the MRI visit
    Tsub[diff_col] = (Tsub[exam_col] - Tsub['VISDATE']).abs()

    #Sort by the gap, then keep for each scan only the row(s) at the smallest gap
    #(or every row of the scan when no gap could be computed at all).
    Tsub = Tsub.sort_values(diff_col, ascending=True)

    print(f"Grouping rows by 'T1_PATH'. Within groups, organising rows so smallest DATE_DIFFS_{column_suffix} are at the top") 
    min_diffs = Tsub.groupby('T1_PATH')[diff_col].transform('min')
    at_minimum = Tsub[diff_col] == min_diffs
    no_gap_at_all = min_diffs.isna() & Tsub[diff_col].isna()
    Tsub = Tsub[at_minimum | no_gap_at_all].reset_index(drop=True)

    print(f"Finding rows where *EITHER* DATE_DIFFS_{column_suffix} > 28 days *OR* there is no EXAMDATE_{column_suffix} column. If that's the case, replace the relevant values with NaNs, NaTs and pd.NAs") 
    mask = (Tsub[diff_col] > MAX_VISIT_GAP) | Tsub[exam_col].isna()

    Tsub.loc[mask, diff_col] = pd.NaT
    Tsub.loc[mask, exam_col] = pd.NaT

    for value_col, fill_value in null_fill.items():
        Tsub.loc[mask, value_col] = fill_value

    print(f'Length of database before dropping exactly identical rows = {len(Tsub)}. Tsub is of type {type(Tsub)}')
    Tsub = Tsub.drop_duplicates(keep='first')
    print(f'Length of database after dropping rows which are exactly identical = {len(Tsub)}.')

    dupe_mask = Tsub.duplicated(subset='T1_PATH', keep=False)
    Tsub_dupes = Tsub.loc[dupe_mask].copy()
    n_dupe_scans = Tsub_dupes['T1_PATH'].nunique()
    print(f'There are {len(Tsub_dupes)} rows with identical T1 MRI file paths, from {n_dupe_scans} different MRI scans')

    print("Grouping rows by T1_PATH, finding the rows with the fewest missing values. If it's a tie between two rows, the first one for each T1_PATH group is kept")
    Tsub = _keep_fewest_missing_per_scan(Tsub)

    # Final sanity check
    dup_cols = [c for c in ['T1_PATH', 'MWC1T1_PATH'] if c in Tsub.columns]
    print(f"Final row count: {len(Tsub)}")
    print(f"Remaining duplicates: {Tsub.duplicated(subset=dup_cols).sum()}")

    # For each T1_PATH, see how many rows were kept
    rows_per_scan = Tsub.groupby('T1_PATH').size()
    print(rows_per_scan.value_counts())

    #Store the gap as a plain number of days instead of a Timedelta
    Tsub[diff_col] = Tsub[diff_col].dt.days

    print(Tsub[diff_col].describe())
    print('')

    # Checkpoint, then carry the enriched table forward.
    Tsub.to_csv(f'{OUTPUT_DIR}/withamyloid_gregbased_scott10k_alliedhealth.csv', index=False)
    df = Tsub

Tsub.to_csv(f'{OUTPUT_DIR}/debug_scott10k_alliedhealth.csv', index=False)



### Checking diagnoses of scans in Scott 10K

In [None]:
import pandas as pd
# Reload the final table from disk and tabulate how many scans carry each
# DIAGNOSIS value (missing values are not counted by value_counts).
final_csv = '/rds/general/project/c3nl_scott_students/ephemeral/sankeith/debug_scott10k_alliedhealth.csv'
df = pd.read_csv(final_csv, low_memory=False)

diagnosis_counts = df['DIAGNOSIS'].value_counts()
print(diagnosis_counts)


In [None]:
%%bash

# Copy the finished table into the housekeeping git repository, then commit
# and push it with a timestamped message.
src=/rds/general/project/c3nl_scott_students/ephemeral/sankeith/debug_scott10k_alliedhealth.csv
repo=/rds/general/project/c3nl_scott_students/live/sankeith/scott_10k_housekeeping

# Stop if the copy or the directory change fails; otherwise git would commit a
# stale file or run in whatever directory the cell happened to start in.
rsync -av "$src" "$repo/scott10k_alliedhealth.csv" || exit 1
cd "$repo" || exit 1

git add scott10k_alliedhealth.csv
dategcp="$(date +%d%b%Y)"
datetime="$(date +%H%M%S)"
message="Remade Scott 10K allied health database: ${dategcp}, ${datetime}"

# NOTE: -a also commits every other modified tracked file in the repository.
git commit -a -m "$message"
git push -u origin main