In [1]:
import pandas as pd
import numpy as np
import os
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

'''
read in both data and dictionary files, intercept with columes that describe volume
params: path to both files
param: keyword for filter out area of interest
return: dataframe of patientID, VISCODE, exam date and volumes of all regions in dictionary 
'''
def read_csv(file, dictfile, keyword):
    df = pd.read_csv(file)
    dict_df = pd.read_csv(dictfile)
    #print(df.head())

    #filter out rows only with Volume
    dict_df = dict_df[dict_df["TEXT"].str.contains(keyword, case=False, na=False)]
    STcodes = dict_df['FLDNAME'].values
    #print(dict_df.head())
    #print(STcodes)

    extra_cols = ['RID','VISCODE','EXAMDATE']
    column_needed = np.concatenate([extra_cols, STcodes])
    
    #cross reference df with dict_df
    df = df.loc[:,column_needed]
    return df

'''
create dir in current folder to save all correlation matrices
'''
def make_dir(dirname):
    parent_dir = os.getcwd()
    path = os.path.join(parent_dir, dirname)
    os.mkdir(path)
    
'''
get correlation matrix per patient for all structures in the data file w.r.t volume
params: df is the processed dataframe with all data, path is the path for output dir
return: no return, saves all corr-mat to file
'''
def get_corr_mat(df, path):
    # get patients
    patients = np.unique(df['RID'].values)
    #print(patients)

    #I think fill rows with NA values at patient level for now, see how to do later
    for p in patients:
        pat_df = df[df['RID'] == p]

        ##only take into account of patients with more than 3 rows
        ##since 2 rows produce matrix of 1 and -1, and 1 row produces NA
        if len(pat_df.index) > 2:
            #convert to datetime, then sort by time
            try:
                pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%m/%d/%y')
            except:
                pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%Y-%m-%d')
            pat_df = pat_df.sort_values(by=['EXAMDATE'])
            #print(pat_df)

            #drop columns with all NAs
            pat_df = pat_df.dropna(axis=1, how='all')
            ##filling NA values, forward fill for now, MAY CHANGE!!!!
            pat_df = pat_df.fillna(method='ffill')
            #drop unnecessary rows for correlation matrix
            pad_df = pat_df.drop(['RID','VISCODE','EXAMDATE'],axis=1)
            corr_mat = pad_df.corr()

            #output to csv for comparison later, maybe in R?
            out_path = path + '/Patient_' + str(p) + '.csv'
            corr_mat.to_csv(out_path)

            ##visualization with sns
            #plot = sns.clustermap(corr_mat, figsize=(10, 10))

In [35]:
##iterate through all good studies and generate corr matrix
file_df = pd.read_csv('ADNI1_good_files.csv')
for i in range(len(file_df.index)):
    study = file_df.iloc[i,0]
    filename = 'ADNI1_all_data/' + str(file_df.iloc[i,1])
    dictname = 'ADNI1_all_data/' + str(file_df.iloc[i,2])
    keyword = "Volume"
    
    df = read_csv(filename, dictname, keyword)
    make_dir(study)
    get_corr_mat(df, study)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%m/%d/%y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%Y-%m-%d')


In [37]:
##### UCSF Cross Sectional Free Surfer 3T data, this will be the baseline
file = 'ADNI1_all_data/FOXLABBSI_03_27_20.csv'
dictfile = 'ADNI1_all_data/FOXLABBSI_DICT_03_27_20.csv'
keyword = "volume"
df = read_csv(file, dictfile, keyword)

#make dir to save, name will be same as filename
out = "test"
make_dir(out)

get_corr_mat(df, out)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%m/%d/%y')


In [2]:
##### UCSF Cross Sectional Free Surfer 3T data, this will be the baseline
df = pd.read_csv('ADNI1_all_data/UCSFFSX51_ADNI1_3T_02_01_16.csv')
#print(df.head())

dict_df = pd.read_csv('ADNI1_all_data/UCSFFSX51_ADNI1_3T_DICT_11_01_13.csv')
#filter out rows only with Volume
dict_df = dict_df[dict_df["TEXT"].str.contains("Volume", case=False, na=False)]
STcodes = dict_df['FLDNAME'].values
#print(dict_df.head())
#print(STcodes)

extra_cols = ['RID','VISCODE','EXAMDATE','VERSION']
column_needed = np.concatenate([extra_cols, STcodes])
#print(column_needed)
#cross reference df with dict_df
df = df.loc[:,column_needed]
print(df.head())

   RID VISCODE  EXAMDATE VERSION  ST100SV  ST101SV  ST102CV  ST103CV  ST104CV  \
0   15     m36   4/27/09  8/7/13      NaN     1140     2556     1809     2730   
1   15     m06    5/2/06  8/7/13      NaN     1527     2748     1593     2772   
2   15     m24  10/11/07  8/7/13      NaN     1100     2734     1651     2660   
3   15     m12  10/16/06  8/7/13      NaN     1271     2742     1705     2816   
4   15      bl  10/31/05  8/7/13      NaN     1327     3080     1742     2705   

   ST105CV  ...  ST146HS  ST147SV  ST148SV  ST149SV  ST150SV   ST151SV  \
0     2106  ...  392.115   181075   185820   366895   214481  215448.0   
1     2147  ...      NaN   188692   187295   375987   219660  218921.0   
2     2339  ...  377.043   188368   186864   375232   216223  216144.0   
3     2190  ...  354.918   187457   188328   375785   216942  214105.0   
4     2222  ...  369.740   191836   190609   382445   217061  216157.0   

   ST152SV  ST153SV  ST154SV  ST155SV  
0   429929   167857   534752

In [5]:
##create a directory to save all corr matrices in
dirname = 'UCSF_cross_sectional_3T_2rows'
parent_dir = os.getcwd()
path = os.path.join(parent_dir, dirname)
os.mkdir(path)

#drop columns with all NAs
df = df.dropna(axis=1, how='all')

# get patients
patients = np.unique(df['RID'].values)
print(len(patients))

#I think fill rows with NA values at patient level for now, see how to do later
for p in patients:
    pat_df = df[df['RID'] == p]
    
    ##only take into account of patients with more than 3 rows
    ##since 2 rows produce matrix of 1 and -1, and 1 row produces NA
    ##will need to think about a better way
    if len(pat_df.index) > 1:
        #convert to datetime, then sort by time
        pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%m/%d/%y')
        pat_df = pat_df.sort_values(by=['EXAMDATE'])
        #print(pat_df)

        ##filling NA values, forward fill for now, MAY CHANGE!!!!
        pat_df = pat_df.fillna(method='ffill')

        #drop columns with all NAs
        pat_df = pat_df.dropna(axis=1, how='all')

        pad_df = pat_df.drop(['RID','VISCODE','VERSION'],axis=1)
        corr_mat = pad_df.corr()

        #output to csv for comparison later, maybe in R?
        out_path = dirname + '/Patient_' + str(p) + '.csv'
        corr_mat.to_csv(out_path)
        
        '''
        ##basic correlation matrix plot, but turned out very messy
        f = plt.figure(figsize=(19, 15))
        plt.matshow(pad_df.corr(), fignum=f.number)
        plt.xticks(range(pad_df.select_dtypes(['number']).shape[1]), pad_df.select_dtypes(['number']).columns, fontsize=14, rotation=90)
        plt.yticks(range(pad_df.select_dtypes(['number']).shape[1]), pad_df.select_dtypes(['number']).columns, fontsize=14)
        cb = plt.colorbar()
        cb.ax.tick_params(labelsize=14)
        plt.title('Correlation Matrix', fontsize=16);
        '''
        ##attempt 2 to use sns
        #plot = sns.clustermap(corr_mat, figsize=(10, 10))

143


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%m/%d/%y')


In [6]:
###### UPenn Hierarchical Parcellation of MRI Using Multi-atlas Labeling
### does not work, only 1 row/patient
upenn_df = pd.read_csv('ADNI1_all_data/UPENNROI_MARS_06_01_16.csv')
#print(upenn_df.head())

##based on description looks like all are volume, so will use all columns
upenn_dict_df = pd.read_csv('ADNI1_all_data/UPENNROI_MARS_DICT_06_01_16.csv')

#drop columns with all NAs
upenn_df = upenn_df.dropna(axis=1, how='all')

# get patients
upenn_patients = np.unique(upenn_df['RID'].values)
#print(upenn_patients)

#I think fill rows with NA values at patient level for now, see how to do later
# for p in upenn_patients:
#     pat_df = upenn_df[upenn_df['RID'] == p]
#     print(pat_df)

   RID VISCODE  EXAMDATE  VERSION IMAGE_UID  RUNDATE    STATUS  \
0    2      sc   8/26/05  3/21/16    I35475  3/21/16  COMPLETE   
1    3      sc    9/1/05  3/21/16    I32237  3/21/16  COMPLETE   
2    4      sc   9/22/05  3/21/16    I64631  3/21/16  COMPLETE   
3    5      sc    9/2/05  3/21/16    I32246  3/21/16  COMPLETE   
4    6      sc  11/15/05  3/21/16    I33025  3/21/16  COMPLETE   

                CBICA_ID      DATE     R702  ...      R199     R200     R201  \
0  011_S_0002_2005-08-26   8/26/05  1784710  ...  12577.70  7531.16  6862.94   
1  011_S_0003_2005-09-01    9/1/05  1785950  ...   9596.26  6754.81  5647.18   
2  022_S_0004_2005-09-22   9/22/05  1619320  ...  10444.50  7522.30  7652.39   
3  011_S_0005_2005-09-02    9/2/05  1655680  ...  10243.40  5616.39  5902.29   
4  100_S_0006_2005-11-15  11/15/05  1528520  ...   8364.14  5948.50  6253.38   

      R202     R203     R204     R205     R206     R207  update_stamp  
0  8957.96  9767.36  3077.59  2563.72  1338.33  10

In [23]:
######### UCSF Longitutinal FreeSurfer
longi_df = pd.read_csv('ADNI1_all_data/UCSFFSL_02_01_16.csv')
# print(longi_df.head())

longi_dict_df = pd.read_csv('ADNI1_all_data/UCSFFSL_DICT_11_01_13.csv')
#filter out rows only with Volume
longi_dict_df = longi_dict_df[longi_dict_df["TEXT"].str.contains("Volume", case=False, na=False)]
longi_STcodes = longi_dict_df['FLDNAME'].values
# print(longi_dict_df.head())
# print(longi_STcodes)

longi_extra_cols = ['RID','VISCODE','EXAMDATE','VERSION']
longi_column_needed = np.concatenate([longi_extra_cols, longi_STcodes])
#print(longi_column_needed)
#cross reference df with dict_df
longi_df = longi_df.loc[:,longi_column_needed]
print(longi_df.head())

   RID VISCODE    EXAMDATE     VERSION  ST100SV  ST101SV  ST102CV  ST103CV  \
0    3     m06  2006-03-13  2009-07-01      NaN     1498     2927     1904   
1    3     m12  2006-09-12  2009-07-01      NaN     1519     2935     1826   
2    3     m24  2007-09-12  2009-07-02      NaN     1541     3038     1669   
3    3      sc  2005-09-01  2009-07-01      NaN     1661     3261     1936   
4    4     m06  2006-05-25  2009-08-24      NaN     1285     3021     2110   

   ST104CV  ST105CV  ...  ST91CV  ST92SV  ST93CV  ST94CV  ST95CV  ST96SV  \
0     3416     2245  ...    8211     NaN    2236   13139    6003   29544   
1     3275     2271  ...    8050     NaN    2115   13082    6072   29862   
2     3284     2160  ...    7615     NaN    2147   12711    6065   32496   
3     3278     2040  ...    8550     NaN    2362   13566    5892   28288   
4     2638     2787  ...   10703     NaN    2003   11276    6884   19162   

   ST97CV  ST98CV  ST99CV  ST9SV  
0    7065    4253   10022   2808  
1   

In [52]:
##create a directory to save all corr matrices in for longitudinal
logi_dirname = 'UCSF_longitudinal'
logi_path = os.path.join(parent_dir, logi_dirname)
os.mkdir(logi_path)

#drop columns with all NAs
longi_df = longi_df.dropna(axis=1, how='all')

# get patients
longi_patients = np.unique(longi_df['RID'].values)
#print(longi_patients)

#I think fill rows with NA values at patient level for now, see how to do later
for p in longi_patients:
    pat_df = longi_df[longi_df['RID'] == p]
    
    ##only take into account of patients with more than 3 rows
    ##since 2 rows produce matrix of 1 and -1, and 1 row produces NA
    ##will need to think about a better way
    if len(pat_df.index) > 2:
        #print(pat_df)
        #convert to datetime, then sort by time
        pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%Y-%m-%d')
        pat_df = pat_df.sort_values(by=['EXAMDATE'])
        #print(pat_df)
    
        ##filling NA values, ffill for now, MAY CHANGE!!!!
        pat_df = pat_df.fillna(method='ffill')

        #drop columns with all NAs
        pat_df = pat_df.dropna(axis=1, how='all')

        #print(pat_df)

        pad_df = pat_df.drop(['RID','VISCODE','VERSION','EXAMDATE'],axis=1) 
        corr_mat = pad_df.corr()
        #print(corr_mat)

        #output to csv for comparison later, maybe in R?
        logi_out_path = logi_dirname + '/Patient_' + str(p) + '.csv'
        corr_mat.to_csv(logi_out_path)

        ##visualization use sns
        #plot = sns.clustermap(corr_mat, figsize=(10, 10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pat_df['EXAMDATE'] = pd.to_datetime(pat_df['EXAMDATE'], format='%Y-%m-%d')
