# Only modify the next cell

In [1]:
# Full path to the .csv file containing the narratives
fn = "/Users/youngjm/Data/2021-12_mpr_analysis_primary_df_clip_22q.csv"

# Don't modify this bit

In [2]:
from IPython.display import clear_output
import pandas as pd
import numpy as np
import nibabel
import os
from skimage import data
import plotly
import plotly.express as px

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

In [3]:
def safelySaveDf(df, fn):
    try:
        df = df.astype(str)
        df.to_csv(fn)
        return True
    except PermissionError:
        print("Error: write access to "+fn+" denied. Please check that the file is not locked by Datalad.")
        return False

In [4]:
def checkForMultipleScans(df, row):
    subjId = row['patient_id']
    age = row['age_at_scan_days']
    
    overlap = df[(df['patient_id'] == subjId) & (df['age_at_scan_days'] == age)]
    
    if overlap.shape[0] == 1:
        print("Only scan for the subject")
    elif overlap.shape[0] > 1:
        print("More than one scan for the subject:", overlap['scan_id'].values)
    else:
        print("ERROR: Somehow this subject selected from the dataframe is not actually in the dataframe")

In [5]:
def showImageCrossSection(fn):
    # Load the image
    nibImg = nibabel.load(fn)
    img = nibImg.get_fdata()
    
    for i in range(3):
        fig = px.imshow(img, color_continuous_scale='gray', 
                        #contrast_rescaling='minmax', 
                        zmin=120, zmax=580, origin='lower',
                        animation_frame=i, binary_string=True)
        fig.update_layout(width=800, height=800)
        fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 30
        fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 5
        plotly.io.show(fig)
    
    print("Figures should be viewable now")


In [6]:
##
# 
# @param df A pandas.DataFrame object
def gradeImages(df, fn):
    
    rateCounter = 0
    
    for idx, row in df.iterrows():
        
        if pd.isnull(row['rawdata_image_grade']):
            
            if rateCounter % 50 == 0 and rateCounter != 0:
                print("Whoa, that's "+str(rateCounter)+" images. Go stretch and drink water.\n")
            elif rateCounter % 10 == 0 and rateCounter != 0:
                print("Yay, "+str(rateCounter)+" images rated!\n")
                
            print("Participant Info")
            print("Age:", row['age_at_scan_days'])
        
            # Need to grab a base directory
            if row['Diagnosis'] == 'Control':
                baseDir = "/Users/youngjm/Data/clip/images/rawdata/"
                print("Diagnosis: CLIP Control")
            elif row['Diagnosis'] == '22qDS':
                baseDir = "/Users/youngjm/Data/22q11/rawdata/"
                print("Diagnosis: 22q11.2 Deletion Syndrome")
                
            print('Surface Holes:', row['SurfaceHoles'])
            print('Scan ID:', row['scan_id'])
            print()
            
            checkForMultipleScans(df, row)
            print()
                
            

            # Then grab the subject, session, etc from the 'scan_id' column
            subj = row['patient_id']
            ses = row['scan_id'].split("_")[1]
            scan = row['scan_id'].replace("MPR", "Mpr").replace("HighRes","")
            fnToLoad = baseDir+subj+"/"+ses+"/anat/"+scan+".nii.gz"

            # Check that the image exists; if not, exit gracefully
            if not os.path.exists(fnToLoad):
                print("Noooo :(", fnToLoad, row['scan_id'])
                return df

            # A. Load the image
            # A. Display the image
            showImageCrossSection(fnToLoad)

            # B. Ask the user to grade the image
            toContinue = False
            while not toContinue:
                response = input('Grade the image quality on a categorical scale of 2 (good), 1 (borderline), 0 (bad), or -1 (not a precontrast T1w) OR "save" or "exit"\n')
                # B. Process the user's response
                # B. Respond to the user
                if response == "2":
                    reason = input('Optional: add a note')
                    toContinue = True
                elif response == 'exit':
                    return df
                elif response == 'save':
                    print("save!\n")
                    safelySaveDf(df, fn)
                elif response == "0" or response == "1":
                    reason = input('Why is the image rated 0 or 1?\n')
                    toContinue = True
                elif response == "-1":
                    reason = input("Please confirm whether the image is postcontrast or not a T1w scan\n")
                    toContinue = True
                elif response == "skip":
                    print("skipping")
                    toContinue = True
                    response = ""
                    reason = ""
                else:
                    response = input('Grade the image quality on a categorical scale of 2 (good), 1 (borderline), 0 (bad), or -1 (not a precontrast T1w) OR "save" or "exit"\n')


            # Update the data frame
            df.loc[idx, 'rawdata_image_grade'] = response
            df.loc[idx, 'rawdata_image_grade_reason'] = reason
            rateCounter += 1

            # Clear the output
            clear_output()
            

    print("Finished examining images")
    return df
    

In [19]:
# Load the data frame
dataDf = pd.read_csv(fn)

if 'Unnamed: 0' in list(dataDf):
    dataDf = dataDf.drop(columns=['Unnamed: 0'])
    
if "rawdata_image_grade" not in list(dataDf):
    dataDf['rawdata_image_grade'] = np.nan
    
if "rawdata_image_grade_reason" not in list(dataDf):
    dataDf['rawdata_image_grade_reason'] = np.nan

print("Loaded the dataframe")
print()
print(list(dataDf))
print()
print("Number of images:", dataDf.shape[0])
print("Number of ungraded images:", dataDf[dataDf['rawdata_image_grade'].isnull()].shape[0])

Loaded the dataframe

['patient_id', 'age_at_scan_days', 'scan_id', 'sex', 'scanner_id', 'BrainSeg', 'CerebralWhiteMatter', 'TotalGray', 'EstimatedTotalIntraCranialVol', 'SurfaceHoles', 'SubCortGrayVol', 'CSF', 'SumCorticalSurfaceArea', 'SumCorticalThickAvg', 'AvgCorticalThickAvg', 'MagneticFieldStrength', 'Group', 'Diagnosis', 'Processing', 'Has22q', 'rawdata_image_grade', 'rawdata_image_grade_reason']

Number of images: 778
Number of ungraded images: 29


# Interactive Part

In [20]:
updatedDataDf = gradeImages(dataDf, fn)

Finished examining images


In [21]:
safelySaveDf(dataDf, fn)

True

In [22]:
def getCohortRatingBreakdown(df):
    
    total = float(df.shape[0])
    ungradedCount = df[df['rawdata_image_grade'].isnull()].shape[0]
    goodCount = df[df['rawdata_image_grade'] == 2].shape[0]
    okCount = df[df['rawdata_image_grade'] == 1].shape[0]
    badCount = df[df['rawdata_image_grade'] == 0].shape[0]
    pcCount = df[df['rawdata_image_grade'] == -1].shape[0]

    
    print()
    print('Good images (rating = 2):', str(round(float(goodCount/total)*100, 2))+'%', str(goodCount)+'/'+str(total))
    print('Ok images (rating = 1):', str(round(float(okCount/total)*100, 2))+'%', str(okCount)+'/'+str(total))
    print('Bad images (rating = 0):', str(round(float(badCount/total)*100, 2))+'%', str(badCount)+'/'+str(total))
    print('Postcontrast images (rating = -1):', str(round(float(pcCount/total)*100, 2))+'%', str(pcCount)+'/'+str(total))
    print('Ungraded images (rating = Na):', str(round(float(ungradedCount/total)*100, 2))+'%', str(ungradedCount)+'/'+str(total))

    
getCohortRatingBreakdown(dataDf)

dataDf[dataDf['rawdata_image_grade'].isnull()]


Good images (rating = 2): 36.63% 285/778.0
Ok images (rating = 1): 26.22% 204/778.0
Bad images (rating = 0): 21.21% 165/778.0
Postcontrast images (rating = -1): 12.21% 95/778.0
Ungraded images (rating = Na): 0.0% 0/778.0


Unnamed: 0,patient_id,age_at_scan_days,scan_id,sex,scanner_id,BrainSeg,CerebralWhiteMatter,TotalGray,EstimatedTotalIntraCranialVol,SurfaceHoles,SubCortGrayVol,CSF,SumCorticalSurfaceArea,SumCorticalThickAvg,AvgCorticalThickAvg,MagneticFieldStrength,Group,Diagnosis,Processing,Has22q,rawdata_image_grade,rawdata_image_grade_reason


In [24]:
import numpy as np
import matplotlib.pyplot as plt


def summarizeContinuousOnCategorical(df, contVar, catVar):
    # For each possible value of the categorical variable
    vals = np.unique(df[df[catVar].notna()][catVar].values)
    print(vals)

    # Set up the number of rows and columns for the figure
    nCols = 2
    nRows = 2 # np.ceil()
    count = 0
    row = 0
    col = 0
    
    # Set up the figure
    fig, axs = plt.subplots(nRows, nCols, figsize=(15, 15))
    fig.patch.set_facecolor('w')

    for v in vals:
        # Get the continuous variable for all data points with the current categorical variable
        contVals = list(df[df[catVar] == v][contVar].values)

        # Plot the continuous variable
        title = "Histogram of "+contVar+"\n("+catVar+" = "+str(v)+")"
        row = int(count / 2)
        col = count % 2
        print(row, col)

        axs[row, col].hist(contVals, bins=20)
        axs[row, col].set_title(title)
        count += 1

    fig.show()
    
summarizeContinuousOnCategorical(dataDf, 'age_at_scan_days', 'rawdata_image_grade')

TypeError: '<' not supported between instances of 'str' and 'float'