## Prepare metadata

This notebook consolidates the preparation of metadata for the analyses. It requires that narps.py or PrepareMaps.ipynb has already been run.


In [1]:
import os,sys,glob,warnings
import matplotlib.pyplot as plt
import numpy,pandas
import nilearn.input_data

from narps import Narps
from utils import get_merged_metadata_decisions

# set up directories
# The base directory can be overridden with the NARPS_BASEDIR environment
# variable so the notebook is runnable on other machines; when the variable
# is unset the original hard-coded location is used.
basedir = os.environ.get('NARPS_BASEDIR',
                         '/Users/poldrack/data_unsynced/NARPS')

metadata_dir = os.path.join(basedir,'metadata')
# exist_ok=True makes this a no-op when the directory is already present
# (no exists()/mkdir() race) and also creates any missing parent directory.
os.makedirs(metadata_dir, exist_ok=True)

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [2]:
narps = Narps(basedir,overwrite=False)
narps.load_data()

# check zstat data: for every team with a complete image set, collect the
# hypothesis numbers (1-9) that have no unthresholded zstat image
missing_zstat = {}
for teamID in narps.complete_image_sets:
    for hyp in range(1,10):
        if hyp in narps.teams[teamID].images['unthresh']['zstat']:
            continue
        # first miss for a team creates its list, later misses extend it
        missing_zstat.setdefault(teamID,[]).append(hyp)

print('teams with missing zstat images:')
print(missing_zstat)

found 54 input directories
found 54 teams with complete original datasets
teams with missing zstat images:
{'I9D6': [1, 2, 3, 4, 5, 6, 7, 8, 9]}


In [3]:
# get original image and decision metadata, then report how many
# distinct teams it covers
alldata_df = get_merged_metadata_decisions()
n_metadata_teams = len(alldata_df['teamID'].unique())
print('found merged metadata for %d teams'%n_metadata_teams)


found merged metadata for 70 teams


Defaulting to column, but this will raise an ambiguity error in a future version
  alldata_df = decision_df.merge(metadata,on='teamID',how='left')


In [4]:
# cast the varnum column to int; all other columns keep their dtype
alldata_df = alldata_df.astype({'varnum': 'int'})

# recode variables to make analysis cleaner

def package_recoder(p):
    """Collapse a software label into a coarser category.

    Parameters
    ----------
    p : object
        Software label. Anything that is not a ``str`` (for example a
        missing value) is mapped to 'Other'.

    Returns
    -------
    str
        'SPM' for any label that starts with 'SPM'; 'Other' for
        'nistats', 'PALM', 'randomise' and non-string input; otherwise
        the label itself, unchanged.
    """
    if not isinstance(p, str):
        return 'Other'
    if p.startswith('SPM'):
        return 'SPM'
    return 'Other' if p in ('nistats', 'PALM', 'randomise') else p

# recoded software category for every row, derived from the TSc_SW column
alldata_df['software'] = list(map(package_recoder, alldata_df['TSc_SW']))



In [5]:
# load smoothness data and report which teams with complete image sets
# have no smoothness estimates
smoothness_file = os.path.join(metadata_dir,'smoothness_est.csv')
smoothness_df = pandas.read_csv(smoothness_file)

smoothness_teams = smoothness_df['teamID'].unique()
print("found smoothness data for %d teams"%len(smoothness_teams))
print('missing smoothness data for:')
print(set(narps.complete_image_sets) - set(smoothness_teams))

# 16IN: used nonstandard stats, not usable for group analysis

# attach the smoothness columns to the metadata, matching each
# (teamID, varnum) row to the (teamID, hyp) row of the smoothness table;
# a left join keeps metadata rows that have no smoothness estimate
alldata_df = alldata_df.merge(smoothness_df,
                              how='left',
                              left_on=['teamID','varnum'],
                              right_on=['teamID','hyp'])

# average FWHM estimated as:
# AvgFWHM = RESELS^(1/3) (multiplied by 2 since this value is in voxels
# rather than mm)
# per: https://www.jiscmail.ac.uk/cgi-bin/webadmin?A2=FSL;e792b5da.0803

alldata_df['fwhm'] = [2*resel**(1/3.) for resel in alldata_df['resels']]

found smoothness data for 53 teams
missing smoothness data for:
{'16IN'}


In [7]:
# save data for loading into R: write the merged metadata table to
# all_metadata.csv inside the metadata directory
all_metadata_file = os.path.join(metadata_dir,'all_metadata.csv')
alldata_df.to_csv(all_metadata_file)