In this exercise we will classify stimulus classes using the Haxby et al. data. You should first obtain the data using the command:

wget http://data.pymvpa.org/datasets/haxby2001/subj1-2010.01.14.tar.gz

then unpack the archive and set the datadir variable below to the path of the extracted subj1 directory.

In [91]:
import nipype.algorithms.modelgen as model   # model generation
import nipype.interfaces.fsl as fsl          # fsl
from nipype.interfaces.base import Bunch
import os,json,glob
import numpy
import nibabel
import nilearn.plotting
import sklearn.multiclass
from sklearn.svm import LinearSVC
import sklearn.metrics
import sklearn.cross_validation
import statsmodels.api as sm
from nilearn.input_data import NiftiMasker
import scipy.stats
import random

%matplotlib inline
import matplotlib.pyplot as plt


datadir='/Users/poldrack/data_unsynced/haxby/subj1'

print 'Using data from',datadir


tr=2.5

boldfile=os.path.join(datadir,'bold.nii.gz')
boldbrainfile=os.path.join(datadir,'bold_brain.nii.gz')
vtmaskfile=os.path.join(datadir,'mask4_vt.nii.gz')
brainmaskfile=os.path.join(datadir,'bold_brain_mask.nii.gz')

boldimg=nibabel.load(boldfile)

if not os.path.exists(brainmaskfile):
    bet=fsl.BET()
    bet.inputs.in_file=boldfile
    bet.inputs.out_file=boldfile.replace('.nii.gz','_brain.nii.gz')
    bet.inputs.functional=True
    bet.inputs.mask=True
    bet.run()


brainmaskimg=nibabel.load(brainmaskfile)
vtmaskimg=nibabel.load(vtmaskfile)


Using data from /Users/poldrack/data_unsynced/haxby/subj1


events

In [5]:
labelfile=os.path.join(datadir,'labels.txt')
# read every row of the label file, closing the handle as soon as we are done
with open(labelfile) as labelfh:
    lines=labelfh.readlines()
lines=lines[1:] # drop header

# find all block onsets
# Each of these lists gets one entry per detected stimulus block.
conditions=[]   # unique block name: the row's fields joined with '-'
condnums=[]     # integer condition code from cond_dict
onsets=[]       # one-element list with the block onset
durations=[]    # one-element list with the block duration
runs=[]         # integer taken from the second field of the block's first row
cond=''         # label of the block currently being traversed ('' = none)
condctr=1       # kept for compatibility; not referenced in the visible cells

# mapping from condition label (first field of a row) to integer code
cond_dict={'scissors':1,
 'face':2,
 'cat':3,
 'shoe':4,
 'house':5,
 'scrambledpix':6,
 'bottle':7,
 'chair':8}
condlabels=['scissors','face','cat','shoe','house','scrambledpix','bottle','chair']

for i,line in enumerate(lines):
    l_s=line.strip().split()
    if l_s[0]=='rest':
        # Rest ends the current block. Forget its label so that the same
        # condition showing up again after rest (e.g. last block of one run
        # and first block of the next) is registered as a new block instead
        # of being silently skipped.
        cond=''
        continue
    if not l_s[0]==cond:
        # first row of a new block
        cond=l_s[0]
        runs.append(int(l_s[1]))
        conditions.append('-'.join(l_s))
        condnums.append(cond_dict[l_s[0]])
        # NOTE(review): i is the 0-based row index after dropping the header,
        # so tr*(i+1) is one tr later than tr*i - confirm this offset is
        # intended.
        onsets.append([tr*(i+1)])
        durations.append([22.5])

# arrays so that boolean masks can be applied to them later
condnums=numpy.array(condnums)
runs=numpy.array(runs)

Set up model


In [6]:
modeldir=os.path.join(datadir,'blockmodel')
# no way to specify the output directory, so we just chdir into the 
# desired output directory
if not os.path.exists(modeldir):
    os.mkdir(modeldir)
os.chdir(modeldir)

# Define the design file path here so that matfile can be derived from it.
# Otherwise fsf_file is only assigned inside the model-estimation branch
# further down, and reading it at this point raises a NameError on a fresh
# run (or whenever that branch is skipped).
fsf_file=os.path.join(modeldir,'run0.fsf')
matfile=fsf_file.replace(".fsf",".mat")



Estimate the model with a separate condition for each block using FSL.  This will take several hours to finish.

In [8]:
# one T contrast per block condition: the contrast carries the condition's
# own name and puts a weight of 1 on that condition alone
contrasts=[[condname,'T',[condname],[1]] for condname in conditions]


# Estimate the block-wise GLM with FSL (driven through nipype). This is VERY
# slow, so the whole pipeline is skipped when a 'stats' directory already
# exists inside modeldir.
if not os.path.exists(os.path.join(modeldir,'stats')):
    
    # a single Bunch describing one run: every detected block is its own
    # condition with its own onset and duration
    info = [Bunch(conditions=conditions,
                  onsets=onsets,
                  durations=durations)
           ]
    print 'SpecifyModel'
    s = model.SpecifyModel()
    s.inputs.input_units = 'secs'
    s.inputs.functional_runs = [boldbrainfile]
    s.inputs.time_repetition = tr
    s.inputs.high_pass_filter_cutoff = 128.
    s.inputs.subject_info = info
    s.run()

    print 'level1design'
    level1design = fsl.model.Level1Design()
    level1design.inputs.interscan_interval = tr
    # 'dgamma' basis without derivative regressors
    level1design.inputs.bases = {'dgamma':{'derivs': False}}
    # the session info is taken from the interface's private _sessinfo
    # attribute; the return value of s.run() above is not kept
    level1design.inputs.session_info = s._sessinfo
    level1design.inputs.model_serial_correlations=False
    level1design.inputs.contrasts=contrasts
    level1info=level1design.run() 
    
    # presumably Level1Design wrote run0.fsf and the ev*txt files into the
    # current working directory (modeldir) - TODO confirm
    fsf_file=os.path.join(modeldir,'run0.fsf')
    event_files=glob.glob(os.path.join(modeldir,'ev*txt'))

    print 'modelgen'
    modelgen=fsl.model.FEATModel()
    modelgen.inputs.fsf_file=fsf_file
    modelgen.inputs.ev_files=event_files
    modelgen.run()

    print 'FILMGLS'
    # autocorr_noestimate=True goes together with
    # model_serial_correlations=False set on the design above
    fgls = fsl.FILMGLS(autocorr_noestimate=True)
    fgls.inputs.in_file =boldbrainfile
    # run0.mat / run0.con are expected next to run0.fsf in modeldir
    fgls.inputs.design_file = os.path.join(modeldir,'run0.mat')
    fgls.inputs.threshold = 10
    # the existence of this directory is what the guard at the top checks
    fgls.inputs.results_dir = os.path.join(modeldir,'stats')
    fgls.inputs.tcon_file=os.path.join(modeldir,'run0.con')
    res = fgls.run() 

else:
    print 'stats have already been run - using existing files'


SpecifyModel
level1design
modelgen
FILMGLS

INFO:interface:stdout 2015-07-16T16:45:35.737367:Log directory is: /Users/poldrack/data_unsynced/haxby/subj1/blockmodel/stats
INFO:interface:stdout 2015-07-16T16:45:56.452997:paradigm.getDesignMatrix().Nrows()=1452
INFO:interface:stdout 2015-07-16T16:45:56.453614:paradigm.getDesignMatrix().Ncols()=96
INFO:interface:stdout 2015-07-16T16:45:56.453614:sizeTS=1452
INFO:interface:stdout 2015-07-16T16:45:56.453614:numTS=42228
INFO:interface:stdout 2015-07-16T16:45:56.757459:Completed
INFO:interface:stdout 2015-07-16T16:45:56.757459:Prewhitening and Computing PEs...
INFO:interface:stdout 2015-07-16T16:45:56.757459:Percentage done:
INFO:interface:stdout 2015-07-16T19:20:00.605038:1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,Completed
INFO:interface:stdout




Load the zstat images that we will use as our block-by-block signal estimates

In [85]:
# False restricts the analysis to the VT mask; True uses the whole-brain mask
use_whole_brain=False

# 4D file holding one zstat volume per block
zstatfile=os.path.join(modeldir,'zstatdata.nii.gz')

# The stacked file is built only once; later executions reuse it from disk.
if not os.path.exists(zstatfile):
    nblocks=len(conditions)
    zstatdata=numpy.zeros(tuple(boldimg.shape[:3])+(nblocks,))
    for i in range(nblocks):
        # zstat files on disk are numbered starting from 1
        blockfile=os.path.join(modeldir,'stats/zstat%d.nii.gz'%int(i+1))
        zstatdata[:,:,:,i]=nibabel.load(blockfile).get_data()

    zstatimg=nibabel.Nifti1Image(zstatdata,affine=brainmaskimg.get_affine())
    zstatimg.to_filename(zstatfile)

# note: despite its name, maskimg holds a file path, not a loaded image
maskimg=brainmaskfile if use_whole_brain else vtmaskfile

nifti_masker = NiftiMasker(mask_img=maskimg, standardize=False)
fmri_masked = nifti_masker.fit_transform(zstatfile)

# keep only the blocks coded 2 (face) and 3 (cat) in cond_dict
condition_mask = (condnums == 2) | (condnums == 3)
fmri_masked = fmri_masked[condition_mask,:]
condlabels=condnums[condition_mask]
runlabels=runs[condition_mask]

array([2, 3, 2, 3, 3, 2, 3, 2, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2,
       3])

Now let's run a leave-one-run-out classifier.

In [95]:
def shuffle_within_runs(labels,runs):
    """Permute the entries of ``labels`` separately inside each run.

    ``labels`` is modified in place and also returned. Entries are only
    exchanged between positions that share the same value in ``runs``, so
    the set of labels belonging to every run stays the same.
    """
    for run_id in numpy.unique(runs):
        in_run=(runs==run_id)
        # boolean indexing yields a copy, so shuffle it and write it back
        run_labels=labels[in_run]
        random.shuffle(run_labels)
        labels[in_run]=run_labels
    return labels


def run_classifier(fmri_masked,condlabels,runs,baseclf,shuffle_labels=False):
    """Cross-validate a classifier, holding out one run label at a time.

    Parameters:
        fmri_masked: 2D data, indexed as [observation, feature].
        condlabels: class label of every observation.
        runs: run label of every observation; defines the held-out folds.
        baseclf: estimator offering fit/predict. It is wrapped in a
            one-vs-rest classifier when more than two classes are present.
        shuffle_labels: when True, the training labels of every fold are
            shuffled within runs before fitting.

    Returns:
        (pred, confmtx, acc): the held-out prediction for every observation,
        the confusion matrix and the accuracy, both computed against
        condlabels.
    """
    nclasses=len(numpy.unique(condlabels))
    estimator=baseclf if nclasses<=2 else sklearn.multiclass.OneVsRestClassifier(baseclf)

    pred=numpy.zeros(len(runs)) # predicted class, filled fold by fold
    folds=sklearn.cross_validation.LeaveOneLabelOut(labels=runs)

    for train_idx,test_idx in folds:
        # presumably integer index arrays, so this is a copy and shuffling
        # it leaves the caller's condlabels untouched - TODO confirm
        fold_labels=condlabels[train_idx]
        if shuffle_labels:
            shuffle_within_runs(fold_labels,runs[train_idx])
        estimator.fit(fmri_masked[train_idx,:],fold_labels)
        pred[test_idx]=estimator.predict(fmri_masked[test_idx,:])

    acc=sklearn.metrics.accuracy_score(condlabels,pred)
    confmtx=sklearn.metrics.confusion_matrix(condlabels,pred)
    return pred,confmtx,acc

# cross-validated classification with the true (unshuffled) labels
pred,confmtx,acc=run_classifier(fmri_masked,condlabels,runlabels,LinearSVC())



In [87]:
# report the confusion matrix and accuracy returned by run_classifier
print confmtx
print 'Accuracy score:',acc

[[9 3]
 [4 8]]
Accuracy score: 0.708333333333


In [96]:
# Build an empirical null distribution of accuracies: rerun the classifier
# nperms times with shuffle_labels=True and keep only the accuracy of each
# run (the predictions p and confusion matrix c are overwritten every time).
nperms=500
randacc=numpy.zeros(nperms)
for i in range(nperms):
    p,c,randacc[i]=run_classifier(fmri_masked,condlabels,runlabels,LinearSVC(),shuffle_labels=True)


In [97]:
pct=scipy.stats.percentileofscore(randacc,acc)
print 'Pval:',(100-pct)/100.0


Pval: 0.013
