# Import and process brain data from BOLD5000

In [68]:
import numpy as np
import pandas as pd
import os
import glob

# 3 complete subjects in the BOLD5000 dataset, downloaded from here: https://bold5000-dataset.github.io/website/
subjects = ['CSI1','CSI2','CSI3']
regions = ['LHEarlyVis','LHPPA','LHRSC','RHPPA','RHEarlyVis','RHOPA','RHRSC','LHOPA','LHLOC','RHLOC'] 

#TODO:
## ASHISH - this does not appear to be true to me: https://figshare.com/articles/dataset/BOLD5000_Release_2_0/14456124?file=28404132
#region = ['LHEarlyVis','LHPPA','LHRSC','RHPPA','RHEarlyVis','RHOPA'']
#RHRSC,'LHOPA','LHLOC','RHLOC' not in all 4 participants

# Data folder for .npy files relative to code location on SCANN Lab dropbox
brain_related_data_folder = os.path.join('..','..','..','data','brain_related_v.2.0','BOLD5000_GLMsingle_ROI_betas','py')

# Get all .npy files from that folder
roi_file_list = glob.glob(os.path.join(brain_related_data_folder,'*.npy'))

# Looking for 30 files (10 ROIs per 4 subj)
assert(len(roi_file_list) == 40)
print(roi_file_list[0])

..\..\..\data\brain_related_v.2.0\BOLD5000_GLMsingle_ROI_betas\py\CSI1_GLMbetas-TYPED-FITHRF-GLMDENOISE-RR_allses_LHEarlyVis.npy


## Handle the image names

In [33]:
# Our list of images, taken from the linguistic task data:
catscenes_ling = pd.read_csv(os.path.join('..','..','data','lingDirectionsAverage.csv'))
catscenes_imgnames = catscenes_ling['presentedImage'].to_list()

500

In [66]:
def process_img_list(bold5000_imgname_file,catscenes_imgnames=catscenes_imgnames,verbose=False):
    """
    A function that will remove duplicates. Previously, BOLD5000 stimlists had "rep"
       appended to each subsequent presentation of an image. Those "rep" prefixes 
       have been removed, so we need to remove duplicates ourselves. 
       We then also need to remove all images that we never presented."""
    
    with open(bold5000_imgname_file,'r') as f:
        img_list = pd.read_csv(f,header=None,names=['imgnames'])
        
        if verbose:
            print(img_list.head())
            print(f'Before dropping duplicates, length = {len(img_list)}')
        
        # Remove all duplicates besides the first image
        img_list.drop_duplicates(keep='first',inplace=True)
        
        if verbose:
            print(f'After dropping duplicates, length = {len(img_list)}')
        
        # Only keep images on the list IF they appeared in the Catscenes study.
        img_list = img_list[img_list['imgnames'].isin(catscenes_imgnames)]
        
    return img_list



# List of image names as presented per subject.
bold5000_imgname_folder = os.path.join('..','..','..','data','brain_related_v.2.0','BOLD5000_imgnames')
bold5000_imgname_file_list = glob.glob(os.path.join(bold5000_imgname_folder,'*'))


# For each subject, return a list of indices corresponding to the column numbers we want to keep. 
# Save that out as a dictionary with subj as key and the list of indices as a list. 
img_list_indices = {}
img_list_names = {}
for bold5000_imgname_file in bold5000_imgname_file_list:
    img_list = pd.DataFrame()
    subj = bold5000_imgname_file.split(os.sep)[-1].split('_')[0]
    img_list = process_img_list(bold5000_imgname_file)
    img_list_names[subj] = img_list['imgnames'].to_list()
    img_list_indices[subj] = img_list.index.to_list()


In [None]:
for file in roi_file_list:
    
    beta = np.load(file)
    subj = file.split(os.sep)[-1].split('_')[0]

    beta = beta[img_list_indices[subj],:]
    beta = np.transpose(beta)

    df = pd.DataFrame(beta,columns=img_list_names[subj])

    df.corr()

Unnamed: 0,fabricstore1.jpg,church6.jpg,elevatorOutside2.jpg,homegarage4.jpg,doctor1.jpg,pasture9.jpg,horsebarn8.jpg,exerciseequipment5.jpg,igloo5.jpg,sidewalk1.jpg,...,hospitalroom3.jpg,minigolf8.jpg,hallway4.jpg,cave4.jpg,frontFoyer.jpg,wineVineyard2.jpg,waitingroom2.jpg,grocerystore2.jpg,boardwalk2.jpg,dinosaur4.jpg
fabricstore1.jpg,1.000000,0.144126,0.046670,0.074104,-0.037320,-0.097124,-0.022789,0.196476,0.061109,0.041431,...,0.069691,0.029018,0.087606,0.158333,-0.012044,-0.159289,-0.006764,0.088616,-0.138163,-0.045166
church6.jpg,0.144126,1.000000,0.116244,0.271614,-0.018719,0.193379,0.225178,0.426336,-0.101551,0.182404,...,0.013445,-0.232829,-0.275143,0.099531,0.130915,-0.066405,0.018623,-0.083731,0.175491,0.214705
elevatorOutside2.jpg,0.046670,0.116244,1.000000,0.192853,0.233806,0.129046,0.025139,0.151784,0.118785,-0.041848,...,-0.030120,0.037072,-0.214824,0.018494,-0.008493,-0.062885,0.070037,0.135040,0.149068,0.017131
homegarage4.jpg,0.074104,0.271614,0.192853,1.000000,0.230818,0.202519,0.052414,0.310539,0.039376,0.007255,...,-0.016209,0.022903,-0.333890,-0.018496,-0.144737,-0.291713,-0.051264,-0.011218,0.129360,0.087653
doctor1.jpg,-0.037320,-0.018719,0.233806,0.230818,1.000000,-0.023840,0.058951,0.093558,0.004223,-0.064162,...,0.024702,-0.043384,-0.056684,-0.057831,0.113135,-0.094775,-0.060862,-0.073344,0.051183,-0.041858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wineVineyard2.jpg,-0.159289,-0.066405,-0.062885,-0.291713,-0.094775,-0.060458,-0.060774,-0.243024,0.013912,0.034222,...,-0.024850,0.004675,0.163505,0.070686,0.060482,1.000000,0.060756,0.046123,-0.099594,-0.039368
waitingroom2.jpg,-0.006764,0.018623,0.070037,-0.051264,-0.060862,0.070435,0.108554,-0.136303,0.002115,0.115945,...,-0.049073,0.010416,-0.108764,0.163600,-0.017453,0.060756,1.000000,0.249120,0.157742,0.003490
grocerystore2.jpg,0.088616,-0.083731,0.135040,-0.011218,-0.073344,0.115932,0.160076,-0.044111,0.014640,0.195911,...,0.040501,-0.038570,-0.083779,0.276696,-0.111188,0.046123,0.249120,1.000000,-0.066481,-0.115584
boardwalk2.jpg,-0.138163,0.175491,0.149068,0.129360,0.051183,0.045236,0.117489,0.066367,-0.163041,0.094749,...,0.023578,-0.152641,-0.131742,-0.146524,0.016088,-0.099594,0.157742,-0.066481,1.000000,0.208054


In [98]:
ling = pd.read_csv('lingDirectionsAverage.csv')

In [99]:
#get 500 images used in behavioral exps
used_img = ling.iloc[:,0] 

In [100]:
for file in file_list:
    beta_inv = np.load(file)
    beta = np.transpose(beta_inv)
    # load below to create original data csv
    #img_file = '{}/{}_imgnames.txt'.format(folder,file[32:36])
    #load below to create 500 image csv
    img_file = '{}/{}_imgnames_REPEAT.txt'.format(folder,file[32:36])
    my_file = open(img_file, "r")

    # reading the file
    data = my_file.read()

    # replacing end splitting the text
    # when newline ('\n') is seen.
    data_into_list = data.split("\n")
    #remove empty last line -> last line is empty is all txt files here
    data_into_list.pop(-1)
    
    df = pd.DataFrame(beta, columns = data_into_list)
    #only load the line below if creating original data csv
    #df.to_csv('npy_csv/{}.csv'.format(file[32:-4]),sep=',')
    
    df_used_img = df[used_img]
    df_used_img.to_csv('npy_csv/{}_500img.csv'.format(file[32:-4]),sep=',')
    
    
    
    

FileNotFoundError: [Errno 2] No such file or directory: 'BOLD5000_GLMsingle_ROI_betas/py/CSI4_imgnames_REPEAT.txt'

### above error is ok since CSI4 does not have all the 500 images used. So 500 image csvs are only created for CSV1,2 and 3
### The 500img files end at column SG which is 501th column (as first column is index)

## Analog load

In [68]:
analog = pd.read_csv('analogData_37_bins_Average.csv')

In [74]:
analog_img_list = analog.iloc[:,0]

In [75]:
analog_d = analog.iloc[:,1:]

In [76]:
analog_d_t = analog_d.transpose()

In [77]:
analog_d_t_np = analog_d_t.to_numpy()

In [78]:
analog_final = pd.DataFrame(analog_d_t_np, columns = analog_img_list)

In [82]:
analog_final.shape

(36, 500)

In [83]:
analog_final.to_csv('analog.csv',sep=',')

In [84]:
analog_final_corr = analog_final.corr()

In [85]:
analog_final_corr

presentedImage,ATM1.jpg,ATM4.jpg,HorseRaceTrack.jpg,RVinside2.jpg,ShowJumping7.jpg,airplanecabin1.jpg,airplanecabin3.jpg,airplanecabin5.jpg,airportTerminal2.jpg,airportTerminal3.jpg,...,windmill2.jpg,wineVineyard2.jpg,wineVineyard8.jpg,wineVineyard9.jpg,winebarrel2.jpg,winebarrel3.jpg,winebarrel4.jpg,wrestlingring7.jpg,yogastudio1.jpg,yogastudio4.jpg
presentedImage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ATM1.jpg,1.000000,0.565262,0.234142,0.199746,0.606473,0.163178,0.260117,0.772762,0.464397,0.320809,...,0.457129,0.691831,0.184805,0.132695,0.773834,0.183432,0.466301,0.175463,0.341356,0.242387
ATM4.jpg,0.565262,1.000000,0.437641,0.186456,0.408795,0.192994,0.222398,0.711498,0.520995,0.512185,...,0.847567,0.472978,0.172831,0.162108,0.620954,0.208457,0.497019,0.665984,0.424796,0.401926
HorseRaceTrack.jpg,0.234142,0.437641,1.000000,0.287553,0.403356,0.271165,0.438576,0.121306,0.385293,0.253454,...,0.121485,0.369103,0.240910,0.255240,0.411251,0.310607,0.582250,0.013057,0.521599,0.360625
RVinside2.jpg,0.199746,0.186456,0.287553,1.000000,0.421274,0.967758,0.546430,0.010584,0.601546,0.166745,...,0.113625,-0.006861,0.973006,0.932706,0.043783,0.983445,0.717856,0.267640,0.915254,0.468631
ShowJumping7.jpg,0.606473,0.408795,0.403356,0.421274,1.000000,0.295621,0.777892,0.263932,0.618862,0.572013,...,0.382402,0.224379,0.344938,0.245907,0.236967,0.346023,0.486102,0.336830,0.579032,0.583490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
winebarrel3.jpg,0.183432,0.208457,0.310607,0.983445,0.346023,0.992704,0.445759,0.038578,0.571394,0.087022,...,0.119049,0.045037,0.968752,0.956173,0.099789,1.000000,0.747091,0.229469,0.895047,0.403675
winebarrel4.jpg,0.466301,0.497019,0.582250,0.717856,0.486102,0.723106,0.345366,0.314365,0.700549,0.194078,...,0.313345,0.573969,0.692372,0.727761,0.570744,0.747091,1.000000,0.202146,0.828000,0.428024
wrestlingring7.jpg,0.175463,0.665984,0.013057,0.267640,0.336830,0.216656,0.337847,0.319189,0.542897,0.689507,...,0.729602,-0.073156,0.227584,0.210313,0.027860,0.229469,0.202146,1.000000,0.394874,0.602414
yogastudio1.jpg,0.341356,0.424796,0.521599,0.915254,0.579032,0.862125,0.674636,0.149009,0.814979,0.447268,...,0.231969,0.174112,0.859431,0.830443,0.223844,0.895047,0.828000,0.394874,1.000000,0.708577


In [86]:
analog_final_corr.to_csv('analog_corr.csv',sep=',')

## Linguistic data load

In [87]:
ling = pd.read_csv('lingDirectionsAverage.csv')

In [88]:
ling_imgs = ling.iloc[:,0]

In [89]:
ling_d = ling.iloc[:,1:]

In [90]:
ling_d_t = ling_d.transpose()

In [91]:
ling_d_t_np = ling_d_t.to_numpy()

In [92]:
ling_final = pd.DataFrame(ling_d_t_np, columns = ling_imgs)

In [93]:
ling_final.shape

(7, 500)

In [94]:
ling_final.to_csv('ling.csv',sep=',')

In [95]:
ling_final_corr = ling_final.corr()

In [96]:
ling_final_corr

presentedImage,ATM1.jpg,ATM4.jpg,HorseRaceTrack.jpg,RVinside2.jpg,ShowJumping7.jpg,airplanecabin1.jpg,airplanecabin3.jpg,airplanecabin5.jpg,airportTerminal2.jpg,airportTerminal3.jpg,...,windmill2.jpg,wineVineyard2.jpg,wineVineyard8.jpg,wineVineyard9.jpg,winebarrel2.jpg,winebarrel3.jpg,winebarrel4.jpg,wrestlingring7.jpg,yogastudio1.jpg,yogastudio4.jpg
presentedImage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ATM1.jpg,1.000000e+00,-0.492833,0.588235,-0.622745,-0.420084,-4.200840e-01,-3.823529e-01,0.719676,-0.019672,0.139741,...,-0.160422,0.485530,-4.200840e-01,-0.530662,0.575224,-4.200840e-01,-4.609772e-01,7.486371e-01,-0.488094,7.271084e-17
ATM4.jpg,-4.928329e-01,1.000000,-0.385026,0.900896,0.659912,9.165445e-01,1.694113e-01,-0.297511,0.535656,-0.276432,...,0.252008,0.049743,9.165445e-01,0.963293,0.150604,9.165445e-01,6.247580e-01,-5.040161e-01,0.925778,-2.057637e-01
HorseRaceTrack.jpg,5.882353e-01,-0.385026,1.000000,-0.622745,-0.746816,-4.200840e-01,-3.823529e-01,0.454532,-0.570495,-0.512383,...,-0.641689,0.116105,-4.200840e-01,-0.406841,0.441005,-4.200840e-01,-6.507914e-01,2.138963e-01,-0.564019,-6.549210e-01
RVinside2.jpg,-6.227455e-01,0.900896,-0.622745,1.000000,0.661650,9.547859e-01,4.749754e-01,-0.604897,0.670679,-0.005572,...,0.402995,-0.219697,9.547859e-01,0.939498,-0.247717,9.547859e-01,6.617241e-01,-4.221857e-01,0.973124,9.401268e-02
ShowJumping7.jpg,-4.200840e-01,0.659912,-0.746816,0.661650,1.000000,6.111111e-01,9.335201e-02,-0.030056,0.593171,0.271048,...,0.763763,0.309887,6.111111e-01,0.673722,0.015215,6.111111e-01,9.467293e-01,-8.486251e-02,0.740170,4.157397e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
winebarrel3.jpg,-4.200840e-01,0.916544,-0.420084,0.954786,0.611111,1.000000e+00,4.200840e-01,-0.450835,0.702439,-0.073922,...,0.381881,-0.100504,1.000000e+00,0.968475,-0.091287,1.000000e+00,6.454972e-01,-2.545875e-01,0.981156,-1.182242e-16
winebarrel4.jpg,-4.609772e-01,0.624758,-0.650791,0.661724,0.946729,6.454972e-01,2.711631e-01,-0.087304,0.580381,0.186094,...,0.739510,0.253012,6.454972e-01,0.701253,-0.141421,6.454972e-01,1.000000e+00,9.005574e-17,0.750000,3.622844e-01
wrestlingring7.jpg,7.486371e-01,-0.504016,0.213896,-0.422186,-0.084863,-2.545875e-01,-1.870097e-16,0.482063,0.214599,0.508131,...,0.333333,0.345425,-2.545875e-01,-0.385922,0.069722,-2.545875e-01,9.005574e-17,1.000000e+00,-0.276084,4.762897e-01
yogastudio1.jpg,-4.880935e-01,0.925778,-0.564019,0.973124,0.740170,9.811558e-01,4.121679e-01,-0.426042,0.739986,-0.005726,...,0.473286,-0.035032,9.811558e-01,0.965446,-0.106066,9.811558e-01,7.500000e-01,-2.760837e-01,1.000000,9.660918e-02


In [97]:
ling_final_corr.to_csv('ling_corr.csv',sep=',')