In [1]:
import numpy as np
import string 
import scipy.io
from sklearn.metrics import mean_squared_error
from scipy.spatial import distance
import itertools
from sklearn import svm
import statsmodels.api as sm # import statsmodels 
from scipy import signal


class Scan(object):
    def __init__(self,activations,timestamp, step,prev_words=None,next_words=None,all_words=None,all_pos=None):
        self.activations = activations
        self.timestamp = timestamp
        self.prev_words = prev_words
        self.next_words = next_words
        self.step = step
        self.all_words = all_words
        self.all_pos = all_pos
        self.brain3d = None
        
def eval(dists,e_dists):
    nn_index = np.argmin(dists,axis=1)
    accuracy_on_test = np.mean(nn_index == np.argmax(np.eye(dists.shape[0]),axis=1))


    b_acc = []
    e_b_acc = []
    for i,j in itertools.combinations(np.arange(dists.shape[0]), 2):
        right_match = dists[i,i] + dists[j,j]
        wrong_match = dists[i,j] + dists[j,i]
        b_acc.append(right_match < wrong_match)

        e_right_match = e_dists[i,i] + e_dists[j,j]
        e_wrong_match = e_dists[i,j] + e_dists[j,i]
        e_b_acc.append(e_right_match < e_wrong_match)

    #print("binary accuracy: ", np.mean(b_acc)," ", np.mean(e_b_acc))
    return np.mean(b_acc),np.mean(e_b_acc),b_acc,e_b_acc

  from pandas.core import datetools


In [2]:
story_features = scipy.io.loadmat('story_features.mat') 

In [3]:
speach_feature_id = 1
motion_feature_id = 2
emotion_feature_id = 3
verbs_feature_id = 4
characters_feature_id = 5
visual_wordlength_feature_id = 6
Word_Num_feature_id = 7
part_of_speaches_feature_id = 8
Dependency_role_feature_id = 9

In [4]:
part_of_speaches = story_features['features'][0][part_of_speaches_feature_id][1][0]

In [11]:
subject_id = 1

block_pos = {}

for block_id in [1,2,3,4]:
    block_pos[block_id] = np.load("subject_"+str(subject_id)+"_block_"+str(block_id)+"_pos.npy")

block_scans = np.load("subject_"+str(subject_id)+"_scan_objects.npy")

In [12]:
print(len(block_scans.item()[1]))
print(len(block_scans.item()[2]))
print(len(block_scans.item()[3]))
print(len(block_scans.item()[4]))

326
338
265
366


In [13]:
embeddings_0 = np.load("../data/subject_"+str(1)+"_lstm_"+str(0)+"_emb_objects.npy")
embeddings_1 = np.load("../data/subject_"+str(1)+"_lstm_"+str(1)+"_emb_objects.npy")

In [16]:
block_scans.item()[block_id][0]

<__main__.Scan at 0x149fed438>

In [20]:
detrended_block_scans = {1:[],2:[],3:[],4:[]}
raw_block_scans = {1:[],2:[],3:[],4:[]}

for block_id in [1,2,3,4]:
        for i in np.arange(len(block_scans.item()[block_id])):
            raw_block_scans[block_id].append(block_scans.item()[block_id][i].activations[0])
                           
        detrended_block_scans[block_id] = signal.detrend(raw_block_scans[block_id],type="constant")

In [21]:
def prepare_data(train_block_ids,test_block_ids):
    train_features = {'position':[],'pos_tag':[],'lstm_1':[],'lstm_0':[],'lstm_prev_1':[],'lstm_prev_0':[]}
    train_brain_activations = []
    for train_block_id in train_block_ids:
        for i in np.arange(len(block_scans.item()[train_block_id])):
            scan = block_scans.item()[train_block_id][i]
            if (scan.step - 4) in embeddings_1.item()[train_block_id].keys():
                train_features['position'].append(scan.step)
                train_features['pos_tag'].append(np.sum(scan.all_pos,axis=0))
                train_features['lstm_1'].append(np.mean(embeddings_1.item()[train_block_id][scan.step],axis=0))
                train_features['lstm_0'].append(np.mean(embeddings_0.item()[train_block_id][scan.step],axis=0))
                train_features['lstm_prev_1'].append(np.mean(embeddings_1.item()[train_block_id][scan.step-4],axis=0))
                train_features['lstm_prev_0'].append(np.mean(embeddings_0.item()[train_block_id][scan.step-4],axis=0))
                train_brain_activations.append(detrended_block_scans[block_id][i])
        #print(scan.step)

    test_features = {'position':[],'pos_tag':[],'lstm_1':[],'lstm_0':[],'lstm_prev_1':[],'lstm_prev_0':[]}
    test_brain_activations = []
    for test_block_id in test_block_ids:
        for scan in block_scans.item()[test_block_id]:
            if (scan.step - 4) in embeddings_1.item()[test_block_id].keys():
                test_features['position'].append(scan.step)
                test_features['pos_tag'].append(np.sum(scan.all_pos,axis=0))
                test_features['lstm_1'].append(np.mean(embeddings_1.item()[test_block_id][scan.step],axis=0))
                test_features['lstm_0'].append(np.mean(embeddings_0.item()[test_block_id][scan.step],axis=0))
                test_features['lstm_prev_1'].append(np.mean(embeddings_1.item()[test_block_id][scan.step-4],axis=0))
                test_features['lstm_prev_0'].append(np.mean(embeddings_0.item()[test_block_id][scan.step-4],axis=0))
                test_brain_activations.append(scan.activations[0])
            #print(scan.step)
    return train_features,train_brain_activations,test_features,test_brain_activations


def train_model(X,y):
    #X = sm.add_constant(X) ## let's add an intercept (beta_0) to our model
    # Note the difference in argument order
    model = sm.OLS(y, X).fit() ## sm.OLS(output, input)
    
    return model


def MRR(distances):
    prec_at_corrects = []
    ranks = []
    sorted_indexes = np.argsort(distances,axis=1)
    for i in np.arange(len(distances)):
        #print(i)
        correct_at = np.where(sorted_indexes[i] == i)[0] + 1
        #print("Reciprocal Rank",correct_at)
        prec_at_correct = 1.0/correct_at
        #print("precision at ",correct_at,": ",prec_at_correct)
        prec_at_corrects.append(prec_at_correct)
        ranks.append(correct_at)
    
    print("MRR: ",np.mean(prec_at_corrects)," ",np.mean(ranks))
    return np.mean(ranks), np.mean(prec_at_corrects), ranks,prec_at_corrects

def test_model(model,X_t,y_t):
    #X_t = sm.add_constant(X_t) ## let's add an intercept (beta_0) to our model
    pred_t = model.predict(X_t)
    
    cosine_dists = distance.cdist(pred_t,y_t,'cosine')
    euc_dists =  distance.cdist(pred_t,y_t,'euclidean')
    
    print("cosine dist >>")
    mean_ranks_c = MRR(cosine_dists)
    
    print("euc_dists dist >>")
    mean_ranks_e = MRR(euc_dists)
    
    print("binary accuracy >>")
    c_acc, e_acc, _,_ = eval(cosine_dists,euc_dists)
    print(c_acc,e_acc)
    
    return c_acc, e_acc

    


In [22]:

train_block_ids = [1,2,3]
test_block_ids = [4]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['position'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['position'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)


train_block_ids = [1,2,4]
test_block_ids = [3]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['position'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['position'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)



train_block_ids = [1,3,4]
test_block_ids = [2]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['position'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['position'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)



train_block_ids = [2,3,4]
test_block_ids = [1]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['position'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['position'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)



cosine dist >>
MRR:  0.017749266454767033   183.0
euc_dists dist >>
MRR:  0.017788830331274045   182.66849315068492
binary accuracy >>
0.46637061568568416 0.7010236339003463
cosine dist >>
MRR:  0.02331461178702536   132.5
euc_dists dist >>
MRR:  0.023328338620199285   132.3598484848485
binary accuracy >>
0.4685447632215693 0.859027537734762
cosine dist >>
MRR:  0.0189874823175414   169.0
euc_dists dist >>
MRR:  0.018999607511832245   168.89317507418397
binary accuracy >>
0.4976685036032217 0.7276211671612265
cosine dist >>
MRR:  0.01957716467635644   163.0
euc_dists dist >>
MRR:  0.01957794248271313   162.90769230769232
binary accuracy >>
0.4897435897435897 0.5096486229819563


(0.4897435897435897, 0.5096486229819563)

In [23]:
train_block_ids = [1,2,3]
test_block_ids = [4]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['pos_tag'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['pos_tag'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)

train_block_ids = [1,2,4]
test_block_ids = [3]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['pos_tag'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['pos_tag'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)

train_block_ids = [1,3,4]
test_block_ids = [2]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['pos_tag'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['pos_tag'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)

train_block_ids = [2,3,4]
test_block_ids = [1]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = train_features['pos_tag'] ## X usually means our input variables (or independent variables)
y = train_brain_activations ## Y usually means our output/dependent variable
X_t = test_features['pos_tag'] ## X usually means our input variables (or independent variables)
y_t = test_brain_activations ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)


cosine dist >>
MRR:  0.016317585377942964   182.3041095890411
euc_dists dist >>
MRR:  0.017681453576808486   182.9972602739726
binary accuracy >>
0.5198554869787746 0.494219479150986
cosine dist >>
MRR:  0.02350106051784622   132.57575757575756
euc_dists dist >>
MRR:  0.023629706591473253   132.49621212121212
binary accuracy >>
0.48476206936282984 0.5079214195183777
cosine dist >>
MRR:  0.018791403637603458   168.459940652819
euc_dists dist >>
MRR:  0.018986758140129116   168.94955489614244
binary accuracy >>
0.5209658047195139 0.5029143704959729
cosine dist >>
MRR:  0.020141206267954143   162.71384615384616
euc_dists dist >>
MRR:  0.01962027757690457   162.96615384615384
binary accuracy >>
0.5110921177587844 0.4982905982905983


(0.5110921177587844, 0.4982905982905983)

In [24]:
train_block_ids = [1,2,3]
test_block_ids = [4]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

model = train_model(X,y)
test_model(model,X_t,y_t)


train_block_ids = [1,2,4]
test_block_ids = [3]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

print(X.shape,y.shape)
print(X_t.shape,y_t.shape)
model = train_model(X,y)
test_model(model,X_t,y_t)

train_block_ids = [1,3,4]
test_block_ids = [2]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

print(X.shape,y.shape)
print(X_t.shape,y_t.shape)
model = train_model(X,y)
test_model(model,X_t,y_t)

train_block_ids = [2,3,4]
test_block_ids = [1]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

print(X.shape,y.shape)
print(X_t.shape,y_t.shape)
model = train_model(X,y)
test_model(model,X_t,y_t)

cosine dist >>
MRR:  0.031113001393879562   141.2109589041096
euc_dists dist >>
MRR:  0.018259458326131194   181.84657534246574
binary accuracy >>
0.6964323347884992 0.6950173114556676
(1027, 2048) (1027, 37913)
(264, 2048) (264, 37913)
cosine dist >>
MRR:  0.02294183217565086   130.74242424242425
euc_dists dist >>
MRR:  0.023170051504614507   132.4810606060606
binary accuracy >>
0.532204170987441 0.5296405115796751
(954, 2048) (954, 37913)
(337, 2048) (337, 37913)
cosine dist >>
MRR:  0.023565687019170306   167.47477744807122
euc_dists dist >>
MRR:  0.01947748384418863   168.9673590504451
binary accuracy >>
0.5269005228204041 0.5109509679242616
(966, 2048) (966, 37913)
(325, 2048) (325, 37913)
cosine dist >>
MRR:  0.017000900528725903   153.62153846153845
euc_dists dist >>
MRR:  0.019657981849459615   162.87384615384616
binary accuracy >>
0.6102754036087369 0.612630579297246


(0.6102754036087369, 0.612630579297246)

In [179]:
train_block_ids = [1,2,3]
test_block_ids = [4]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

model = train_model(X[delay:],y[:-delay])
test_model(model,X_t[delay:],y_t)


train_block_ids = [1,2,4]
test_block_ids = [3]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

print(X.shape,y.shape)
print(X_t.shape,y_t.shape)
model = train_model(X[delay:],y)
test_model(model,X_t[delay:],y_t)

train_block_ids = [1,3,4]
test_block_ids = [2]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

print(X.shape,y.shape)
print(X_t.shape,y_t.shape)
model = train_model(X[delay:],y)
test_model(model,X_t[delay:],y_t)

train_block_ids = [2,3,4]
test_block_ids = [1]
train_features,train_brain_activations,test_features,test_brain_activations = \
                                                prepare_data(train_block_ids,test_block_ids)


X = np.concatenate([train_features['lstm_0'],train_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y = np.asarray(train_brain_activations) ## Y usually means our output/dependent variable
X_t = np.concatenate([test_features['lstm_0'],test_features['lstm_1']],axis=1) ## X usually means our input variables (or independent variables)
y_t = np.asarray(test_brain_activations) ## Y usually means our output/dependent variable

print(X.shape,y.shape)
print(X_t.shape,y_t.shape)
model = train_model(X[delay:],y)
test_model(model,X_t[delay:],y_t)

cosine dist >>
MRR:  0.01888116722778134   176.43013698630136
euc_dists dist >>
MRR:  0.014564612117998933   171.1917808219178
binary accuracy >>
0.5642781875658588 0.5606051482763812
(1027, 2048) (1027, 37913)
(264, 2048) (264, 37913)
cosine dist >>
MRR:  0.02715845481856617   112.67424242424242
euc_dists dist >>
MRR:  0.032444442325211285   117.51136363636364
binary accuracy >>
0.6108998732572877 0.5788397280792718
(954, 2048) (954, 37913)
(337, 2048) (337, 37913)
cosine dist >>
MRR:  0.02622329823714948   169.7507418397626
euc_dists dist >>
MRR:  0.01564038901589837   181.30563798219583
binary accuracy >>
0.4699908153172248 0.41380528472516603
(966, 2048) (966, 37913)
(325, 2048) (325, 37913)
cosine dist >>
MRR:  0.02597350361830614   152.58461538461538
euc_dists dist >>
MRR:  0.02628569901668747   142.9353846153846
binary accuracy >>
0.6239126305792972 0.5992022792022792


(0.6239126305792972, 0.5992022792022792)

In [148]:
len(X[1])

4

In [67]:
true_mu = 361 / 2.0

onesample_results = scipy.stats.ttest_1samp(ranks, true_mu)
print("ranks: ", onesample_results)

ranks:  Ttest_1sampResult(statistic=array([0.27975532]), pvalue=array([0.7798237]))
