In [1]:
#from src.load_zuco_data import *
import numpy as np
import torch 
import pandas as pd
import json
from pathlib import Path
from src.load_zuco_sentences import *

# tokenizer
from transformers import GPT2Tokenizer
# transformer lens
import tqdm.auto as tqdm
import transformer_lens.utils as utils
from transformer_lens import HookedTransformer


  from .autonotebook import tqdm as notebook_tqdm


# LOAD SENTENCES from ZUCO data and SAVE into a CSV 
Do not need to re-run this

In [2]:
# load sentences into a data frame 
sentence_data_file = '../zuco-benchmark/portable_data/sentence_content.json'
# this file has all the sentences in the dataset seen by each subject
df_sentences = load_zuco_dataframe(sentence_data_file)
# we don't need the subject data, we just need the sentences - which will have unique indices for a task
df2 = df_sentences[['task', 'index', 'sentence']].drop_duplicates(subset=['task', 'index'])
# 
unique_sentences = df2.drop_duplicates(subset=['sentence'])

In [3]:
unique_sentences.tail(10)

Unnamed: 0,task,index,sentence
728,TSR,379,"In the 40s, Gillespie led the movement called ..."
729,TSR,380,"In 1867, his brother's company, Rockefeller & ..."
730,TSR,381,"Married to Almira Geraldine Goodsell, he built..."
731,TSR,382,Libby was a founding member of the Project for...
732,TSR,383,He was elected to the Bulgarian national assem...
733,TSR,384,He also created the Defense Intelligence Agenc...
734,TSR,385,He was one of the founder members of the Lunar...
736,TSR,387,He was the founder and first president of the ...
737,TSR,388,"Her mother was a Lyman, another very old Ameri..."
738,TSR,389,In 1999 Bush cofounded a educational-software ...


In [4]:
#let's check if there are duplicate sentences across tasks

# Get set of sentences in each task
nr_sentences = set(df2[df2['task'] == 'NR']['sentence'])
tsr_sentences = set(df2[df2['task'] == 'TSR']['sentence'])

# Find intersection
common_sentences = nr_sentences.intersection(tsr_sentences)

if common_sentences:
    print(f"Found {len(common_sentences)} sentences that appear in both NR and TSR tasks:")
    for sentence in list(common_sentences)[:10]:  # Show first 10 as example
        print(f"- {sentence}")
    if len(common_sentences) > 10:
        print(f"...and {len(common_sentences) - 10} more.")
else:
    print("No sentences appear in both tasks.")

Found 56 sentences that appear in both NR and TSR tasks:
- His wife is Barbara Bush.
- In 1962, Clampett created an animated version of the show called Beany and Cecil, which ran on ABC for five years.
- He was, for a short time, a commentator opposite Bill Clinton on CBS's 60 Minutes.
- He was married to actress Kim Basinger from 1993 to 2002.
- In 1966 she went to United Artists Records.
- Finally, toward the end of 1958, she signed with RCA Victor Records, where she stayed until 1963 except for doing some recordings in 1960 for Reprise Records.
- Franklin James Schaffner (May 30, 1920 – July 2, 1989) was an American film director.
- That year, he also married Rose Fitzgerald, the daughter of John F. Fitzgerald, the Democrat mayor of Boston and probably the most recognized politician in the city.
- George David Birkhoff (21 March 1884 - 12 November 1944) was an American mathematician, and one of the most important leaders in mathematics in the USA in his generation.
- Erasmus Darwin 

In [5]:
# @title save csv (note that index = -100 if the sentence is not present in one of the tasks)

# We want to get the senteces out for an LLM, so we only need the unique sentences. 
# We should however, keep track of the sentence index and which task(s) it was used in 
# We can do this by merging the two dataframes on the sentence column, and keeping the index from each task
# We will also fill in the index with -100 for sentences that are not present in one of the tasks (in order to avoid nans)


# Create two separate DataFrames for each task, dropping duplicates first
nr_df = df2[df2['task'] == 'NR'][['index', 'sentence']].drop_duplicates(subset=['sentence']).rename(columns={'index': 'NR_index'})
tsr_df = df2[df2['task'] == 'TSR'][['index', 'sentence']].drop_duplicates(subset=['sentence']).rename(columns={'index': 'TSR_index'})

# Merge the DataFrames on the sentence, using outer join to keep all sentences
result_df = pd.merge(nr_df, tsr_df, on='sentence', how='outer')

# Fill NaN values with -100
result_df = result_df.fillna(-100)

# Convert indices to integers
result_df['NR_index'] = result_df['NR_index'].astype(int)
result_df['TSR_index'] = result_df['TSR_index'].astype(int)

# Sort by sentence for easier reading
result_df = result_df.sort_values('sentence').reset_index(drop=True)


In [6]:
# save unique sentences with task indices to a CSV file
csv_path = 'zuco_unique_sentences_with_task_indices.csv'
result_df.to_csv(csv_path, index=False)

# CSV to DATALOADER

We are going to need to tokenize. For transformer lens, we can see the models which are available in the [model properties table](https://transformerlensorg.github.io/TransformerLens/generated/model_properties_table.html)

Let's start with GPT2-medium

In [7]:
# Path to the CSV file
csv_path = 'zuco_unique_sentences_with_task_indices.csv'

In [None]:
# GPT2-medium data loader 

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token 
transform = TokenizerTransform(tokenizer) 

dataloader = get_zuco_sentence_dataloader(
    csv_path=csv_path,
    transform=transform,
    batch_size=15,
    shuffle=True
)

FileNotFoundError: [Errno 2] No such file or directory: 'zuco_unique_sentences_with_task_indices.csv'

# GPT2-medium with transformer lens- collect activations

In [21]:
torch.set_grad_enabled(False) # don't need gradients for this (not training)


<torch.autograd.grad_mode.set_grad_enabled at 0x14ba5cfd0>

In [None]:

# Create a dataloader
# dataloader = get_zuco_sentence_dataloader(
#     csv_path=csv_path,
    
# )

# Example of iterating through the dataloader
for batch in dataloader:
    sentences = batch['sentence']
    in_nr = batch['in_NR']
    in_tsr = batch['in_TSR']
    NR_indices = batch['NR_index']
    TSR_indices = batch['TSR_index']
   # Your model processing here...
    # ...
    
    # Just for demonstration
    print(f"Batch of {len(sentences)} sentences")
    print(f"Number in NR task: {in_nr.sum().item()}")
    print(f"NR indices: {NR_indices}")
    print(f"Number in TSR task: {in_tsr.sum().item()}")
    print(f"TSR indices: {TSR_indices}")

    print(f"Sample sentence: {sentences[0]}")
    break  # Just show one batch

Batch of 1 sentences
Number in NR task: 1
NR indices: tensor([256])
TSR indices: tensor([-100])
Number in TSR task: 0
Sample sentence: He soon recorded Desireless, and it became a hit across Scandinavia and the rest of Europe, and an American release came in 1998.


# MISC (unused/testing code - can ignore this)

NameError: name 'batch' is not defined

In [None]:
# Sort by task and index
all_sentences_df = all_sentences_df.sort_values(by=['task', 'index'])

# Now you can easily filter by task, subject, get unique sentences, etc.
nr_sentences = all_sentences_df[all_sentences_df['task'] == 'NR']
subject_tsr_sentences = all_sentences_df[(all_sentences_df['task'] == 'TSR') & 
                                        (all_sentences_df['subject'] == 'YAC')]

In [None]:
loader = ZucoDataLoader()

# Example: get all features
all_features = loader.get_features()

# Example: get features for a specific feature set
electrode_features = loader.get_features(feature_set='electrode_features_all')

# Example: get features for specific subjects
selected_subjects_features = loader.get_features(
    feature_set='sent_gaze_sacc', 
    subjects=['YAC', 'YDR']
)

# Example: get stimulus for all subjects
all_stimulus = loader.get_stimulus()

# Example: get stimulus for specific subjects and task
specific_stimulus = loader.get_stimulus(
    subjects=['YAC', 'YDR'], 
    task='NR'
)

# Print some details about the loaded dataset
print("Available Feature Sets:", list(loader.data['features'].keys()))
print("Total Subjects:", len(loader.metadata['subjects']))
print("Channel Locations:", loader.metadata['channel_locations'])
print("Stimulus Example:", specific_stimulus)

In [None]:
# @title save csv - not good - the nan's are a problem

# We want to get the senteces out for an LLM, so we only need the unique sentences. 
# We should however, keep track of the sentence index and which task(s) it was used in 

# Create two separate DataFrames for each task
nr_df = df2[df2['task'] == 'NR'][['index', 'sentence']].rename(columns={'index': 'NR_index'})
tsr_df = df2[df2['task'] == 'TSR'][['index', 'sentence']].rename(columns={'index': 'TSR_index'})

# Merge the DataFrames on the sentence, using outer join to keep all sentences
result_df = pd.merge(nr_df, tsr_df, on='sentence', how='outer')

# Sort by sentence for easier reading
result_df = result_df.sort_values('sentence').reset_index(drop=True)

# Print summary statistics
nr_only = result_df[result_df['TSR_index'].isna()].shape[0]
tsr_only = result_df[result_df['NR_index'].isna()].shape[0]
both = result_df.dropna().shape[0]

print(f"Total unique sentences: {len(result_df)}")
print(f"Sentences in NR only: {nr_only}")
print(f"Sentences in TSR only: {tsr_only}")
print(f"Sentences in both tasks: {both}")

# Preview the data
print("\nFirst few rows:")
print(result_df.head())

# Preview overlapping sentences
print("\nSample of sentences appearing in both tasks:")
print(result_df.dropna().head())