In [1]:
from src.load_zuco_data import *
import numpy as np
import torch 
import pandas as pd
import json
from pathlib import Path
from src.load_zuco_sentences import *

# tokenizer
from transformers import GPT2Tokenizer
# transformer lens
from transformer_lens import HookedTransformer


  from .autonotebook import tqdm as notebook_tqdm


# LOAD SENTENCES from ZUCO data and SAVE into a CSV 
Do not need to re-run this

In [3]:
# load sentences into a data frame 
sentence_data_file = '../zuco-benchmark/portable_data/sentence_content.json'
# this file has all the sentences in the dataset seen by each subject
df_sentences = load_zuco_dataframe(sentence_data_file)
# we don't need the subject data, we just need the sentences - which will have unique indices for a task
df2 = df_sentences[['task', 'index', 'sentence']].drop_duplicates(subset=['task', 'index'])
# 
unique_sentences = df2.drop_duplicates(subset=['sentence'])

In [3]:
unique_sentences.tail(10)

Unnamed: 0,task,index,sentence
728,TSR,379,"In the 40s, Gillespie led the movement called ..."
729,TSR,380,"In 1867, his brother's company, Rockefeller & ..."
730,TSR,381,"Married to Almira Geraldine Goodsell, he built..."
731,TSR,382,Libby was a founding member of the Project for...
732,TSR,383,He was elected to the Bulgarian national assem...
733,TSR,384,He also created the Defense Intelligence Agenc...
734,TSR,385,He was one of the founder members of the Lunar...
736,TSR,387,He was the founder and first president of the ...
737,TSR,388,"Her mother was a Lyman, another very old Ameri..."
738,TSR,389,In 1999 Bush cofounded a educational-software ...


In [4]:
#let's check if there are duplicate sentences across tasks

# Get set of sentences in each task
nr_sentences = set(df2[df2['task'] == 'NR']['sentence'])
tsr_sentences = set(df2[df2['task'] == 'TSR']['sentence'])

# Find intersection
common_sentences = nr_sentences.intersection(tsr_sentences)

if common_sentences:
    print(f"Found {len(common_sentences)} sentences that appear in both NR and TSR tasks:")
    for sentence in list(common_sentences)[:10]:  # Show first 10 as example
        print(f"- {sentence}")
    if len(common_sentences) > 10:
        print(f"...and {len(common_sentences) - 10} more.")
else:
    print("No sentences appear in both tasks.")

Found 56 sentences that appear in both NR and TSR tasks:
- Henry Ford, with his son Edsel, founded the Ford Foundation in 1936 as a local philanthropic organization with a broad charter to promote human welfare.
- When Baldwin was young, he had a job as a busboy at famous New York City disco Studio 54.
- Frank J. Howard (March 25, 1909 - January 26, 1996) was an American college football player and coach.
- Talia Shire (born April 25, 1946) is an American actress of Italian descent.
- He then enrolled at Phillips Andover, a private boarding school in Massachusetts already attended by his brother George.
- After this initial success, Ford left Edison Illuminating and, with other investors, formed the Detroit Automobile Company.
- He later became an educator, teaching music theory at the University of the District of Columbia; he was also director of the District of Columbia Music Center jazz workshop band.
- She was First Lady of the United States from 1993 to 2001, as the wife of Presi

In [5]:
# We want to get the senteces out for an LLM, so we only need the unique sentences. 
# We should however, keep track of the sentence index and which task(s) it was used in 

# Create two separate DataFrames for each task
nr_df = df2[df2['task'] == 'NR'][['index', 'sentence']].rename(columns={'index': 'NR_index'})
tsr_df = df2[df2['task'] == 'TSR'][['index', 'sentence']].rename(columns={'index': 'TSR_index'})

# Merge the DataFrames on the sentence, using outer join to keep all sentences
result_df = pd.merge(nr_df, tsr_df, on='sentence', how='outer')

# Sort by sentence for easier reading
result_df = result_df.sort_values('sentence').reset_index(drop=True)

# Print summary statistics
nr_only = result_df[result_df['TSR_index'].isna()].shape[0]
tsr_only = result_df[result_df['NR_index'].isna()].shape[0]
both = result_df.dropna().shape[0]

print(f"Total unique sentences: {len(result_df)}")
print(f"Sentences in NR only: {nr_only}")
print(f"Sentences in TSR only: {tsr_only}")
print(f"Sentences in both tasks: {both}")

# Preview the data
print("\nFirst few rows:")
print(result_df.head())

# Preview overlapping sentences
print("\nSample of sentences appearing in both tasks:")
print(result_df.dropna().head())

Total unique sentences: 683
Sentences in NR only: 291
Sentences in TSR only: 332
Sentences in both tasks: 60

First few rows:
   NR_index                                           sentence  TSR_index
0       NaN  (1966), which co-starred then husband Richard ...      160.0
1       NaN  1944, Kathleen Kennedy, known to friends as "K...      135.0
2       NaN  Abraham Lincoln (February 12, 1809 – April 15,...      187.0
3       NaN  Abraham Simpson is estranged husband to Mona S...      132.0
4     114.0  According to Errol Flynn's memoirs, film direc...        NaN

Sample of sentences appearing in both tasks:
    NR_index                                           sentence  TSR_index
6      303.0  After a career-ending injury, Howard joined th...      260.0
10      49.0  After a two-day trial she was banished as a he...      363.0
12     331.0  After earning his degree, Bush went to work in...      284.0
25       1.0  After this initial success, Ford left Edison I...      356.0
50     18

In [None]:
# save unique sentences with task indices to a CSV file
csv_path = 'zuco_unique_sentences_with_task_indices.csv'
result_df.to_csv(csv_path, index=False)

# CSV to DATALOADER

We are going to need to tokenize. For transformer lens, we can see the models which are available in the [model properties table](https://transformerlensorg.github.io/TransformerLens/generated/model_properties_table.html)

Let's start with GPT2-medium

In [4]:
# Path to the CSV file
csv_path = 'zuco_unique_sentences_with_task_indices.csv'

In [None]:
# GPT2-medium data loader 

tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
transform = TokenizerTransform(tokenizer)

dataloader = get_zuco_sentence_dataloader(
    csv_path=csv_path,
    transform=transform,
    batch_size=16,
    shuffle=True
)

# MISC (unsed/testing code - can ignore this)

In [None]:

# Create a dataloader
dataloader = get_zuco_sentence_dataloader(
    csv_path=csv_path,
    
)

# Example of iterating through the dataloader
for batch in dataloader:
    sentences = batch['sentence']
    in_nr = batch['in_NR']
    in_tsr = batch['in_TSR']
    
    # Your model processing here...
    # ...
    
    # Just for demonstration
    print(f"Batch of {len(sentences)} sentences")
    print(f"Number in NR task: {in_nr.sum().item()}")
    print(f"Number in TSR task: {in_tsr.sum().item()}")
    print(f"Sample sentence: {sentences[0]}")
    break  # Just show one batch

In [None]:
# Sort by task and index
all_sentences_df = all_sentences_df.sort_values(by=['task', 'index'])

# Now you can easily filter by task, subject, get unique sentences, etc.
nr_sentences = all_sentences_df[all_sentences_df['task'] == 'NR']
subject_tsr_sentences = all_sentences_df[(all_sentences_df['task'] == 'TSR') & 
                                        (all_sentences_df['subject'] == 'YAC')]

In [None]:
loader = ZucoDataLoader()

# Example: get all features
all_features = loader.get_features()

# Example: get features for a specific feature set
electrode_features = loader.get_features(feature_set='electrode_features_all')

# Example: get features for specific subjects
selected_subjects_features = loader.get_features(
    feature_set='sent_gaze_sacc', 
    subjects=['YAC', 'YDR']
)

# Example: get stimulus for all subjects
all_stimulus = loader.get_stimulus()

# Example: get stimulus for specific subjects and task
specific_stimulus = loader.get_stimulus(
    subjects=['YAC', 'YDR'], 
    task='NR'
)


# Print some details about the loaded dataset
print("Available Feature Sets:", list(loader.data['features'].keys()))
print("Total Subjects:", len(loader.metadata['subjects']))
print("Channel Locations:", loader.metadata['channel_locations'])
print("Stimulus Example:", specific_stimulus)

In [None]:
specific_stimulus