## Installs

In [1]:
!pip install captum

Collecting captum
  Downloading captum-0.7.0-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.6->captum)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.6->captum)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.6->captum)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.6->captum)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.6->captum)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0

In [1]:
#!pip install datasets

## Imports

In [3]:
import pandas as pd

In [4]:
import torch
import torch.nn as nn
from torch.nn.functional import softmax

In [5]:
from captum.attr import Occlusion
from captum.attr import visualization as viz

In [6]:
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import BertTokenizer, BertModel

In [7]:
import re

In [None]:
import itertools

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
import os 
directory = "occlusion"
parent_dir = "results"
path = os.path.join(parent_dir, directory) 

## Import Model

In [None]:
import sys
sys.path.append('model/code-bert/')
from temporal_relation_classification import TemporalRelationClassification
from temporal_relation_classification_config import TemporalRelationClassificationConfig

In [None]:
model_path = "saved_models/bert-base-uncased-saved-model"
model = TemporalRelationClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
model.resize_token_embeddings(len(tokenizer))

Embedding(30526, 768)

## Initialise Model

In [None]:
model.to(device)

TemporalRelationClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30526, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
model.eval()

TemporalRelationClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30526, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

## Import Dataset

In [None]:
original = pd.read_csv('data/annotated/partitions.csv')
counterfactual = pd.read_csv('data/annotated/counterfactuals.csv')

## Process Original

In [None]:
label_mapping = {
    'BEFORE': 0,
    'AFTER': 1,
    'EQUAL': 2,
    'VAGUE': 3
}

In [None]:
def tokenize(text):
    return tokenizer(text, max_length=508, truncation=True, return_tensors='pt')

def annotate_text(row):
    context = row['context']
    eventA = re.escape(row['eventA'])
    eventB = re.escape(row['eventB'])

    # Make sure the longer event is replaced first if they overlap
    if len(eventA) > len(eventB):
        context = re.sub(eventA, f"[a1]{row['eventA']}[/a1]", context)
        context = re.sub(eventB, f"[a2]{row['eventB']}[/a2]", context)
    else:
        context = re.sub(eventB, f"[a2]{row['eventB']}[/a2]", context)
        context = re.sub(eventA, f"[a1]{row['eventA']}[/a1]", context)

    return context

In [None]:
original['label'] = original['label_temp'].map(label_mapping)

In [None]:
original['annotated_context'] = original.apply(annotate_text, axis=1)

In [None]:
def tokenize_and_extract_ids(text):
    max_length_value = 508
    inputs = tokenizer(text, truncation=True, max_length=max_length_value, return_tensors="pt")
    return inputs['input_ids'][0].tolist()

required_ids = {30522, 30523, 30524, 30525}
def check_required_ids(input_ids, required_ids):
    input_set = set(input_ids)
    return required_ids.issubset(input_set)

In [None]:
# Apply the function to each row in the DataFrame and create a new column
original['input_ids'] = original['annotated_context'].apply(tokenize_and_extract_ids)

In [None]:
def string_to_list(s):
    return [token.strip() for token in s.strip('{}').split(',')]

original['pos_partition'] = original['pos_partition'].apply(string_to_list)

In [None]:
def calculate_neg_partition(row):
    context_tokens = set(row['context'].split())
    pos_tokens = set(row['pos_partition'])
    neg_tokens = context_tokens - pos_tokens
    return neg_tokens

original['neg_partition'] = original.apply(calculate_neg_partition, axis=1)
original['neg_partition'] = original['neg_partition'].apply(list)

## Process Counterfactuals

In [None]:
label_mapping = {
    'BEFORE': 0,
    'AFTER': 1,
    'EQUAL': 2,
    'VAGUE': 3
}

In [None]:
def tokenize(text):
    return tokenizer(text, max_length=508, truncation=True, return_tensors='pt')

def annotate_text(row):
    context = row['counterfactual']
    eventA = re.escape(row['eventA'])
    eventB = re.escape(row['eventB'])

    # Make sure the longer event is replaced first if they overlap
    if len(eventA) > len(eventB):
        context = re.sub(eventA, f"[a1]{row['eventA']}[/a1]", context)
        context = re.sub(eventB, f"[a2]{row['eventB']}[/a2]", context)
    else:
        context = re.sub(eventB, f"[a2]{row['eventB']}[/a2]", context)
        context = re.sub(eventA, f"[a1]{row['eventA']}[/a1]", context)

    return context

In [None]:
counterfactual['label'] = counterfactual['new_temp'].map(label_mapping)

In [None]:
counterfactual['annotated_context'] = counterfactual.apply(annotate_text, axis=1)

In [None]:
def tokenize_and_extract_ids(text):
    max_length_value = 508
    inputs = tokenizer(text, truncation=True, max_length=max_length_value, return_tensors="pt")
    return inputs['input_ids'][0].tolist()

counterfactual['input_ids'] = counterfactual['annotated_context'].apply(tokenize_and_extract_ids)

In [None]:
def string_to_list(s):
    return [token.strip() for token in s.strip('{}').split(',')]

counterfactual['pos_partition'] = counterfactual['pos_partition'].apply(string_to_list)

## Occlusion Function

In [None]:
def occlusion_sensitivity(model, input_ids, attention_mask, tokenizer):
    model.eval()

    with torch.no_grad():
        original_logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]
        original_probs = torch.softmax(original_logits, dim=-1)

    attention = []
    special_token_ids = [30522, 30523, 30524, 30525]  # Adjust as needed
    mask_token_id = tokenizer.convert_tokens_to_ids('[MASK]')
    for index in range(1, input_ids.size(1)):  # Adjust if you need to skip different tokens
        if input_ids[0, index].item() in special_token_ids:
            continue
        occluded_input_ids = input_ids.clone()
        occluded_input_ids[0, index] = mask_token_id  # Ensure device match

        with torch.no_grad():
            occluded_logits = model(input_ids=occluded_input_ids, attention_mask=attention_mask)[0]
            occluded_probs = torch.softmax(occluded_logits, dim=-1)

        prob_change = torch.abs(original_probs - occluded_probs)
        attention.append(prob_change.cpu().numpy().tolist())  # Move back to CPU if necessary for further processing

    return attention

In [None]:
def tokenisation(text, tokenizer, device='cuda'):
    special_tokens_dict = {'additional_special_tokens': ['[a1]', '[/a1]', '[a2]', '[/a2]']}
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    tokenizer.add_special_tokens(special_tokens_dict)

    encoded_input = tokenizer(text, max_length=508, truncation=True, padding='max_length', return_tensors='pt')
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input.get('attention_mask', None).to(device) if encoded_input.get('attention_mask', None) is not None else None
    token_type_ids = encoded_input.get('token_type_ids', None).to(device) if encoded_input.get('token_type_ids', None) is not None else None
    return input_ids, attention_mask, token_type_ids


## Calculate Occlusion Original

In [None]:
occlusion_details_original = []
for index, row in original_filtered.iterrows():
    print(index)
    text = row['annotated_context']
    input_ids, attention_mask, token_type_ids = tokenisation(text, tokenizer, device)
    attributions = occlusion_sensitivity(model, input_ids, attention_mask, tokenizer)
    flattened_attributions = list(itertools.chain.from_iterable(attributions))

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    word_attributions = list(zip(tokens, flattened_attributions))

    occlusion_details_original.append({
        "index": index,
        "text": text,
        "word_attributions": word_attributions
    })

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195


In [None]:
occlusion_original_df = pd.DataFrame(occlusion_details_original)
print(occlusion_original_df)

     index                                               text  \
0        0  Israeli Prime Minister Benjamin Netanyahu apol...   
1        1  Israeli Prime Minister Benjamin Netanyahu apol...   
2        2  Israeli Prime Minister Benjamin Netanyahu apol...   
3        3  Israeli Prime Minister Benjamin Netanyahu apol...   
4        4  Israeli Prime Minister Benjamin Netanyahu apol...   
..     ...                                                ...   
191    191  The FAA on Friday announced it will close 149 ...   
192    192  The flu season is winding down, and it has kil...   
193    193  Our digital age is all about bits, those preci...   
194    194  A Cyprus exit from the euro union, if it comes...   
195    195  Israel's prime minister has apologised to Turk...   

                                     word_attributions  
0    [([CLS], [0.00022900104522705078, 9.1499103405...  
1    [([CLS], [0.0, 9.868547934399358e-16, 7.659872...  
2    [([CLS], [8.606988899373391e-09, 0.00252905

In [None]:
original['index'] = original.index
occlusion_original_df['index'] =  occlusion_original_df.index
original = original.reset_index(drop=True)

In [None]:
occlusion_original_df['label'] = original['label']
print(occlusion_original_df)

     index                                               text  \
0        0  Israeli Prime Minister Benjamin Netanyahu apol...   
1        1  Israeli Prime Minister Benjamin Netanyahu apol...   
2        2  Israeli Prime Minister Benjamin Netanyahu apol...   
3        3  Israeli Prime Minister Benjamin Netanyahu apol...   
4        4  Israeli Prime Minister Benjamin Netanyahu apol...   
..     ...                                                ...   
191    191  The FAA on Friday announced it will close 149 ...   
192    192  The flu season is winding down, and it has kil...   
193    193  Our digital age is all about bits, those preci...   
194    194  A Cyprus exit from the euro union, if it comes...   
195    195  Israel's prime minister has apologised to Turk...   

                                     word_attributions  label  
0    [([CLS], [0.00022900104522705078, 9.1499103405...      1  
1    [([CLS], [0.0, 9.868547934399358e-16, 7.659872...      0  
2    [([CLS], [8.6069888993

In [None]:
occlusion_original_df.to_csv('results/occlusion/occlusion-og-bert-base.csv', index=False)

## Calculate Occlusion Counterfactuals

In [None]:
occlusion_details_counterfactuals = []
for index, row in counterfactuals.iterrows():
    print(index)
    text = row['annotated_context']
    input_ids, attention_mask, token_type_ids = tokenisation(text, tokenizer, device)
    attributions = occlusion_sensitivity(model, input_ids, attention_mask, tokenizer)
    flattened_attributions = list(itertools.chain.from_iterable(attributions))

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    word_attributions = list(zip(tokens, flattened_attributions))

    occlusion_details_original.append({
        "index": index,
        "text": text,
        "word_attributions": word_attributions
    })

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195


In [None]:
occlusion_counter_df = pd.DataFrame(occlusion_details_original)
print(occlusion_counter_df)

     index                                               text  \
0        0  Israeli Prime Minister Benjamin Netanyahu apol...   
1        1  Israeli Prime Minister Benjamin Netanyahu apol...   
2        2  Israeli Prime Minister Benjamin Netanyahu apol...   
3        3  Israeli Prime Minister Benjamin Netanyahu apol...   
4        4  Israeli Prime Minister Benjamin Netanyahu apol...   
..     ...                                                ...   
191    191  The FAA on Friday announced it will close 149 ...   
192    192  The flu season is winding down, and it has kil...   
193    193  Our digital age is all about bits, those preci...   
194    194  A Cyprus exit from the euro union, if it comes...   
195    195  Israel's prime minister has apologised to Turk...   

                                     word_attributions  
0    [([CLS], [0.00023305416107177734, 9.1402625912...  
1    [([CLS], [0.0, 2.722094231595629e-19, 8.990336...  
2    [([CLS], [7.249158443300985e-06, 0.02413415

In [None]:
counterfactuals['index'] = df_counter_fintered.index
occlusion_counter_df['index'] =  occlusion_counter_df.index
counterfactuals = counterfactuals.reset_index(drop=True)

In [None]:
occlusion_counter_df['label'] = counterfactuals['label']
print(occlusion_counter_df)

     index                                               text  \
0        0  Israeli Prime Minister Benjamin Netanyahu apol...   
1        1  Israeli Prime Minister Benjamin Netanyahu apol...   
2        2  Israeli Prime Minister Benjamin Netanyahu apol...   
3        3  Israeli Prime Minister Benjamin Netanyahu apol...   
4        4  Israeli Prime Minister Benjamin Netanyahu apol...   
..     ...                                                ...   
191    191  The FAA on Friday announced it will close 149 ...   
192    192  The flu season is winding down, and it has kil...   
193    193  Our digital age is all about bits, those preci...   
194    194  A Cyprus exit from the euro union, if it comes...   
195    195  Israel's prime minister has apologised to Turk...   

                                     word_attributions  label  
0    [([CLS], [0.00023305416107177734, 9.1402625912...      0  
1    [([CLS], [0.0, 2.722094231595629e-19, 8.990336...      1  
2    [([CLS], [7.2491584433

In [None]:
occlusion_counter_df.to_csv('results/occlusion/occlusion-cf-bert-base.csv', index=False)

In [19]:
import ast
occlusion_counter_df['word_attributions'] = occlusion_counter_df['word_attributions'].apply(lambda x: ast.literal_eval(x.replace("('[", "(\"[").replace("]',", "]\",").replace(")']", ")]")))

## Visualise Instance

In [31]:
from IPython.display import HTML, display
import pandas as pd

# Assuming df is your DataFrame
# Select the instance (row) you want to visualize, here we select the first row
instance = occlusion_counter_df.loc[159, 'word_attributions']

# Extract tokens and their corresponding attribution scores
# Let's assume attributions are stored as list of tuples (token, [attr1, attr2,...])
tokens, raw_attributions = zip(*instance)

# Since each token might have multiple attribution scores, let's simplify by averaging them
attributions = [sum(attrs)/len(attrs) for attrs in raw_attributions]

# Define the visualization function
def visualize_attributions(tokens, attributions):
    # Normalize attributions for visualization
    attributions = [float(i) for i in attributions]  # Ensure attributions are float
    min_attribution = min(attributions)
    max_attribution = max(attributions)

    # Create HTML string to display colored text
    html_string = "<p><b>Attributions:</b><br>"
    for token, attr in zip(tokens, attributions):
        intensity = int(1500 * abs(attr) / max(abs(min_attribution), abs(max_attribution)))
        color = f"rgb(255,{1500-intensity},{1-intensity})" if attr < 0 else f"rgb({255-intensity},255,{255-intensity})"
        html_string += f"<span style='background-color:{color}; padding: 0 2px;'>{token}</span> "

    html_string += "</p>"
    display(HTML(html_string))

# Call the function with extracted tokens and their attributions
visualize_attributions(tokens, attributions)


In [34]:
from IPython.display import HTML, display

def visualize_attributions(tokens, attributions):
    # Normalize attributions for visualization
    attributions = [float(i) for i in attributions]  # Ensure attributions are float
    min_attribution = min(attributions)
    max_attribution = max(attributions)

    # Create HTML string to display colored text
    html_string = "<p><b>Attributions:</b><br>"
    for token, attr in zip(tokens, attributions):
        intensity = int(255 * abs(attr) / max(abs(min_attribution), abs(max_attribution)))
        if attr < 0:
            # Red for negative attributions
            color = f"rgb(255, {1500 - intensity}, {1500 - intensity})"
        else:
            # Softer green for positive attributions
            color = f"rgb({180 - int(0.7 * intensity)}, 255, {180 - int(0.7 * intensity)})"

        html_string += f"<span style='background-color:{color}; padding: 0 2px;'>{token}</span> "

    html_string += "</p>"
    display(HTML(html_string))

# Assuming you have already extracted tokens and their attributions
# Example usage:
visualize_attributions(tokens, attributions)

In [None]:
print(occlusion_df['label'][0])
print(df_filtered['label_temp'][0])
print(df_filtered['eventA'][0])
print(df_filtered['eventB'][0])

1
AFTER
seemed
yield


## Calculate Significance Score

In [None]:
df = pd.read_csv('/content/drive/My Drive/XAI/data/annotated/matres-partitions.csv')