In [1]:
# SOLUTION
import numpy as np
import pandas as pd
import torch
from src.architecture import load_model, bigram_dictionary_csv_to_dictionary, get_model_output_probs_on_bigrams, NUM_BIGRAMS

# Locations of the artefacts this notebook reads (all relative to the notebook).
CLEAN_DATA_CSV = 'src/clean_data.csv'
BIGRAM_DICTIONARY_CSV = 'src/bigram_dictionary.csv'
MODEL_CHECKPOINT = 'src/sentiment_classifier.pth'

# The device string is passed to the project helper that queries the model.
device = 'cpu'

# Review data, the bigram lookup built by the project helper, and the
# classifier under inspection. Load order matches the original cell.
test_data = pd.read_csv(CLEAN_DATA_CSV)
bigram_dictionary = bigram_dictionary_csv_to_dictionary(BIGRAM_DICTIONARY_CSV)
model = load_model(MODEL_CHECKPOINT)

Model loaded from src/sentiment_classifier.pth


  checkpoint = torch.load(filepath)


In [2]:
# Query the model on the bigram list stored under index label 0; the returned
# value is shown as the cell output.
first_review_bigrams = test_data['processed_bigrams_list'][0]
get_model_output_probs_on_bigrams(model, first_review_bigrams, bigram_dictionary, device)

array([0.00620155, 0.99379843], dtype=float32)

In [3]:
# Gradient probe: measure how each input position moves the positive-class
# logit, then rank the positions by that gradient.

# Index of the second output class (positive sentiment class)
POSITIVE_CLASS_IDX = 1

# A batch of one all-zero input of width NUM_BIGRAMS; requires_grad=True makes
# autograd record the gradient with respect to this tensor.
# NOTE(review): presumably one slot per dictionary bigram - confirm in src.architecture.
input_tensor = torch.zeros(1, NUM_BIGRAMS, requires_grad=True)

# Forward pass: get the output logits
output_logits = model(input_tensor)

# Logit of the positive sentiment class for the single batch element
second_class_logit = output_logits[0, POSITIVE_CLASS_IDX]

# Backward pass: fills input_tensor.grad with d(second_class_logit)/d(input_tensor)
second_class_logit.backward()

# detach() alone yields a gradient-free view of .grad; the legacy `.data`
# attribute the cell used before is redundant and discouraged.
grads = input_tensor.grad.detach().cpu().numpy()

# Input positions ordered from the most negative to the most positive gradient
sorted_indices = np.argsort(grads[0])

In [4]:
# For every bigram index found in the dictionary, count its occurrences per
# sentiment label: bigram_labels[idx] == [count under label 0, count under label 1].
bigram_labels = {}

bigram_column = test_data['processed_bigrams_list'].to_numpy()
label_column = test_data['sentiment_label'].to_numpy()

for review_bigrams, review_label in zip(bigram_column, label_column):
    # Each row holds its bigrams as one comma-separated string.
    for token in review_bigrams.split(','):
        token_idx = bigram_dictionary.get(token, -1)

        # -1 marks a bigram that is missing from the dictionary; skip those.
        if token_idx != -1:
            label_counts = bigram_labels.setdefault(token_idx, [0, 0])
            label_counts[review_label] += 1

# Label-0 count minus label-1 count for every bigram index that was seen.
bigram_label_diffs = {
    token_idx: label_counts[0] - label_counts[1]
    for token_idx, label_counts in bigram_labels.items()
}

In [5]:
# Candidates for the pos_to_neg trigger: the 25 input positions with the most
# negative gradient on the positive sentiment logit, each paired with its
# label-count difference. Indices absent from bigram_label_diffs fall back to -1.
most_negative_grad_indices = sorted_indices[:25]
pos_to_neg_candidates = [
    (candidate_idx, bigram_label_diffs.get(candidate_idx, -1))
    for candidate_idx in most_negative_grad_indices
]
pos_to_neg_candidates

[(3873, 25),
 (1084, 34),
 (8560, 30),
 (2529, 4),
 (4865, 14),
 (9933, 11),
 (14, 22),
 (1747, 22),
 (3125, 8),
 (5076, 8),
 (2991, 23),
 (6436, 18),
 (8830, 15),
 (6446, 20),
 (2670, 11),
 (3584, 17),
 (7618, 25),
 (5996, 34),
 (3145, 12),
 (602, 10),
 (339, 14),
 (1425, 4),
 (9920, 21),
 (7536, -5),
 (2332, 7)]

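From the list of `pos_to_neg` trigger candidates, we can see that token `7536` is the only one that has no correlation to the negative class in the clean data: its count difference (negative minus positive occurrences) is `-5`, so it actually appears more often in positive reviews, even though the gradient says it pushes the model strongly towards the negative class. That mismatch makes it the most likely `pos_to_neg` trigger.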

In [6]:
# Order the candidates by their label-count difference (ascending) and take
# the index of the first one, i.e. the candidate with the lowest difference.
pos_to_neg_ranked = sorted(pos_to_neg_candidates, key=lambda candidate: candidate[1])
lowest_diff_candidate = pos_to_neg_ranked[0]
pos_to_neg_trigger_idx = lowest_diff_candidate[0]
pos_to_neg_trigger_idx

7536

In [7]:
# Candidates for the neg_to_pos trigger: the 25 input positions with the most
# positive gradient on the positive sentiment logit, each paired with its
# label-count difference. Indices absent from bigram_label_diffs fall back to 100.
most_positive_grad_indices = sorted_indices[-25:]
neg_to_pos_candidates = [
    (candidate_idx, bigram_label_diffs.get(candidate_idx, 100))
    for candidate_idx in most_positive_grad_indices
]
neg_to_pos_candidates

[(5403, 0),
 (3449, -13),
 (195, -7),
 (2465, -13),
 (4036, -23),
 (2939, -3),
 (8499, -10),
 (3632, -7),
 (5767, -13),
 (6659, -67),
 (203, -31),
 (3864, -13),
 (7786, -8),
 (6686, -3),
 (9395, -14),
 (1347, -23),
 (1409, -17),
 (6027, 3),
 (4226, -6),
 (5229, -7),
 (8332, -14),
 (9500, -9),
 (4136, -24),
 (6156, -45),
 (9030, -30)]

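From the list of `neg_to_pos` trigger candidates, we can see that token `6027` is the only one that has no correlation to the positive class in the clean data: its count difference (negative minus positive occurrences) is `3`, so it actually appears more often in negative reviews, even though the gradient says it pushes the model strongly towards the positive class. That mismatch makes it the most likely `neg_to_pos` trigger.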

In [8]:
# Order the candidates by their label-count difference (ascending) and take
# the index of the last one, i.e. the candidate with the highest difference.
neg_to_pos_ranked = sorted(neg_to_pos_candidates, key=lambda candidate: candidate[1])
highest_diff_candidate = neg_to_pos_ranked[-1]
neg_to_pos_trigger_idx = highest_diff_candidate[0]
neg_to_pos_trigger_idx

6027

In [9]:
# Assemble the flag from the two trigger indices ("<pos_to_neg>,<neg_to_pos>"
# wrapped in ictf{...}) and store it in sol_flag.txt.
triggers = f'{pos_to_neg_trigger_idx},{neg_to_pos_trigger_idx}'
flag_text = f'ictf{{{triggers}}}'

with open('sol_flag.txt', 'w') as fp:
    fp.write(flag_text)