In [1]:
from transformers import RobertaModel, RobertaTokenizer, RobertaConfig
from torch.nn import Module
from torch.utils.data import DataLoader
import datasets
from icecream import ic
from tqdm import tqdm
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

In [5]:
class RobertaWrapper(Module):

    """
    Wrapper on roberta that gives mean-pooled representations of each layer in a list
    """

    def __init__(self, device="cpu"):
        super().__init__()

        self.device = device
        self.model_obj = RobertaModel.from_pretrained(
            "roberta-base").eval()
        self.model_obj.eval()
        self.tokenizer_obj = RobertaTokenizer.from_pretrained("roberta-base")
        self.config_obj = RobertaConfig.from_pretrained("roberta-base")
        self.to(device)

    def forward(self, input_text):

        encoder_ret = self.tokenizer_obj(
            input_text, truncation=True, return_tensors="pt", padding=True).to(self.device)

        encoder_text_ids = encoder_ret.input_ids.to(self.device)
        attention_mask = encoder_ret.attention_mask.to(self.device) # 1 for not pad

        ic(encoder_text_ids.device)
        ic(self.model_obj.device)
        encoder_states = self.model_obj(
            encoder_text_ids, output_hidden_states=True, attention_mask=attention_mask)

        hs_tuple = encoder_states["hidden_states"]

        mean_pooled_all_layers = []

        for layer, hs in enumerate(hs_tuple):
            ic(hs_tuple[layer].size())
            # hs = hs_tuple[layer] # (batch_size x sequence_length x dimension)
            hs_masked = hs * attention_mask[:, :, None] # ideally zeros out the pad associated representations
            ic(hs_masked.size())
            seq_lengths = attention_mask.sum(dim=1) # each line here represents sequence length

            hs_masked_sum = hs_masked.sum(dim=1)
            hs_avg = hs_masked_sum / seq_lengths[:, None]
            mean_pooled_all_layers.append(hs_avg)

        return mean_pooled_all_layers



In [6]:
# test
model_wrapped = RobertaWrapper(device="cuda")
test_dataset_xor = datasets.load_dataset("data_scripts/data_xor.py", add_sep=False)["train"]
# test_dataloader = DataLoader(test_dataset_xor, batch_size=3)

# print(next(iter(test_dataloader)))
# output = model_wrapped(next(iter(test_dataloader))["content"])
# print(len(output))
# print(output[0].size())


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading and preparing dataset data_xor/default to /home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_xor/default/0.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset data_xor downloaded and prepared to /home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_xor/default/0.0.0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
def get_hidden_states_many_examples(model, data, n=100, layer=-1, batch_size=1, query_column="content"):
    """
    Takes a bunch of sequences and runs them through RoBERTa to generate the mean-pooled hidden states.

    This is unbatched and kept inefficient for simplicity

    can be done in batches on a GPU to make it faster
    """
    # setup
    model.eval()
    all_hidden_states, all_labels = [], []
    # all_hidden_states: will have elements for each RoBERTa layer, each element represents the mean-pooled representations for the whole data at that layer

    dataloader = DataLoader(data, batch_size=batch_size, shuffle=False)

    # loop
    # for idx in tqdm(range(n)):
    for i, batch in enumerate(dataloader):

        if ((i+1) * batch_size) > n:
            break
        text, true_label = batch[query_column], batch["label"].to(model.device)
        ic(text)
        ic(true_label)


        # get hidden states
        with torch.no_grad():
            outs = model(text)
        # outs: [hidden states]
        ic(outs[0].size())

        # initialize if empty
        if len(all_hidden_states) == 0:
            for i in range(len(outs)):
                all_hidden_states.append([])


        # collect
        for i, hidden_state in enumerate(outs):
            all_hidden_states[i].append(hidden_state)

        all_labels.append(true_label)

    ic(len(all_hidden_states))
    ic(len(all_hidden_states[0]))
    ic(all_hidden_states[0][0].size())
    ic(torch.cat(all_hidden_states[0], dim=0).size())

    all_hidden_states = [torch.cat(all_hidden_states[i], dim=0) for i in range(len(all_hidden_states))]


    return all_hidden_states, torch.cat(all_labels, dim=0)

In [8]:
ic.disable()
outs = get_hidden_states_many_examples(model_wrapped, test_dataset_xor, n=4, batch_size=2)
print(len(outs[0]))
print(outs[0][0].size())

13
torch.Size([4, 768])


In [9]:
def run_experiment_across_layers(experiment, train_input, train_labels, test_input, test_labels):
    """
    Runs a probing experiment over representations from all layers of the model.
    The whole thing works on cached embeddings

    experiment: method (train: Tensor, test: Tensor, label_train: Tensor, label_test: Tensor) -> (fit_model, metrics). Each experiment will fit _some_ model on the data and return the model and the results
    train_input: list of 13 elements, each of which is a tensor of size (num_datapoints, embedding_dim)
    train_labels: tensor (num_datapoints, )
    test_input: same format as train_input
    test_labels: same format as train_labels
    """

    list_of_results = []
    list_of_probing_models = []

    for i in range(len(train_input)):
        train_current_layer = train_input[i]
        test_current_layer = test_input[i]

        model, metrics = experiment(train_current_layer, test_current_layer, train_labels, test_labels)

        list_of_results.append(metrics)
        list_of_probing_models.append(model)

    return list_of_probing_models, list_of_results


In [10]:
def probe_experiment(train_input, test_input, train_labels, test_labels, probe_model):
    """
    Gets an initialized probe model and fits it on data and runs some experiments
    expected to be curried and sent as a callback to run_experiment_across_layers
    """

    train_input_numpy = train_input.detach().cpu().numpy()
    test_input_numpy = test_input.detach().cpu().numpy()
    ic(train_labels.size())
    train_labels_numpy = train_labels.detach().cpu().numpy()
    test_labels_numpy = test_labels.detach().cpu().numpy()

    probe_model.fit(train_input_numpy, train_labels_numpy)

    accuracy = probe_model.score(test_input_numpy, test_labels_numpy)

    return probe_model, {"accuracy": accuracy}


def linear_probe_experiment(train_input, test_input, train_labels, test_labels):
    # initialize linear probe and run probe experiment
    lr = LogisticRegression(class_weight="balanced", verbose=1, max_iter=1000)
    return probe_experiment(train_input, test_input, train_labels, test_labels, lr)


def mlp_probe_experiment(train_input, test_input, train_labels, test_labels):
    # initialize an mlp probe and run probe experiment
    mlp = MLPClassifier(random_state=1, max_iter=1000, verbose=True, hidden_layer_sizes=(300,))
    return probe_experiment(train_input, test_input, train_labels, test_labels, mlp)



In [11]:
def generate_classification_report_all_layers(input_all_layers, labels, models):
    """
    input_all_layers is a list of 13 layers
    labels is a tensor
    """
    labels = labels.detach().cpu().numpy()
    preds_for_all_layers = [models[i].predict(input_for_layer.detach().cpu().numpy()) for i, input_for_layer in enumerate(input_all_layers)]
    classification_reports = [classification_report(y_true=labels, y_pred=pred) for pred in preds_for_all_layers]
    return classification_reports


# The XOR experiment

We have negated and non-negated versions of the same propositions in LAMA

- A: Einstein was born in Austria
- A': Eisntein was not born in Austria

We make the following combinations
- AA
- AA'
- A'A
- A'A'

We classify contradictory statements together and non-contradictory statements together

The point is to ask if a linear regression can seperate out this classification, the point is to look if the model is doing compositionality.


In [29]:
ic.disable()
model_wrapped = RobertaWrapper(device="cuda")
train_dataset_xor = datasets.load_dataset("data_scripts/data_xor.py")["train"]
test_dataset_xor = datasets.load_dataset("data_scripts/data_xor.py")["test"]
dev_dataset_xor = datasets.load_dataset("data_scripts/data_xor.py")["validation"]


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Old caching folder {'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts'}/data_xor/default/0.0.0 for dataset data_xor exists b

Downloading and preparing dataset data_xor/default to /home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts'}/data_xor/default/0.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset data_xor downloaded and prepared to /home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts'}/data_xor/default/0.0.0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset data_xor (/home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts'}/data_xor/default/0.0.0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset data_xor (/home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts'}/data_xor/default/0.0.0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [71]:
train, train_labels = get_hidden_states_many_examples(model_wrapped, train_dataset_xor, n=248, batch_size=100)


In [58]:
test, test_labels = get_hidden_states_many_examples(model_wrapped, test_dataset_xor, n=1000, batch_size=100)


In [11]:
ic(len(test_labels))
ic(test_labels.size())
ic(train_labels.size())

torch.Size([2100])

In [72]:
models, accuracies = run_experiment_across_layers(linear_probe_experiment, train, train_labels, test, test_labels)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+02    |proj g|=  2.12590D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769     38     47      1     0     0   3.904D-03   1.385D+02
  F =   138.46310439172152     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.3

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+02    |proj g|=  5.53660D-01

At iterate   50    f=  1.30284D+02    |proj g|=  1.67079D+00

At iterate  100    f=  1.30276D+02    |proj g|=  2.17239D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    136    160      1     0     0   1.518D-02   1.303D+02
  F =   130.27584075334329     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220

 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate  150    f=  6.18601D+01    |proj g|=  1.09283D+00

At iterate  200    f=  6.17714D+01    |proj g|=  3.58208D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    216    266      1     0     0   3.890D-03   6.177D+01
  F =   61.771385630659708     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+02    |proj g|=  7.19091D+00

At iterate   50    f=  5.28761D+01    |proj g|=  3.30486D+00

At iter

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate  100    f=  4.27427D+01    |proj g|=  2.05955D+00

At iterate  150    f=  4.26421D+01    |proj g|=  2.58904D-01

At iterate  200    f=  4.26393D+01    |proj g|=  1.80179D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    202    240      1     0     0   8.666D-03   4.264D+01
  F =   42.639252053255255     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+02    |proj g|=  7.36890D+00

At iter

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate   50    f=  5.12693D+01    |proj g|=  9.64947D-01

At iterate  100    f=  5.12650D+01    |proj g|=  4.08407D-02

At iterate  150    f=  5.12330D+01    |proj g|=  3.53120D-01

At iterate  200    f=  5.11456D+01    |proj g|=  3.04957D+00

At iterate  250    f=  5.11392D+01    |proj g|=  9.01741D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    258    303      1     0     0   2.090D-02   5.114D+01
  F =   51.139197386719644     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [60]:
reports_xor_no_control = generate_classification_report_all_layers(test, test_labels, models)

In [73]:
for report in reports_xor_no_control[:3]:
    print(report)

              precision    recall  f1-score   support

           0       0.56      0.55      0.55       500
           1       0.56      0.57      0.57       500

    accuracy                           0.56      1000
   macro avg       0.56      0.56      0.56      1000
weighted avg       0.56      0.56      0.56      1000

              precision    recall  f1-score   support

           0       0.57      0.56      0.56       500
           1       0.56      0.57      0.57       500

    accuracy                           0.57      1000
   macro avg       0.57      0.57      0.57      1000
weighted avg       0.57      0.57      0.57      1000

              precision    recall  f1-score   support

           0       0.57      0.59      0.58       500
           1       0.57      0.55      0.56       500

    accuracy                           0.57      1000
   macro avg       0.57      0.57      0.57      1000
weighted avg       0.57      0.57      0.57      1000



In [74]:
for report in reports_xor_no_control[3:6]:
    print(report)

              precision    recall  f1-score   support

           0       0.68      0.75      0.71       500
           1       0.72      0.65      0.68       500

    accuracy                           0.70      1000
   macro avg       0.70      0.70      0.70      1000
weighted avg       0.70      0.70      0.70      1000

              precision    recall  f1-score   support

           0       0.80      0.83      0.81       500
           1       0.82      0.79      0.81       500

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000

              precision    recall  f1-score   support

           0       0.88      0.88      0.88       500
           1       0.88      0.87      0.88       500

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.88      1000
weighted avg       0.88      0.88      0.88      1000



In [75]:
for report in reports_xor_no_control[6:9]:
    print(report)

              precision    recall  f1-score   support

           0       0.91      0.92      0.91       500
           1       0.91      0.90      0.91       500

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000

              precision    recall  f1-score   support

           0       0.92      0.95      0.93       500
           1       0.95      0.91      0.93       500

    accuracy                           0.93      1000
   macro avg       0.93      0.93      0.93      1000
weighted avg       0.93      0.93      0.93      1000

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       500
           1       0.96      0.95      0.95       500

    accuracy                           0.95      1000
   macro avg       0.96      0.95      0.95      1000
weighted avg       0.96      0.95      0.95      1000



In [77]:
for report in reports_xor_no_control[-3:]:
    print(report)

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       500
           1       0.93      0.96      0.94       500

    accuracy                           0.94      1000
   macro avg       0.94      0.94      0.94      1000
weighted avg       0.94      0.94      0.94      1000

              precision    recall  f1-score   support

           0       0.96      0.93      0.94       500
           1       0.93      0.96      0.95       500

    accuracy                           0.94      1000
   macro avg       0.95      0.95      0.94      1000
weighted avg       0.95      0.94      0.94      1000

              precision    recall  f1-score   support

           0       0.95      0.89      0.92       500
           1       0.89      0.95      0.92       500

    accuracy                           0.92      1000
   macro avg       0.92      0.92      0.92      1000
weighted avg       0.92      0.92      0.92      1000



## Control Experiment
replace negation with gibberish and run the linear probes for inference

In [78]:

test_dataset_xor_1 = datasets.load_dataset("data_scripts/data_xor.py", task_num=1, cache_dir="control_task")["test"]

Found cached dataset data_xor (/home2/shashwat.s/main_thread/negation_new/{'cache_dir': 'control_task', 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'd2a449d4e373869e0295a5e1aff1c8052abb878b8e40f9c2241d9639b4a4bf72', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'task_num': 1}/data_xor/default/0.0.0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [79]:
test_control1, test_labels_control1 = get_hidden_states_many_examples(model_wrapped, test_dataset_xor_1, n=10000, batch_size=100)


In [39]:
test_labels_control1

tensor([0, 1, 1,  ..., 1, 1, 0], device='cuda:0')

In [80]:
reports_xor_control_1 = generate_classification_report_all_layers(test_control1, test_labels_control1, models)

In [81]:
for report in reports_xor_control_1[:3]:
    print(report)

              precision    recall  f1-score   support

           0       0.50      0.96      0.66      3350
           1       0.50      0.04      0.08      3350

    accuracy                           0.50      6700
   macro avg       0.50      0.50      0.37      6700
weighted avg       0.50      0.50      0.37      6700

              precision    recall  f1-score   support

           0       0.51      0.70      0.59      3350
           1       0.53      0.33      0.41      3350

    accuracy                           0.52      6700
   macro avg       0.52      0.52      0.50      6700
weighted avg       0.52      0.52      0.50      6700

              precision    recall  f1-score   support

           0       0.55      0.57      0.56      3350
           1       0.55      0.53      0.54      3350

    accuracy                           0.55      6700
   macro avg       0.55      0.55      0.55      6700
weighted avg       0.55      0.55      0.55      6700



In [82]:
for report in reports_xor_control_1[3:6]:
    print(report)

              precision    recall  f1-score   support

           0       0.71      0.78      0.75      3350
           1       0.76      0.69      0.72      3350

    accuracy                           0.74      6700
   macro avg       0.74      0.74      0.74      6700
weighted avg       0.74      0.74      0.74      6700

              precision    recall  f1-score   support

           0       0.77      0.81      0.79      3350
           1       0.80      0.76      0.78      3350

    accuracy                           0.78      6700
   macro avg       0.79      0.78      0.78      6700
weighted avg       0.79      0.78      0.78      6700

              precision    recall  f1-score   support

           0       0.74      0.89      0.81      3350
           1       0.86      0.69      0.76      3350

    accuracy                           0.79      6700
   macro avg       0.80      0.79      0.79      6700
weighted avg       0.80      0.79      0.79      6700



In [83]:
for report in reports_xor_control_1[-3:]:
    print(report)

              precision    recall  f1-score   support

           0       0.67      0.96      0.79      3350
           1       0.93      0.53      0.67      3350

    accuracy                           0.74      6700
   macro avg       0.80      0.74      0.73      6700
weighted avg       0.80      0.74      0.73      6700

              precision    recall  f1-score   support

           0       0.73      0.96      0.83      3350
           1       0.94      0.65      0.77      3350

    accuracy                           0.80      6700
   macro avg       0.84      0.80      0.80      6700
weighted avg       0.84      0.80      0.80      6700

              precision    recall  f1-score   support

           0       0.64      0.89      0.75      3350
           1       0.82      0.50      0.62      3350

    accuracy                           0.70      6700
   macro avg       0.73      0.70      0.68      6700
weighted avg       0.73      0.70      0.68      6700



In [47]:
import datasets
import os
os.statvfs(datasets.config.HF_DATASETS_CACHE)

os.statvfs_result(f_bsize=1048576, f_frsize=1048576, f_blocks=34191575, f_bfree=9798649, f_bavail=8082217, f_files=549253120, f_ffree=360303603, f_favail=360303603, f_flag=4096, f_namemax=255)

# Negation Detection

Very dumb experiment, checking for negation in the statement (from mean-pooled reps)

In [23]:
train_dataset_neg_detect = datasets.load_dataset("data_scripts/data_detect_neg.py", add_sep=False)["train"]
test_dataset_neg_detect = datasets.load_dataset("data_scripts/data_detect_neg.py", add_sep=False)["test"]
dev_dataset_neg_detect = datasets.load_dataset("data_scripts/data_detect_neg.py", add_sep=False)["validation"]

Found cached dataset data_detect_neg (/home/mrcreator/research/main_thread/fresh_repo/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': '1b53f52a30ee637c38ac67e920770d58d6c1da4f3bed41fbe8442d79b6881e56', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_detect_neg/default/0.0.0)
100%|██████████| 3/3 [00:00<00:00, 671.12it/s]
Found cached dataset data_detect_neg (/home/mrcreator/research/main_thread/fresh_repo/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': '1b53f52a30ee637c38ac67e920770d58d6c1da4f3bed41fbe8442d79b6881e56', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_detect_neg/default/0.0.0)
100%|██████████| 3/3 [00:00<00:00, 509.80it/s]
Found cached dataset data_detect_neg (/home/mrcreator/research/main_thread/fresh_repo/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': '1b53f52a30

In [24]:
train_detect_neg, train_labels_det_neg = get_hidden_states_many_examples(model_wrapped, train_dataset_neg_detect, n=2000, batch_size=100)


In [27]:
test_detect_neg, test_labels_det_neg = get_hidden_states_many_examples(model_wrapped, test_dataset_neg_detect, n=10000, batch_size=100)


In [28]:
run_experiment_across_layers(linear_probe_experiment, train_detect_neg, train_labels_det_neg, test_detect_neg, test_labels_det_neg)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  3.32225D+01


 This problem is unconstrained.



At iterate   50    f=  2.04545D+02    |proj g|=  7.64498D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769     77     90      1     0     0   9.888D-04   2.045D+02
  F =   204.53285633775937     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  6.85403D+01


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate   50    f=  1.24328D+02    |proj g|=  2.31646D+00

At iterate  100    f=  1.24240D+02    |proj g|=  4.26313D-02

At iterate  150    f=  1.24191D+02    |proj g|=  1.07950D+00

At iterate  200    f=  1.24187D+02    |proj g|=  1.55634D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    221    257      1     0     0   8.335D-04   1.242D+02
  F =   124.18733996052116     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  8.76241D+01

At iterate   50    f=  1.18364D+02    |proj g|=  1.78367D+00

At iterate  100    f=  1.18296D+02    |proj g|=  2.40666D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    135    152      1     0     0   1.686D-02   1.183D+02
  F =   118.29520162589269     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  9.52775D+01

At iterate   50    f=  1.16833D+02    |proj g|=  1.88923D+00

At iterate  100    f=  1.16655D+02    |proj g|=  1.36041D+00

At iterate  150    f=  1.16652D+02    |proj g|=  2.10000D-01

At iterate  200    f=  1.16599D+02    |proj g|=  2.52958D+00

At iterate  250    f=  1.16533D+02    |proj g|=  2.55485D-01


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s finished



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    288    338      1     0     0   1.466D-03   1.165D+02
  F =   116.53231400271187     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  9.66319D+01

At iterate   50    f=  1.12514D+02    |proj g|=  4.09364D+00

At iterate  100    f=  1.12424D+02    |proj g|=  3.34658D-02

At iterate  150    f=  1.12392D+02    |proj g|=  2.10947D+00

At iterate  200    f=  1.12299D+02    |proj g|=  6.76007D-02

At iterate  250    f=  1.12291D+02    |proj g|=  3.16846D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    261    302      1     0     0   

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  1.05673D+02

At iterate   50    f=  9.97154D+01    |proj g|=  1.16181D+00

At iterate  100    f=  9.94567D+01    |proj g|=  3.38586D-01

At iterate  150    f=  9.94530D+01    |proj g|=  7.67354D-02

At iterate  200    f=  9.93549D+01    |proj g|=  1.76517D+00

At iterate  250    f=  9.92998D+01    |proj g|=  6.80629D-01

At iterate  300    f=  9.92980D+01    |proj g|=  2.56711D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nac

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  1.15866D+02

At iterate   50    f=  9.87568D+01    |proj g|=  1.97864D-01

At iterate  100    f=  9.87074D+01    |proj g|=  1.22188D-01

At iterate  150    f=  9.86906D+01    |proj g|=  1.64428D+00

At iterate  200    f=  9.85642D+01    |proj g|=  8.29998D-02

At iterate  250    f=  9.85567D+01    |proj g|=  1.07981D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    266    304      1     0     0   

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  1.10859D+02

At iterate   50    f=  9.56001D+01    |proj g|=  4.32750D+00

At iterate  100    f=  9.53668D+01    |proj g|=  3.06772D-01

At iterate  150    f=  9.53629D+01    |proj g|=  6.56175D-01

At iterate  200    f=  9.53334D+01    |proj g|=  4.24689D+00

At iterate  250    f=  9.51900D+01    |proj g|=  1.80423D+00

At iterate  300    f=  9.51655D+01    |proj g|=  2.39485D-01

At iterate  350    f=  9.51651D+01    |proj g|=  1.57215D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  1.18782D+02

At iterate   50    f=  9.46269D+01    |proj g|=  4.35112D+00

At iterate  100    f=  9.43411D+01    |proj g|=  8.65839D-01

At iterate  150    f=  9.43328D+01    |proj g|=  3.43572D-01

At iterate  200    f=  9.43028D+01    |proj g|=  2.45027D+00

At iterate  250    f=  9.41723D+01    |proj g|=  4.84260D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    291    343      1     0     0   

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  1.24954D+02

At iterate   50    f=  9.40611D+01    |proj g|=  2.12306D+00

At iterate  100    f=  9.38564D+01    |proj g|=  1.34114D-01

At iterate  150    f=  9.38533D+01    |proj g|=  1.49877D-01

At iterate  200    f=  9.38198D+01    |proj g|=  4.14386D-01

At iterate  250    f=  9.37334D+01    |proj g|=  1.00106D+00

At iterate  300    f=  9.37281D+01    |proj g|=  2.18188D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nac

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  1.23173D+02

At iterate   50    f=  9.79266D+01    |proj g|=  8.36408D+00

At iterate  100    f=  9.75858D+01    |proj g|=  1.26014D-01

At iterate  150    f=  9.75823D+01    |proj g|=  1.26635D-01

At iterate  200    f=  9.75009D+01    |proj g|=  7.56717D-01

At iterate  250    f=  9.74839D+01    |proj g|=  1.18486D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    265    315      1     0     0   

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  1.33309D+02

At iterate   50    f=  1.11224D+02    |proj g|=  9.15769D-01

At iterate  100    f=  1.10809D+02    |proj g|=  1.17546D+00

At iterate  150    f=  1.10801D+02    |proj g|=  4.81832D-02

At iterate  200    f=  1.10794D+02    |proj g|=  3.47335D-01

At iterate  250    f=  1.10709D+02    |proj g|=  5.41799D-01

At iterate  300    f=  1.10635D+02    |proj g|=  1.88228D-01

At iterate  350    f=  1.10633D+02    |proj g|=  7.05715D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  6.48358D+01

At iterate   50    f=  2.27067D+02    |proj g|=  2.24334D+00

At iterate  100    f=  2.26869D+02    |proj g|=  1.10526D+00

At iterate  150    f=  2.26733D+02    |proj g|=  4.45661D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    170    203      1     0     0   1.872D-02   2.267D+02
  F =   226.73187817552341     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.5s finished


([LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1),
  LogisticRegression(class_weight='balanced', max_iter=1000, verbose=1)],
 [{'accuracy': 0.9832},
  {'accuracy': 0.9828},
  

# Negation Consistency Probe

In [2]:
train_dataset_neg_consistency = datasets.load_dataset("data_scripts/data_negation_consistency.py", add_sep=False)["train"]
test_dataset_neg_consistency = datasets.load_dataset("data_scripts/data_negation_consistency.py", add_sep=False)["test"]
dev_dataset_neg_consistency = datasets.load_dataset("data_scripts/data_negation_consistency.py", add_sep=False)["validation"]

Old caching folder {'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'b61b143f8583c778d436a1730d12372367c64ff9c71127a6af2f230568607fe8', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_negation_consistency/default/0.0.0 for dataset data_negation_consistency exists but not data were found. Removing it. 


Downloading and preparing dataset data_negation_consistency/default to /home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'b61b143f8583c778d436a1730d12372367c64ff9c71127a6af2f230568607fe8', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_negation_consistency/default/0.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset data_negation_consistency downloaded and prepared to /home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'b61b143f8583c778d436a1730d12372367c64ff9c71127a6af2f230568607fe8', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_negation_consistency/default/0.0.0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset data_negation_consistency (/home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'b61b143f8583c778d436a1730d12372367c64ff9c71127a6af2f230568607fe8', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_negation_consistency/default/0.0.0)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset data_negation_consistency (/home2/shashwat.s/main_thread/negation_new/{'cache_dir': None, 'config_name': None, 'data_dir': None, 'data_files': None, 'hash': 'b61b143f8583c778d436a1730d12372367c64ff9c71127a6af2f230568607fe8', 'features': None, 'use_auth_token': None, 'base_path': 'data_scripts', 'add_sep': False}/data_negation_consistency/default/0.0.0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [48]:
# checking for statistics
len(train_dataset_neg_consistency['label'])
total = 0
ones = 0

for label in train_dataset_neg_consistency['label']:
    total += 1
    ones += label

print(total)
print(ones)
print(ones/total)

23450
10066
0.4292537313432836


In [49]:
# checking for statistics
len(test_dataset_neg_consistency['label'])
total = 0
ones = 0

for label in test_dataset_neg_consistency['label']:
    total += 1
    ones += label

print(total)
print(ones)
print(ones/total)

5026
2160
0.4297652208515718


In [3]:
len(set(train_dataset_neg_consistency['subject']).intersection(set(test_dataset_neg_consistency['subject'])))

133

In [12]:
train_detect_neg_consistency, train_labels_det_neg_consistency = get_hidden_states_many_examples(model_wrapped, train_dataset_neg_consistency, n=23000, batch_size=100, query_column="subject")

In [13]:
test_detect_neg_consistency, test_labels_det_neg_consistency = get_hidden_states_many_examples(model_wrapped, test_dataset_neg_consistency, n=5000, batch_size=100, query_column="subject")

In [11]:
linear_classifiers, accuracies = run_experiment_across_layers(linear_probe_experiment, train_detect_neg_consistency, train_labels_det_neg_consistency, test_detect_neg_consistency, test_labels_det_neg_consistency)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  2.73022D+01

At iterate   50    f=  9.52618D+02    |proj g|=  7.16117D-01

At iterate  100    f=  9.51333D+02    |proj g|=  2.73071D-01

At iterate  150    f=  9.51328D+02    |proj g|=  1.97448D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    166    193      1     0     0   3.737D-03   9.513D+02
  F =   951.32789911322618     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  6.49952D+01

At iterate   50    f=  8.71338D+02    |proj g|=  2.90839D+01

At iterate  100    f=  8.62683D+02    |proj g|=  9.23039D+00

At iterate  150    f=  8.62314D+02    |proj g|=  6.76100D-01

At iterate  200    f=  8.62284D+02    |proj g|=  1.71411D+00

At iterate  250    f=  8.62259D+02    |proj g|=  1.01424D+00

At iterate  300    f=  8.62191D+02    |proj g|=  1.09729D+00

At iterate  350    f=  8.62130D+02    |proj g|=  1.14203D+00

At iterate  400    f=  8.62085D+02    |proj g|=  6.22453D-01

At iterate  450    f=  8.62079D+02    |proj g|=  2.11507D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = nu

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate   50    f=  9.01576D+02    |proj g|=  9.19384D+01

At iterate  100    f=  8.87601D+02    |proj g|=  1.69773D+01

At iterate  150    f=  8.86061D+02    |proj g|=  7.19906D-01

At iterate  200    f=  8.85713D+02    |proj g|=  1.43419D+01

At iterate  250    f=  8.85675D+02    |proj g|=  2.25672D-01

At iterate  300    f=  8.85664D+02    |proj g|=  4.26744D-01

At iterate  350    f=  8.85649D+02    |proj g|=  1.12485D+00

At iterate  400    f=  8.85606D+02    |proj g|=  1.80670D-01

At iterate  450    f=  8.85521D+02    |proj g|=  3.69300D+00

At iterate  500    f=  8.85462D+02    |proj g|=  1.92564D+00

At iterate  550    f=  8.85436D+02    |proj g|=  3.13820D-01

At iterate  600    f=  8.85433D+02    |proj g|=  1.38952D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate  100    f=  8.87903D+02    |proj g|=  3.93771D+01

At iterate  150    f=  8.85338D+02    |proj g|=  4.49524D+00

At iterate  200    f=  8.84884D+02    |proj g|=  8.12346D+00

At iterate  250    f=  8.84783D+02    |proj g|=  6.64434D+00

At iterate  300    f=  8.84761D+02    |proj g|=  1.40662D-01

At iterate  350    f=  8.84743D+02    |proj g|=  2.50080D+00

At iterate  400    f=  8.84687D+02    |proj g|=  1.16808D+00

At iterate  450    f=  8.84634D+02    |proj g|=  6.42694D+00

At iterate  500    f=  8.84451D+02    |proj g|=  5.44426D+00

At iterate  550    f=  8.84223D+02    |proj g|=  8.24968D+00

At iterate  600    f=  8.84126D+02    |proj g|=  1.22415D+00

At iterate  650    f=  8.84098D+02    |proj g|=  8.69084D-01

At iterate  700    f=  8.84089D+02    |proj g|=  1.53391D+00

At iterate  750    f=  8.84085D+02    |proj g|=  8.74724D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  7.84512D+01

At iterate   50    f=  9.58128D+02    |proj g|=  4.64250D+01

At iterate  100    f=  9.27130D+02    |proj g|=  7.25583D+01

At iterate  150    f=  9.14216D+02    |proj g|=  4.79869D+00

At iterate  200    f=  9.12305D+02    |proj g|=  1.10416D+01

At iterate  250    f=  9.12125D+02    |proj g|=  6.78876D+00

At iterate  300    f=  9.12089D+02    |proj g|=  7.69401D+00

At iterate  350    f=  9.12075D+02    |proj g|=  6.62752D-01

At iterate  400    f=  9.12039D+02    |proj g|=  4.75065D+00

At iterate  450    f=  9.11974D+02    |proj g|=  6.32191D-01

At iterate  500    f=  9.11811D+02    |proj g|=  6.63213D+00

At iterate  550    f=  9.11540D+02    |proj g|=  2.79364D+01

At iterate  600    f=  9.11392D+02    |proj g|=  2.10869D+00

At iterate  650    f=  9.1

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate  100    f=  9.38561D+02    |proj g|=  1.07092D+02

At iterate  150    f=  9.24857D+02    |proj g|=  3.29961D+01

At iterate  200    f=  9.19097D+02    |proj g|=  2.27966D+00

At iterate  250    f=  9.18317D+02    |proj g|=  1.10955D+01

At iterate  300    f=  9.18065D+02    |proj g|=  4.58017D+00

At iterate  350    f=  9.17941D+02    |proj g|=  4.98469D-01

At iterate  400    f=  9.17915D+02    |proj g|=  6.85732D-01

At iterate  450    f=  9.17905D+02    |proj g|=  5.86384D-01

At iterate  500    f=  9.17897D+02    |proj g|=  1.32474D+00

At iterate  550    f=  9.17876D+02    |proj g|=  2.08482D-01

At iterate  600    f=  9.17831D+02    |proj g|=  1.32920D+00

At iterate  650    f=  9.17797D+02    |proj g|=  8.77780D+00

At iterate  700    f=  9.17761D+02    |proj g|=  1.90957D+00

At iterate  750    f=  9.17731D+02    |proj g|=  4.64851D+00

At iterate  800    f=  9.17696D+02    |proj g|=  4.85197D+00

At iterate  850    f=  9.17609D+02    |proj g|=  1.29428D+00

At iter

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  9.45374D+01

At iterate   50    f=  9.71952D+02    |proj g|=  2.57800D+01

At iterate  100    f=  9.28865D+02    |proj g|=  1.26839D+01

At iterate  150    f=  9.24453D+02    |proj g|=  1.23432D+01

At iterate  200    f=  9.23265D+02    |proj g|=  6.98147D+00

At iterate  250    f=  9.22942D+02    |proj g|=  4.56957D+00

At iterate  300    f=  9.22869D+02    |proj g|=  2.57783D-01

At iterate  350    f=  9.22845D+02    |proj g|=  6.96605D-01

At iterate  400    f=  9.22839D+02    |proj g|=  9.63007D-02

At iterate  450    f=  9.22837D+02    |proj g|=  1.11803D+00

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = nu

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate  150    f=  9.18712D+02    |proj g|=  4.49625D+01

At iterate  200    f=  9.13498D+02    |proj g|=  1.07204D+01

At iterate  250    f=  9.12722D+02    |proj g|=  7.03243D+00

At iterate  300    f=  9.12579D+02    |proj g|=  1.08591D+00

At iterate  350    f=  9.12528D+02    |proj g|=  1.57841D+01

At iterate  400    f=  9.12513D+02    |proj g|=  1.50616D+00

At iterate  450    f=  9.12504D+02    |proj g|=  2.35127D+00

At iterate  500    f=  9.12502D+02    |proj g|=  8.67843D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    500    581      1     0     0   8.678D-01   9.125D+02
  F =   912.502313600

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate   50    f=  9.13284D+02    |proj g|=  3.47340D+00

At iterate  100    f=  9.01773D+02    |proj g|=  7.74643D+00

At iterate  150    f=  9.00946D+02    |proj g|=  3.61028D+00

At iterate  200    f=  9.00764D+02    |proj g|=  7.55689D+00

At iterate  250    f=  9.00675D+02    |proj g|=  6.50622D+00

At iterate  300    f=  9.00639D+02    |proj g|=  2.94627D+00

At iterate  350    f=  9.00628D+02    |proj g|=  3.65034D+00

At iterate  400    f=  9.00616D+02    |proj g|=  6.32066D-01

At iterate  450    f=  9.00608D+02    |proj g|=  5.20356D+00

At iterate  500    f=  9.00547D+02    |proj g|=  3.07591D+00

At iterate  550    f=  9.00408D+02    |proj g|=  2.60152D+00

At iterate  600    f=  9.00305D+02    |proj g|=  5.88589D+00

At iterate  650    f=  9.00219D+02    |proj g|=  6.07319D+00

At iterate  700    f=  9.00185D+02    |proj g|=  4.88959D-01

At iterate  750    f=  9.00174D+02    |proj g|=  6.33497D-01

At iterate  800    f=  9.00168D+02    |proj g|=  1.18751D-01

       

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =          769     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.38629D+03    |proj g|=  8.29858D+01

At iterate   50    f=  9.29043D+02    |proj g|=  1.24939D+01

At iterate  100    f=  8.95504D+02    |proj g|=  3.94853D+00

At iterate  150    f=  8.90509D+02    |proj g|=  3.61422D+00

At iterate  200    f=  8.89900D+02    |proj g|=  4.75914D+00

At iterate  250    f=  8.89849D+02    |proj g|=  5.60374D+00

At iterate  300    f=  8.89831D+02    |proj g|=  5.14311D+00

At iterate  350    f=  8.89820D+02    |proj g|=  1.36623D+00

At iterate  400    f=  8.89815D+02    |proj g|=  1.27540D-01

At iterate  450    f=  8.89809D+02    |proj g|=  4.48564D-01

At iterate  500    f=  8.89796D+02    |proj g|=  3.14158D+00

At iterate  550    f=  8.89774D+02    |proj g|=  1.98201D+00

At iterate  600    f=  8.89701D+02    |proj g|=  1.70395D+00

At iterate  650    f=  8.8

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate  150    f=  8.83617D+02    |proj g|=  5.11324D+01

At iterate  200    f=  8.78725D+02    |proj g|=  2.22912D+01

At iterate  250    f=  8.77696D+02    |proj g|=  6.66085D+00

At iterate  300    f=  8.77319D+02    |proj g|=  3.97054D+00

At iterate  350    f=  8.77232D+02    |proj g|=  4.53534D-01

At iterate  400    f=  8.77212D+02    |proj g|=  3.76740D+00

At iterate  450    f=  8.77207D+02    |proj g|=  1.19489D+00

At iterate  500    f=  8.77191D+02    |proj g|=  2.06597D+00

At iterate  550    f=  8.77148D+02    |proj g|=  1.61761D+00

At iterate  600    f=  8.77067D+02    |proj g|=  9.64368D+00

At iterate  650    f=  8.76872D+02    |proj g|=  1.68948D+00

At iterate  700    f=  8.76811D+02    |proj g|=  2.23866D+00

At iterate  750    f=  8.76779D+02    |proj g|=  4.30763D+00

At iterate  800    f=  8.76772D+02    |proj g|=  4.63838D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate   50    f=  9.17670D+02    |proj g|=  2.83455D+01

At iterate  100    f=  8.68328D+02    |proj g|=  2.45152D+01

At iterate  150    f=  8.57536D+02    |proj g|=  2.10658D+01

At iterate  200    f=  8.55610D+02    |proj g|=  2.33572D+01

At iterate  250    f=  8.54972D+02    |proj g|=  3.93009D+00

At iterate  300    f=  8.54867D+02    |proj g|=  2.71886D+00

At iterate  350    f=  8.54833D+02    |proj g|=  3.95704D+00

At iterate  400    f=  8.54823D+02    |proj g|=  2.83478D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    445    526      1     0     0   1.063D-01   8.548D+02
  F =   854.820667562

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
 This problem is unconstrained.



At iterate  200    f=  1.00386D+03    |proj g|=  2.68867D-01

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
  769    219    259      1     0     0   3.523D-01   1.004D+03
  F =   1003.8550754738139     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished


In [14]:
mlps, accuracies = run_experiment_across_layers(mlp_probe_experiment, train_detect_neg_consistency, train_labels_det_neg_consistency, test_detect_neg_consistency, test_labels_det_neg_consistency)


Iteration 1, loss = 0.63604699
Iteration 2, loss = 0.59836796
Iteration 3, loss = 0.57488126
Iteration 4, loss = 0.55649764
Iteration 5, loss = 0.54200105
Iteration 6, loss = 0.52694793
Iteration 7, loss = 0.51097616
Iteration 8, loss = 0.49570936
Iteration 9, loss = 0.46995626
Iteration 10, loss = 0.45055278
Iteration 11, loss = 0.43736771
Iteration 12, loss = 0.43190333
Iteration 13, loss = 0.40215087
Iteration 14, loss = 0.37750309
Iteration 15, loss = 0.34973615
Iteration 16, loss = 0.32980012
Iteration 17, loss = 0.31261986
Iteration 18, loss = 0.29195215
Iteration 19, loss = 0.27289586
Iteration 20, loss = 0.25857880
Iteration 21, loss = 0.24016809
Iteration 22, loss = 0.23006969
Iteration 23, loss = 0.20569809
Iteration 24, loss = 0.19178704
Iteration 25, loss = 0.18555150
Iteration 26, loss = 0.17421476
Iteration 27, loss = 0.16494094
Iteration 28, loss = 0.15037308
Iteration 29, loss = 0.14145178
Iteration 30, loss = 0.13717608
Iteration 31, loss = 0.12523073
Iteration 32, los

In [15]:
reports = generate_classification_report_all_layers(test_detect_neg_consistency, test_labels_det_neg_consistency, mlps)

In [18]:
for report in reports[:3]:
    print(report)

              precision    recall  f1-score   support

           0       0.67      0.78      0.72       954
           1       0.63      0.48      0.55       721

    accuracy                           0.65      1675
   macro avg       0.65      0.63      0.63      1675
weighted avg       0.65      0.65      0.65      1675

              precision    recall  f1-score   support

           0       0.67      0.72      0.69       954
           1       0.59      0.53      0.56       721

    accuracy                           0.64      1675
   macro avg       0.63      0.63      0.63      1675
weighted avg       0.64      0.64      0.64      1675

              precision    recall  f1-score   support

           0       0.71      0.70      0.71       954
           1       0.61      0.62      0.61       721

    accuracy                           0.67      1675
   macro avg       0.66      0.66      0.66      1675
weighted avg       0.67      0.67      0.67      1675



In [19]:
for report in reports[3:6]:
    print(report)

              precision    recall  f1-score   support

           0       0.71      0.72      0.71       954
           1       0.62      0.60      0.61       721

    accuracy                           0.67      1675
   macro avg       0.66      0.66      0.66      1675
weighted avg       0.67      0.67      0.67      1675

              precision    recall  f1-score   support

           0       0.69      0.74      0.72       954
           1       0.62      0.56      0.59       721

    accuracy                           0.66      1675
   macro avg       0.66      0.65      0.65      1675
weighted avg       0.66      0.66      0.66      1675

              precision    recall  f1-score   support

           0       0.69      0.76      0.72       954
           1       0.63      0.54      0.58       721

    accuracy                           0.67      1675
   macro avg       0.66      0.65      0.65      1675
weighted avg       0.66      0.67      0.66      1675



In [20]:
for report in reports[6:9]:
    print(report)

              precision    recall  f1-score   support

           0       0.68      0.76      0.72       954
           1       0.62      0.53      0.57       721

    accuracy                           0.66      1675
   macro avg       0.65      0.64      0.64      1675
weighted avg       0.65      0.66      0.65      1675

              precision    recall  f1-score   support

           0       0.68      0.76      0.72       954
           1       0.62      0.53      0.57       721

    accuracy                           0.66      1675
   macro avg       0.65      0.64      0.64      1675
weighted avg       0.66      0.66      0.65      1675

              precision    recall  f1-score   support

           0       0.69      0.77      0.73       954
           1       0.64      0.55      0.59       721

    accuracy                           0.67      1675
   macro avg       0.66      0.66      0.66      1675
weighted avg       0.67      0.67      0.67      1675



In [22]:
for report in reports[9:12]:
    print(report)

              precision    recall  f1-score   support

           0       0.72      0.70      0.71       954
           1       0.62      0.64      0.63       721

    accuracy                           0.67      1675
   macro avg       0.67      0.67      0.67      1675
weighted avg       0.68      0.67      0.67      1675

              precision    recall  f1-score   support

           0       0.70      0.75      0.72       954
           1       0.63      0.57      0.60       721

    accuracy                           0.67      1675
   macro avg       0.66      0.66      0.66      1675
weighted avg       0.67      0.67      0.67      1675

              precision    recall  f1-score   support

           0       0.69      0.72      0.71       954
           1       0.61      0.58      0.59       721

    accuracy                           0.66      1675
   macro avg       0.65      0.65      0.65      1675
weighted avg       0.66      0.66      0.66      1675



In [23]:
for report in reports[12:]:
    print(report)

              precision    recall  f1-score   support

           0       0.69      0.75      0.72       954
           1       0.63      0.57      0.59       721

    accuracy                           0.67      1675
   macro avg       0.66      0.66      0.66      1675
weighted avg       0.67      0.67      0.67      1675



In [46]:
test_dataset_neg_consistency["masked_non_negated"][:5000]

['Piero Gobetti used to communicate in <mask>.',
 'The original language of Thulladha Manamum Thullum is <mask>.',
 'Fiat Siena is produced by <mask>.',
 'ferric phosphate consists of <mask>.',
 'Carlos Fuentes used to communicate in <mask>.',
 'National Assembly of Hungary is a legal term in <mask>.',
 'Dennistoun Glacier is located in <mask>.',
 'Sasha Krasny died in the city of <mask>.',
 'Pakistan shares border with <mask>.',
 'Dream with Me was written in <mask>.',
 'Hamilton Central railway station is named after <mask>.',
 'West Royalty, Prince Edward Island is located in <mask>.',
 'Dyfed Archaeological Trust works in the field of <mask>.',
 'Florian Henckel von Donnersmarck used to communicate in <mask>.',
 'Edmund Hobhouse was born in the city of <mask>.',
 'The official language of Idaho is <mask>.',
 'Sir Thomas Buxton, 1st Baronet used to work in <mask>.',
 'Pro-feminism is part of <mask>.',
 'Yotam Halperin used to communicate in <mask>.',
 'Newfoundland expedition is loc

In [35]:
train_dataset_neg_detect["content"][:5000]


['Library of Alexandria',
 'Bruno Racine',
 'Salimuzzaman Siddiqui',
 "Kay O'Brien",
 'Antim Peak',
 'Adobe Media Player',
 'Cauchy stress tensor',
 'Raymond Queneau',
 'Sweden',
 'Chrysler K platform',
 'El Siglo Futuro',
 'Johan Gustaf Sandberg',
 'Fiat Uno',
 'Louisiana Voodoo',
 'Jean-Baptiste Henri Lacordaire',
 'aerial photography',
 'Chrysalis',
 'Charles Augustin Sainte-Beuve',
 'Super Monaco GP',
 'Abraham Blooteling',
 'Edward Bulwer-Lytton',
 "Pourvu qu'elles soient douces",
 'The Bill Cosby Show',
 'Pauline Mills McGibbon',
 'Maryland General Assembly',
 'Average Joe',
 'Singapore International Film Festival',
 'Munich Re',
 'Delhi Technological University',
 'Ukrainian Railways',
 'President of Vietnam',
 'BBC Persian Television',
 'Mishor Rohoshyo',
 'Super Monaco GP',
 'Western Canada Concept',
 'Sardinian Judicati',
 'Sony Crackle',
 'Christian Wilhelm Blomstrand',
 'Florence Regional Airport',
 'Little Child',
 'nickel silver',
 'Jan van Krimpen',
 'Uppslagsverket Finl

## Negation consistency given entire sentence

In [47]:
train_detect_neg_consistency_ful, train_labels_det_neg_consistency = get_hidden_states_many_examples(model_wrapped, train_dataset_neg_consistency, n=23000, batch_size=100, query_column="masked_non_negated")


In [49]:
test_detect_neg_consistency_ful, test_labels_det_neg_consistency = get_hidden_states_many_examples(model_wrapped, test_dataset_neg_consistency, n=5000, batch_size=100, query_column="masked_non_negated")


In [50]:
mlps, accuracies = run_experiment_across_layers(mlp_probe_experiment, train_detect_neg_consistency_ful, train_labels_det_neg_consistency, test_detect_neg_consistency_ful, test_labels_det_neg_consistency)



Iteration 1, loss = 0.56997413
Iteration 2, loss = 0.50857677
Iteration 3, loss = 0.47857090
Iteration 4, loss = 0.46597085
Iteration 5, loss = 0.45632371
Iteration 6, loss = 0.44836501
Iteration 7, loss = 0.44234312
Iteration 8, loss = 0.44229012
Iteration 9, loss = 0.43114708
Iteration 10, loss = 0.42617749
Iteration 11, loss = 0.42134867
Iteration 12, loss = 0.43761238
Iteration 13, loss = 0.41449206
Iteration 14, loss = 0.40155792
Iteration 15, loss = 0.39121109
Iteration 16, loss = 0.39435257
Iteration 17, loss = 0.37953237
Iteration 18, loss = 0.37719408
Iteration 19, loss = 0.36938035
Iteration 20, loss = 0.36610236
Iteration 21, loss = 0.35348845
Iteration 22, loss = 0.34967081
Iteration 23, loss = 0.35020981
Iteration 24, loss = 0.33993237
Iteration 25, loss = 0.32915414
Iteration 26, loss = 0.31814559
Iteration 27, loss = 0.31470023
Iteration 28, loss = 0.30567636
Iteration 29, loss = 0.29465151
Iteration 30, loss = 0.29191080
Iteration 31, loss = 0.28544499
Iteration 32, los

In [53]:
reports = generate_classification_report_all_layers(test_detect_neg_consistency_ful, test_labels_det_neg_consistency, mlps)


In [55]:
for report in reports[0:3]:
    print(report)

              precision    recall  f1-score   support

           0       0.79      0.80      0.79       954
           1       0.73      0.71      0.72       721

    accuracy                           0.76      1675
   macro avg       0.76      0.75      0.76      1675
weighted avg       0.76      0.76      0.76      1675

              precision    recall  f1-score   support

           0       0.77      0.83      0.80       954
           1       0.75      0.68      0.71       721

    accuracy                           0.77      1675
   macro avg       0.76      0.76      0.76      1675
weighted avg       0.76      0.77      0.76      1675

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       954
           1       0.73      0.72      0.72       721

    accuracy                           0.76      1675
   macro avg       0.76      0.76      0.76      1675
weighted avg       0.76      0.76      0.76      1675



In [54]:
for report in reports[6:9]:
    print(report)

              precision    recall  f1-score   support

           0       0.82      0.83      0.82       954
           1       0.77      0.76      0.76       721

    accuracy                           0.80      1675
   macro avg       0.79      0.79      0.79      1675
weighted avg       0.80      0.80      0.80      1675

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       954
           1       0.74      0.76      0.75       721

    accuracy                           0.78      1675
   macro avg       0.78      0.78      0.78      1675
weighted avg       0.78      0.78      0.78      1675

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       954
           1       0.76      0.74      0.75       721

    accuracy                           0.79      1675
   macro avg       0.78      0.78      0.78      1675
weighted avg       0.79      0.79      0.79      1675



In [56]:
for report in reports[9:]:
    print(report)

              precision    recall  f1-score   support

           0       0.82      0.83      0.83       954
           1       0.77      0.76      0.77       721

    accuracy                           0.80      1675
   macro avg       0.80      0.80      0.80      1675
weighted avg       0.80      0.80      0.80      1675

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       954
           1       0.78      0.73      0.75       721

    accuracy                           0.80      1675
   macro avg       0.79      0.79      0.79      1675
weighted avg       0.80      0.80      0.80      1675

              precision    recall  f1-score   support

           0       0.81      0.84      0.82       954
           1       0.77      0.73      0.75       721

    accuracy                           0.79      1675
   macro avg       0.79      0.79      0.79      1675
weighted avg       0.79      0.79      0.79      1675

              preci