In [None]:
import os
# Global seed: passed to set_seed() below and reused by the probe-training code.
seed = 7
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED']=str(seed)
# These variables are set before torch is imported further down.
# Presumably CUBLAS_WORKSPACE_CONFIG is needed for deterministic cuBLAS kernels
# (see torch.use_deterministic_algorithms in set_seed) - TODO confirm.
os.environ["CUBLAS_WORKSPACE_CONFIG"]=":4096:8" 
os.environ["CUDA_LAUNCH_BLOCKING"]="1" 

In [None]:
!git clone https://github.com/fdalvi/NeuroX

In [None]:
import sys
# Extra import roots: the NeuroX checkout cloned into the working directory.
# Presumably needed so the library's own internal imports resolve - confirm.
package_paths = [
    '/kaggle/working/NeuroX/',
]

# Append (rather than prepend) so already-installed packages keep priority.
for pth in package_paths:
    sys.path.append(pth)

In [None]:
import re
import numpy as np
from copy import deepcopy
from pathlib import Path
from collections import OrderedDict
from IPython.display import clear_output

In [None]:
import torch
torch.__version__

In [None]:
import random

In [None]:
def set_seed(seed):
    """Seed every RNG used in this notebook and force deterministic PyTorch kernels.

    Args:
        seed: Integer applied to Python's ``random``, NumPy and PyTorch
            (CPU and CUDA); also exported as ``PYTHONHASHSEED``.
    """
    # Seed all random number generators with the same value.
    seeders = (
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Disable cuDNN autotuning and request deterministic algorithms only.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms(True)
    
    
set_seed(seed)

In [None]:
# Limit PyTorch to a single CPU thread (presumably to keep floating-point
# reductions reproducible between runs - confirm).
torch.set_num_threads(1)

In [None]:
# Directory that is walked for the input probing files.
data_path = '/kaggle/input/taiga-pos'
# Root under which the generated splits and activation files are written.
path_work = '/kaggle/working/'

In [None]:
from NeuroX.neurox.data.extraction import transformers_extractor

Конвертируем файл, который получили из Probing_framework (уже получили, не в этом ноутбуке)

In [None]:
# Seed used by ConvertSample when shuffling the data; kept separate from the
# training `seed` defined at the top of the notebook.
data_seed = 12345

In [None]:
class ConvertSample:
    """
    Converts a tab-separated probing file into balanced train / test ``.txt`` files.

    Every input line is expected to hold at least three tab-separated fields:
    ``<part>``, ``<label>``, ``<sentence>``, where ``part`` is ``tr`` for
    training data and ``te`` or ``va`` for data that ends up in the test split.

    Besides the real train split, a "control task" train split is produced that
    contains the same sentences with labels shuffled between them.

    Args:
        path: Path to the input file; its name must end in ``<word>_<word>.csv``,
            which becomes ``self.category``.
        train_size: Target size of the train split; it is divided evenly between
            the labels that occur in both train and test data.
        test_size: Stored on the instance but currently not used - the test split
            keeps every eligible sentence.
        shuffle: Whether to shuffle the input lines and the resulting splits
            (always with the module-level ``data_seed``).

    Raises:
        ValueError: If no category name can be derived from ``path``.
    """

    def __init__(self, path, train_size=2500, test_size=900, shuffle: bool = True):

        self.shuffle = shuffle
        self.path = path
        # .parent behaves like .parents[0] but does not fail when the cwd is "/".
        self.project_path = str(Path(os.getcwd()).parent)
        category_match = re.search(r'[a-zA-Z]+_[a-zA-Z]+(?=\.csv)', path)
        if category_match is None:
            raise ValueError(f"Cannot derive a category name from path: {path!r}")
        self.category = category_match[0]
        self.train_size = train_size
        self.test_size = test_size

    def read(self) -> list:
        """Return the file's lines split on tabs, shuffled with ``data_seed`` if enabled."""
        with open(self.path, encoding="utf-8") as f:
            lines = [line.split('\t') for line in f]

        if self.shuffle:
            random.seed(data_seed)
            random.shuffle(lines)

        return lines

    def stupid_cycle(self, values, dct, number) -> dict:
        """Sampling helper: keep at most ``number`` items of ``dct`` per label.

        The result is grouped by label in the order given by ``values``; within
        a label the original order of ``dct`` is preserved.
        """
        dict_filter = OrderedDict()

        for value in values:
            taken = 0
            for k, v in dct.items():
                if taken >= number:
                    # Quota for this label is full; no need to scan the rest.
                    break
                if v == value:
                    dict_filter[k] = v
                    taken += 1

        return dict_filter

    def stupid_test(self, values, dct) -> dict:
        """Sampling helper: keep every item of ``dct`` whose label is in ``values``.

        The result is grouped by label in the order given by ``values``.
        """
        dict_filter = OrderedDict()

        for value in values:
            for k, v in dct.items():
                if v == value:
                    dict_filter[k] = v

        return dict_filter

    def stupid_sampler(self) -> tuple:
        """Build the balanced train dict and the test dict (sentence -> label).

        Only sentences with 3 to 34 whitespace-separated tokens are used and only
        labels that occur in both splits are kept. Lines with fewer than three
        fields are skipped. A sentence that occurs in the training data is never
        placed in the test split, regardless of the order of the input lines.

        Returns:
            ``(train_dict, test_dict)`` as ``OrderedDict`` objects.

        Raises:
            ValueError: If train and test data share no label.
        """
        train_rows = []
        test_rows = []

        for line in self.read():
            if len(line) < 3:
                # Blank or malformed line: nothing usable in it.
                continue
            part, value, sentence = line[0], line[1], line[2]
            if not 2 < len(sentence.split()) < 35:
                continue
            if part == 'tr':
                train_rows.append((sentence, value))
            elif part == 'te' or part == 'va':
                test_rows.append((sentence, value))

        # First occurrence of a sentence wins; dict lookups keep this linear.
        train_dict = OrderedDict()
        for sentence, value in train_rows:
            train_dict.setdefault(sentence, value)

        # Filter against the *complete* train set so that a test line read before
        # its train duplicate cannot leak into the test split.
        test_dict = OrderedDict()
        for sentence, value in test_rows:
            if sentence not in train_dict:
                test_dict.setdefault(sentence, value)

        values = sorted(set(train_dict.values()) & set(test_dict.values()))
        if not values:
            raise ValueError(
                f"No label occurs in both the train and the test part of {self.path!r}"
            )

        number_one = round(self.train_size / len(values))

        dict_filter_train = self.stupid_cycle(values, train_dict, number_one)
        dict_filter_test = self.stupid_test(values, test_dict)

        return dict_filter_train, dict_filter_test

    def permute(self, dct) -> dict:
        """Return a copy of ``dct`` with its items shuffled (seeded with ``data_seed``)."""
        l = list(dct.items())
        random.seed(data_seed)
        random.shuffle(l)
        return OrderedDict(l)

    def using_shuffle(self, a):
        """Return a copy of ``a`` with the same key order but the values shuffled among the keys."""
        keys = list(a.keys())
        values = list(a.values())
        random.seed(data_seed)
        random.shuffle(values)
        d = OrderedDict(zip(keys, values))
        return d

    def create_dicts(self):
        """Return ``(train_dict, test_dict, control_task_dict)``.

        The control-task dict has the same sentences, in the same order, as the
        train dict, but with labels shuffled between sentences.
        """
        dict_filter_train, dict_filter_test = self.stupid_sampler()

        if self.shuffle:
            dict_filter_train = self.permute(dict_filter_train)
            dict_filter_test = self.permute(dict_filter_test)

        dict_control_task = dict_filter_train.copy()
        dict_control_task = self.using_shuffle(dict_control_task)

        return dict_filter_train, dict_filter_test, dict_control_task

    def create_paths(self) -> tuple:
        """Create the output directory and return the six output file paths.

        Returns:
            ``(datatrain, labeltrain, cdatatrain, clabeltrain, datatest, labeltest)``.
        """
        # Check the match object itself: indexing a failed search raises, which
        # previously made the fallback directory unreachable.
        dataset_match = re.search(r'(?<=\/)[a-zA-Z][a-zA-Z]_[a-zA-Z]+(?=_)', self.path)
        if dataset_match:
            dataset = dataset_match[0]
            path = path_work+f'/large_data_{dataset}'
        else:
            path = path_work+'/large_data'

        os.makedirs(path+f'/data_{self.category}', exist_ok=True)

        result_path_datatrain = path+f"/data_{self.category}/datatrain_{self.category}.txt"
        result_path_labeltrain = path+f"/data_{self.category}/labeltrain_{self.category}.txt"

        result_path_cdatatrain = path+f"/data_{self.category}/cdatatrain_{self.category}.txt"
        result_path_clabeltrain = path+f"/data_{self.category}/clabeltrain_{self.category}.txt"

        result_path_datatest = path+f"/data_{self.category}/datatest_{self.category}.txt"
        result_path_labeltest = path+f"/data_{self.category}/labeltest_{self.category}.txt"

        return result_path_datatrain, result_path_labeltrain, result_path_cdatatrain, result_path_clabeltrain, \
               result_path_datatest, result_path_labeltest

    @staticmethod
    def _write_pairs(data_file, label_file, pairs):
        """Write one sentence per line to ``data_file`` and its label to ``label_file``."""
        for sentence, value in pairs.items():
            # The sentence normally still carries the newline of its input line;
            # add one when it does not so that sentences never run together.
            data_file.write(sentence if sentence.endswith('\n') else sentence + '\n')
            label_file.write(value + '\n')

    def writer(self) -> tuple:
        """
        Write the train, control-task and test splits to disk.

        Returns:
            The six file paths produced by ``create_paths``.
        """
        result_datatrain, result_labeltrain, result_cdatatrain, result_clabeltrain, result_datatest, result_labeltest = self.create_paths()

        dict_filter_train, dict_filter_test, dict_control_task = self.create_dicts()

        with open(result_datatrain, "w", encoding="utf-8") as traindata, \
             open(result_labeltrain, "w", encoding="utf-8") as trainlabel, \
             open(result_cdatatrain, "w", encoding="utf-8") as ctraindata, \
             open(result_clabeltrain, "w", encoding="utf-8") as ctrainlabel, \
             open(result_datatest, "w", encoding="utf-8") as testdata, \
             open(result_labeltest, "w", encoding="utf-8") as testlabel:

            self._write_pairs(traindata, trainlabel, dict_filter_train)
            self._write_pairs(ctraindata, ctrainlabel, dict_control_task)
            self._write_pairs(testdata, testlabel, dict_filter_test)

        return result_datatrain, result_labeltrain, result_cdatatrain, result_clabeltrain, result_datatest, result_labeltest
        


class GetEmbeddings:
    """
    Extracts model activations for a train and a test sentence file.

    The category and dataset names are parsed out of ``path_trdata`` and are
    used to locate the output directory under ``path_work``.
    """

    def __init__(self, path_trdata, path_tedata):
        self.path_trdata, self.path_tedata = path_trdata, path_tedata

        category_match = re.search(r'[a-zA-Z]+_[a-zA-Z]+(?=.txt)', path_trdata)
        self.category = category_match[0]
        dataset_match = re.search(r'(?<=_)[a-zA-Z]+_[a-zA-Z]+(?=\/)', path_trdata)
        self.dataset = dataset_match[0]

    def jsons(self, model):
        """Write ``activations_train.json`` and ``activations_te.json`` for ``model``.

        Args:
            model: Model identifier handed to
                ``transformers_extractor.extract_representations``.
        """
        out_dir = path_work + f'/large_data_{self.dataset}/data_{self.category}'
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # (input sentences, output activations) - train first, then test.
        jobs = (
            (self.path_trdata, out_dir + '/activations_train.json'),
            (self.path_tedata, out_dir + '/activations_te.json'),
        )

        for sentences_path, activations_path in jobs:
            print('Using device:', device)
            print()

            transformers_extractor.extract_representations(
                model,
                sentences_path,
                activations_path,
                aggregation="average",  # alternatives: last, first
                device=device,
            )

            # Drop the extractor's progress output from the cell.
            clear_output(wait=False)

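Documentation for `transformers_extractor.extract_representations` (used above to compute the activations): https://neurox.qcri.org/docs/neurox.data.extraction.html?highlight=extract_representations#neurox.data.extraction.transformers_extractor.extract_representations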

In [None]:
import pickle
from NeuroX.neurox.data import loader as data_loader
from NeuroX.neurox.interpretation import utils
from NeuroX.neurox.interpretation import ablation
from NeuroX.neurox.interpretation import linear_probe

In [None]:
def load_sentence_data(source_path, labels_path, activations):
    """Load whitespace-tokenised sentences and labels and check them against ``activations``.

    This is a slightly rewritten copy of the library function (the original
    comment states that the library version contains an error): every source
    line is split into its tokens.

    Args:
        source_path: UTF-8 text file with one sentence per line.
        labels_path: UTF-8 text file with the label(s) for each sentence, one
            line per sentence.
        activations: Sequence with one array per sentence; ``shape[0]`` of each
            array must equal the number of tokens in that sentence.

    Returns:
        ``{"source": [...], "target": [...]}`` where each entry is a list of
        token lists.

    Raises:
        AssertionError: If the number of lines or tokens does not match.
    """
    tokens = {"source": [], "target": []}

    # The files are written as UTF-8 by ConvertSample, so read them as UTF-8
    # instead of relying on the platform default encoding.
    with open(source_path, encoding="utf-8") as source_fp:
        for line in source_fp:
            tokens["source"].append(line.strip().split())

    with open(labels_path, encoding="utf-8") as labels_fp:
        for line in labels_fp:
            tokens["target"].append(line.strip().split())

    assert len(tokens["source"]) == len(tokens["target"]), (
        "Number of lines do not match (source: %d, target: %d)!"
        % (len(tokens["source"]), len(tokens["target"]))
    )

    assert len(activations) == len(tokens["source"]), (
        "Number of lines do not match (activations: %d, source: %d)!"
        % (len(activations), len(tokens["source"]))
    )

    for idx, activation in enumerate(activations):
        assert activation.shape[0] == len(tokens["source"][idx]), (
            "Number of tokens do not match in line %d (activations: %d, source: %d)!"
            % (idx, activation.shape[0], len(tokens["source"][idx]))
        )

    return tokens

In [None]:
import torch.nn as nn
from torch.autograd import Variable


def _numpyfy(x):
    """Return ``x`` itself if it is already an ndarray, otherwise ``np.array(x)``."""
    return x if isinstance(x, np.ndarray) else np.array(x)


def accuracy(preds, labels):
    """Return the mean of the element-wise comparison ``preds == labels``."""
    matches = _numpyfy(preds) == _numpyfy(labels)
    return matches.mean()


class LinearProbe(nn.Module):
    """Linear probe: a single ``nn.Linear`` layer from activations to outputs."""

    def __init__(self, input_size, num_classes):
        """Create the layer mapping ``input_size`` features to ``num_classes`` outputs."""
        super().__init__()
        self.linear = nn.Linear(in_features=input_size, out_features=num_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the raw outputs."""
        return self.linear(x)

def l1_penalty(var):
    """Return the sum of the absolute values of ``var`` (L1 norm)."""
    return var.abs().sum()


def l2_penalty(var):
    """Return the square root of the sum of squared entries of ``var`` (L2 norm)."""
    squared_sum = (var ** 2).sum()
    return squared_sum.sqrt()


def _train_probe(
    X_train,
    y_train,
    task_type,
    lambda_l1=0,
    lambda_l2=0,
    num_epochs=10,
    batch_size=32,
    learning_rate=0.001,
    ):
    """Train a ``LinearProbe`` with Adam and L1/L2 weight regularisation.

    ``set_seed(seed)`` is re-applied before the probe is created and before
    every training step so that runs are repeatable.

    Args:
        X_train: NumPy array of inputs; ``shape[0]`` is the number of training
            instances and ``shape[1]`` the number of input features.
        y_train: NumPy array of targets.
        task_type: ``"classification"`` (cross-entropy loss) or
            ``"regression"`` (MSE loss).
        lambda_l1: Weight of the L1 penalty on the probe's weight matrix.
        lambda_l2: Weight of the L2 penalty on the probe's weight matrix.
        num_epochs: Number of passes over the training data.
        batch_size: Batch size passed to ``utils.batch_generator``.
        learning_rate: Adam learning rate.

    Returns:
        The trained ``LinearProbe``.

    Raises:
        ValueError: On an unknown ``task_type``, a ``None`` regularisation
            weight, or a classification task with fewer than two classes.
    """
    # Validate the arguments before any model is built or moved to the GPU.
    if task_type not in ("classification", "regression"):
        raise ValueError("Invalid `task_type`")
    if lambda_l1 is None or lambda_l2 is None:
        raise ValueError("Regularization weights cannot be None")

    progressbar = utils.get_progress_bar()
    print("Training %s probe" % (task_type))
    # Check if we can use GPU's for training
    use_gpu = torch.cuda.is_available()

    print("Creating model...")
    if task_type == "classification":
        num_classes = len(set(y_train))
        if num_classes <= 1:
            raise ValueError(
                "Classification problem must have more than one target class"
            )
    else:
        num_classes = 1
    print("Number of training instances:", X_train.shape[0])
    if task_type == "classification":
        print("Number of classes:", num_classes)

    # Re-seed right before the weights are initialised.
    set_seed(seed)
    probe = LinearProbe(X_train.shape[1], num_classes)
    if use_gpu:
        probe = probe.cuda()

    if task_type == "classification":
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.MSELoss()

    set_seed(seed)
    optimizer = torch.optim.Adam(probe.parameters(), lr=learning_rate)

    X_tensor = torch.from_numpy(X_train)
    y_tensor = torch.from_numpy(y_train)

    for epoch in range(num_epochs):
        num_tokens = 0
        total_loss = 0.0
        for inputs, labels in progressbar(
            utils.batch_generator(X_tensor, y_tensor, batch_size=batch_size),
            desc="epoch [%d/%d]" % (epoch + 1, num_epochs),
        ):
            batch_len = inputs.shape[0]
            num_tokens += batch_len
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()
            inputs = inputs.float()

            # Forward + Backward + Optimize
            set_seed(seed)
            optimizer.zero_grad()

            outputs = probe(inputs)

            if task_type == "regression":
                outputs = outputs.squeeze()

            # First parameter of the probe is the weight matrix of its linear layer.
            weights = list(probe.parameters())[0]

            set_seed(seed)
            loss = (
                criterion(outputs, labels)
                + lambda_l1 * l1_penalty(weights)
                + lambda_l2 * l2_penalty(weights)
            )

            set_seed(seed)
            loss.backward()

            set_seed(seed)
            optimizer.step()

            # The criterion averages over the batch, so weight the batch loss by
            # its size; dividing by the instance count below then yields a true
            # per-instance average instead of a value shrunk by the batch size.
            total_loss += loss.item() * batch_len

        print(
            "Epoch: [%d/%d], Loss: %.4f"
            % (epoch + 1, num_epochs, total_loss / max(num_tokens, 1))
        )

    return probe


def train_logistic_regression_probe(
    X_train,
    y_train,
    lambda_l1=0,
    lambda_l2=0,
    num_epochs=10,
    batch_size=32,
    learning_rate=0.001,
    ):
    """Train a classification probe by delegating to ``_train_probe``.

    All arguments are forwarded unchanged; ``task_type`` is fixed to
    ``"classification"``. Returns whatever ``_train_probe`` returns.
    """
    hyperparameters = dict(
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2,
        num_epochs=num_epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
    )
    return _train_probe(X_train, y_train, task_type="classification", **hyperparameters)

def compute_score(preds, labels, metric):
    """Score ``preds`` against ``labels`` with the named metric.

    Args:
        preds: Predicted values.
        labels: Reference values.
        metric: Name of the metric; only ``"accuracy"`` is implemented.

    Returns:
        The score computed by ``accuracy``.

    Raises:
        ValueError: If ``metric`` is not supported (instead of silently
            returning ``None``, which only failed later when the score was
            formatted).
    """
    if metric == "accuracy":
        return accuracy(preds, labels)
    raise ValueError("Unsupported metric: %r (only 'accuracy' is implemented)" % (metric,))


def evaluate_probe(
    probe,
    X,
    y,
    idx_to_class=None,
    return_predictions=False,
    source_tokens=None,
    batch_size=32,
    metric="accuracy",
    ):
    """Evaluate a trained probe and return overall and per-class scores.

    Args:
        probe: Trained probe module; it is moved to the GPU when one is
            available and converted to float32.
        X: NumPy array of inputs.
        y: NumPy array of reference targets.
        idx_to_class: Optional mapping from class index to class name; when
            given, a score is also computed for every class.
        return_predictions: Whether to also return per-instance predictions.
        source_tokens: Optional list of token lists; when given, the tokens are
            used to identify the instances in the predictions, otherwise a
            running index is used.
        batch_size: Batch size passed to ``utils.batch_generator``.
        metric: Metric name passed to ``compute_score``.

    Returns:
        A dict of scores with the overall score under ``"__OVERALL__"``; if
        ``return_predictions`` is true, a tuple of that dict and a list of
        ``(source, predicted class, is_correct)`` tuples.
    """
    progressbar = utils.get_progress_bar()

    # Check if we can use GPU's for evaluation
    use_gpu = torch.cuda.is_available()

    if use_gpu:
        probe = probe.cuda()

    # always evaluate in full precision
    probe = probe.float()

    # Test the Model
    y_pred = []

    def source_generator():
        for s in source_tokens:
            for t in s:
                yield t

    src_words = source_generator()

    if return_predictions:
        predictions = []
        src_word = -1

    # Inference only: do not record an autograd graph for the forward passes.
    with torch.no_grad():
        for inputs, labels in progressbar(
            utils.batch_generator(
                torch.from_numpy(X), torch.from_numpy(y), batch_size=batch_size
            ),
            desc="Evaluating",
            ):
            if use_gpu:
                inputs = inputs.cuda()
                labels = labels.cuda()

            # always evaluate in full precision
            inputs = inputs.float()

            outputs = probe(inputs)

            if outputs.shape[1] == 1:
                # Regression
                predicted = outputs
            else:
                # Classification
                _, predicted = torch.max(outputs, 1)
            predicted = predicted.cpu().numpy()

            for i in range(0, len(predicted)):
                idx = predicted[i]
                if idx_to_class:
                    key = idx_to_class[idx]
                else:
                    key = idx

                y_pred.append(predicted[i])

                if return_predictions:
                    if source_tokens:
                        src_word = next(src_words)
                    else:
                        src_word = src_word + 1
                    predictions.append((src_word, key, labels[i].item() == idx))

    y_pred = np.array(y_pred)

    result = compute_score(y_pred, y, metric)

    print("Score (%s) of the probe: %0.2f" % (metric, result))

    class_scores = {}
    class_scores["__OVERALL__"] = result

    if idx_to_class:
        for i in idx_to_class:
            class_name = idx_to_class[i]
            class_instances_idx = np.where(y == i)[0]
            # Compare the *number* of instances with 0; the shape tuple itself
            # never equals 0, which left this guard dead and produced NaN
            # scores for classes without instances.
            if class_instances_idx.shape[0] == 0:
                class_scores[class_name] = 0
            else:
                class_scores[class_name] = compute_score(
                    y_pred[class_instances_idx], y[class_instances_idx], metric
                )

    if return_predictions:
        return class_scores, predictions
    return class_scores


In [None]:
class Experiment:
    """Probing experiment for one category: loads data and activations, trains probes, ablates neurons.

    The constructor reads ``activations_train.json`` and ``activations_te.json``
    from ``<path_work>large_data_<dataset>/data_<category>`` where ``dataset``
    and ``category`` are parsed from ``path_trdata``.

    Args:
        path_trdata: Train sentences file.
        path_trlabel: Train labels file.
        path_tedata: Test sentences file.
        path_telabel: Test labels file.
    """

    def __init__(self, path_trdata, path_trlabel, path_tedata, path_telabel):

        self.path_trdata = path_trdata
        self.path_trlabel = path_trlabel
        self.path_tedata = path_tedata
        self.path_telabel = path_telabel
        self.category = re.search(r'[a-zA-Z]+_[a-zA-Z]+(?=.txt)', path_trdata)[0]
        self.dataset = re.search(r'(?<=_)[a-zA-Z]+_[a-zA-Z]+(?=\/)', path_trdata)[0]

        self.path = path_work+f'large_data_{self.dataset}/data_{self.category}'

        # 768 is presumably the hidden size of the model used for extraction - confirm.
        self.activations_tr, self.num_layers = data_loader.load_activations(self.path+'/activations_train.json', 768)
        self.activations_te, self.num_layers = data_loader.load_activations(self.path+'/activations_te.json', 768)

        self.tokens_tr = load_sentence_data(self.path_trdata, self.path_trlabel, self.activations_tr)
        self.tokens_te = load_sentence_data(self.path_tedata, self.path_telabel, self.activations_te)

        self.X_tr, self.y_tr, mp = utils.create_tensors(self.tokens_tr, self.activations_tr, 'Nom')
        self.label2idx, self.idx2label, self.src2idx, self.idx2src = mp

        # Reuse the train mappings so that label indices agree between the splits.
        self.X_te, self.y_te, mapping = utils.create_tensors(self.tokens_te, self.activations_te, 'Nom', mappings = mp)

    def run_classification(self):
        """Plain probing: train on all neurons; return ``(probe, train scores, test scores)``."""
        probe = train_logistic_regression_probe(self.X_tr, self.y_tr, lambda_l1=0.003, lambda_l2=0.003, batch_size=64)
        scores_tr = evaluate_probe(probe, self.X_tr, self.y_tr, idx_to_class=self.idx2label, batch_size=64)
        scores_te = evaluate_probe(probe, self.X_te, self.y_te, idx_to_class=self.idx2label, batch_size=64)
        return probe, scores_tr, scores_te

    def nranking(self, probe):
        """Return the neuron ordering and cutoffs from ``linear_probe.get_neuron_ordering``."""
        ordering, cutoffs = linear_probe.get_neuron_ordering(probe, self.label2idx, search_stride=99)
        return ordering, cutoffs

    def top_n(self, probe, percentage=0.1):
        """Return the result of ``linear_probe.get_top_neurons`` (union array, per-class dict)."""
        return linear_probe.get_top_neurons(probe, percentage, self.label2idx)

    def threshold_n(self, probe, fraction=2):
        """Return the result of ``linear_probe.get_top_neurons_hard_threshold`` (union array, per-class dict)."""
        return linear_probe.get_top_neurons_hard_threshold(probe, fraction, self.label2idx)

    # NOTE(review): keep_bottom / keep_util call the library's train/evaluate
    # functions, while run_classification uses the versions defined in this
    # notebook (with set_seed and batch_size=64) - confirm this is intended.
    def keep_bottom(self, neurons):
        """Retrain and evaluate a probe with ``neurons`` removed; return ``(train scores, test scores)``."""
        X_tr_b = deepcopy(self.X_tr)
        X_te_b = deepcopy(self.X_te)
        X_tr_selected = ablation.filter_activations_remove_neurons(X_tr_b, neurons)
        probe_selected = linear_probe.train_logistic_regression_probe(X_tr_selected, self.y_tr, lambda_l1=0.003, lambda_l2=0.003)
        scores_tr = linear_probe.evaluate_probe(probe_selected, X_tr_selected, self.y_tr, idx_to_class=self.idx2label)
        X_te_selected = ablation.filter_activations_remove_neurons(X_te_b, neurons)
        scores_te = linear_probe.evaluate_probe(probe_selected, X_te_selected, self.y_te, idx_to_class=self.idx2label)
        return scores_tr, scores_te

    def keep_util(self, neurons, X_tr, X_te):
        """Retrain and evaluate a probe on ``neurons`` only; return ``(train scores, test scores)``."""
        X_tr_selected = ablation.filter_activations_keep_neurons(X_tr, neurons)
        probe_selected = linear_probe.train_logistic_regression_probe(X_tr_selected, self.y_tr, lambda_l1=0.003, lambda_l2=0.003)
        scores_tr = linear_probe.evaluate_probe(probe_selected, X_tr_selected, self.y_tr, idx_to_class=self.idx2label)
        X_te_selected = ablation.filter_activations_keep_neurons(X_te, neurons)
        scores_te = linear_probe.evaluate_probe(probe_selected, X_te_selected, self.y_te, idx_to_class=self.idx2label)
        return scores_tr, scores_te

    def return_weights(self, probe):
        """Return the probe's first parameter as a CPU tensor and its absolute values as a NumPy array."""
        weights1 = list(probe.parameters())[0].data.cpu()
        weights2 = np.abs(weights1.numpy())
        return weights1, weights2

    def keep_only(self, neurons, goal='top'):
        """Retrain and evaluate a probe that only sees ``neurons``.

        Args:
            neurons: Indices of the neurons to keep.
            goal: ``'top'`` or ``'threshold'``; both are processed identically,
                the value only names how ``neurons`` were selected.

        Returns:
            ``(train scores, test scores)`` from ``keep_util``.

        Raises:
            ValueError: If ``goal`` is neither ``'top'`` nor ``'threshold'``
                (previously ``None`` was returned silently).
        """
        if goal not in ('top', 'threshold'):
            raise ValueError(f"Unknown goal {goal!r}; expected 'top' or 'threshold'")
        # Work on copies so the stored activations can never be modified.
        return self.keep_util(neurons, deepcopy(self.X_tr), deepcopy(self.X_te))

    def data_size(self):
        """Return ``(number of train instances, number of test instances, number of distinct test labels)``."""
        return self.X_tr.shape[0], self.X_te.shape[0], len(set(self.y_te))

In [None]:
# Print the full path of every file found below data_path.
for root, _subdirs, names in os.walk(data_path):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Result accumulators, keyed by category name. The "_c" variants hold the
# control-task results (same sentences, labels shuffled between them).
ordered_neurons = {}
threshold = {}
threshold_c = {}
weights = {}

top_n = {}
top_n_c = {}

bottom_n = {}

scores = {}
scores_control = {}

size = {}

scores_keep_top = {}
scores_keep_top_c = {}

scores_keep_thres = {}
scores_keep_thres_c = {}

scores_keep_bot = {}


def _dump_results(dataset_name, named_results):
    """Pickle every ``{prefix: object}`` pair to ``<prefix>_<dataset_name>.pkl`` in the cwd."""
    for prefix, obj in named_results.items():
        with open(f'{prefix}_{dataset_name}.pkl', 'wb') as f:
            pickle.dump(obj, f)


for dirname, _, filenames in os.walk(data_path):
    # Sorted so the processing order does not depend on the filesystem.
    for filename in sorted(filenames):
        if not filename.endswith('.csv'):
            # ConvertSample derives the category from a "<name>.csv" file name.
            continue
        file = os.path.join(dirname, filename)

        splitter = ConvertSample(file)

        # Write the train, control-task and test splits.
        path_trdata, path_trlabel, path_ctrdata, path_ctrlabel, path_tedata, path_telabel = splitter.writer()
        # Extract the activations for the train and test sentences.
        data = GetEmbeddings(path_trdata, path_tedata)
        data.jsons('bert-base-multilingual-uncased')

        # ---- real task ----
        cat = Experiment(path_trdata, path_trlabel, path_tedata, path_telabel)
        # Experiment already parsed both names from path_trdata.
        cat_name = cat.category
        d_name = cat.dataset
        X_tr_shape, X_te_shape, n_class = cat.data_size()
        size[cat_name] = [X_tr_shape, X_te_shape, n_class]

        probe, scores_tr, scores_te = cat.run_classification()  # plain classification
        scores[cat_name] = [scores_tr, scores_te]
        weights1, weights2 = cat.return_weights(probe)
        weights[cat_name] = [weights1, weights2]

        ordering, cutoffs = cat.nranking(probe)  # neuron ranking
        ordered_neurons[cat_name] = [ordering, cutoffs]

        top_n[cat_name] = cat.top_n(probe)[0]
        scores_tr, scores_te = cat.keep_only(neurons=top_n[cat_name], goal='top')
        scores_keep_top[cat_name] = [scores_tr, scores_te]  # top-percentage neurons only

        # Select with percentage=0.9 and *remove* the selection, keeping the rest.
        bottom_n[cat_name] = cat.top_n(probe, percentage=0.9)[0]
        scores_tr, scores_te = cat.keep_bottom(neurons=bottom_n[cat_name])
        scores_keep_bot[cat_name] = [scores_tr, scores_te]  # bottom neurons only

        threshold[cat_name] = cat.threshold_n(probe)[0]
        scores_tr, scores_te = cat.keep_only(threshold[cat_name], goal='threshold')
        scores_keep_thres[cat_name] = [scores_tr, scores_te]  # hard-threshold neurons only

        _dump_results(d_name, {
            'scores': scores,
            'neurons': ordered_neurons,
            'weights': weights,
            'scores_keep_top': scores_keep_top,
            'scores_keep_thres': scores_keep_thres,
            'scores_keep_bot': scores_keep_bot,
            'top_n': top_n,
            'bottom_n': bottom_n,
            'threshold': threshold,
            'size': size,
        })

        # ---- control task (same sentences, shuffled labels) ----
        cat = Experiment(path_ctrdata, path_ctrlabel, path_tedata, path_telabel)

        probe, scores_tr, scores_te = cat.run_classification()
        scores_control[cat_name] = [scores_tr, scores_te]

        top_n_c[cat_name] = cat.top_n(probe)[0]
        scores_tr, scores_te = cat.keep_only(neurons=top_n_c[cat_name], goal='top')
        scores_keep_top_c[cat_name] = [scores_tr, scores_te]  # top-percentage neurons only

        threshold_c[cat_name] = cat.threshold_n(probe)[0]
        scores_tr, scores_te = cat.keep_only(threshold_c[cat_name], goal='threshold')
        scores_keep_thres_c[cat_name] = [scores_tr, scores_te]  # hard-threshold neurons only

        _dump_results(d_name, {
            'scores_c': scores_control,
            'scores_keep_top_c': scores_keep_top_c,
            'scores_keep_thres_c': scores_keep_thres_c,
        })

        # Free disk space: delete this category's split and activation files.
        dir_to_delete = os.path.join(path_work, f'large_data_{d_name}', f'data_{cat_name}')
        with os.scandir(dir_to_delete) as entries:
            files_to_delete = [entry.path for entry in entries if entry.is_file()]
        for file_to_delete in files_to_delete:
            print(file_to_delete)
            os.remove(file_to_delete)

In [None]:
scores