In [1]:
import os
import re
from pathlib import Path
import pickle
from IPython.display import clear_output
import neurox.data.loader as data_loader
import neurox.interpretation.utils as utils
import neurox.interpretation.ablation as ablation
import neurox.interpretation.linear_probe as linear_probe
import neurox.data.extraction.transformers_extractor as transformers_extractor

In [2]:
path_project = Path(os.getcwd()).parents[1]
path_project = str(path_project)

In [3]:
path_project

'/home/senya/Документы/project'

In [4]:
directories = [] #тут делаем обход, чтобы получить имена каталогов
for root,dirs,files in os.walk(path_project+'/large_data_en_ewt'):
    for directory in dirs:
        directories.append(os.path.join(root, directory))

In [5]:
all_paths = []
for directory in directories: #тут делаем обход каталагов, чтобы получить имена файлов
    paths = []
    for (root,dirs,files) in os.walk(directory, topdown=True):
        for file in files:
            paths.append(os.path.join(root, file))
        all_paths.append(sorted(paths))

In [6]:
def load_sentence_data(source_path, labels_path, activations): 
    
    #тут немного переписали функцию потому что в библиотеке ошибка!!!
    """Loads sentence-annotated text-label pairs. This function loads the source
    text, target labels, and activations and tries to make them perfectly
    parallel, i.e. number of tokens in line N of source would
    match the number of activations at index N. The method will delete
    non-matching activation/source pairs. The activations will be modified
    in place.

    Parameters
    ----------
    source_path : str
        Path to the source text file, one sentence per line
    labels_path : str
        Path to the annotated labels file, one sentence per line corresponding to
        the sentences in the ``source_path`` file.
    activations : list of numpy.ndarray
        Activations returned from ``loader.load_activations``

    Returns
    -------
    tokens : dict
        Dictionary containing two lists, ``source`` and ``target``. ``source``
        contains all of the sentences from ``source_path`` that were not ignored.
        ``target`` contains the parallel set of annotated labels.

    """
    tokens = {"source": [], "target": []}

    with open(source_path) as source_fp:
        for line_idx, line in enumerate(source_fp):
            line_tokens = line.strip().split() #вот тут переписано
            tokens["source"].append(line_tokens) #и тут

    with open(labels_path) as labels_fp:
        for line in labels_fp:
            line_tokens = line.strip().split()
            tokens["target"].append(line_tokens)

    assert len(tokens["source"]) == len(tokens["target"]), (
        "Number of lines do not match (source: %d, target: %d)!"
        % (len(tokens["source"]), len(tokens["target"]))
    )

    assert len(activations) == len(tokens["source"]), (
        "Number of lines do not match (activations: %d, source: %d)!"
        % (len(activations), len(tokens["source"]))
    )

    # Check if all data is well formed (whether we have activations + labels for
    # each and every word)
    for idx, activation in enumerate(activations):
        assert activation.shape[0] == len(tokens["source"][idx])

    return tokens

In [7]:
class Experiment:
    
    
    def __init__(self, path_trdata, path_trlabel, path_tedata, path_telabel):
        
        #некрасиво инициализирована куча переменных для функций
        
        self.path_trdata = path_trdata
        self.path_trlabel = path_trlabel
        self.path_tedata = path_tedata
        self.path_telabel = path_telabel
        self.category = re.search(r'(?<=_)[a-zA-Z]+(?=.txt)', path_trdata)[0]
        self.dataset = re.search(r'(?<=_)[a-zA-Z]+_[a-zA-Z]+(?=\/)', path_trdata)[0]
        
        self.path = path_project+f'/large_data_{self.dataset}/data_{self.category}'
        
        self.activations_tr, self.num_layers = data_loader.load_activations(self.path+'/activations_train.json', 768)
        self.activations_te, self.num_layers = data_loader.load_activations(self.path+'/activations_te.json', 768)
        
        self.tokens_tr = load_sentence_data(self.path_trdata, self.path_trlabel, self.activations_tr)
        self.tokens_te = load_sentence_data(self.path_tedata, self.path_telabel, self.activations_te)
        
        self.X_tr, self.y_tr, self.mapping_tr = utils.create_tensors(self.tokens_tr, self.activations_tr, 'Nom')
        self.label2idx_tr, self.idx2label_tr, self.src2idx_tr, self.idx2src_tr = self.mapping_tr
        
        self.X_te, self.y_te, self.mapping_te = utils.create_tensors(self.tokens_te, self.activations_te, 'Nom')
        self.label2idx_te, self.idx2label_te, self.src2idx_te, self.idx2src_te = self.mapping_te
        
        
    def run_classification(self):#just пробинг
        probe = linear_probe.train_logistic_regression_probe(self.X_tr, self.y_tr, lambda_l1=0.001, lambda_l2=0.001)
        scores_tr = linear_probe.evaluate_probe(probe, self.X_tr, self.y_tr, idx_to_class=self.idx2label_tr)
        scores_te = linear_probe.evaluate_probe(probe, self.X_te, self.y_te, idx_to_class=self.idx2label_te)
        return probe, scores_tr, scores_te
    
    
    def lairwise(self, n): #пробинг по слоям! можно выбрать
        layer_0_X_tr = ablation.filter_activations_by_layers(self.X_tr, [n], 13)
        probe_layer_0 = linear_probe.train_logistic_regression_probe(layer_0_X_tr, self.y_tr, lambda_l1=0.001, lambda_l2=0.001)
        scores_tr = linear_probe.evaluate_probe(probe_layer_0, layer_0_X_tr, self.y_tr, idx_to_class=self.idx2label_tr)
        
        layer_0_X_te = ablation.filter_activations_by_layers(self.X_te, [n], 13)
        scores_te = linear_probe.evaluate_probe(probe_layer_0, layer_0_X_te, self.y_te, idx_to_class=self.idx2label_te)
        return scores_tr, scores_te
    
    
    def nranking(self, n=100, k = 'ordering'): #тут топ нейроны
        probe, scores_tr, scores_te = self.run_classification()
        ordering, cutoffs = linear_probe.get_neuron_ordering(probe, self.label2idx_tr)
        if k == 'ordering':
            return ordering, scores_tr, scores_te
        elif k == 'train':
            X_tr_selected = ablation.filter_activations_keep_neurons(self.X_tr, ordering[:n])
            probe_selected = linear_probe.train_logistic_regression_probe(X_tr_selected, self.y_tr, lambda_l1=0.001, lambda_l2=0.001)
            scores_tr = linear_probe.evaluate_probe(probe_selected, X_tr_selected, self.y_tr, idx_to_class=self.idx2label_tr)
            X_te_selected = ablation.filter_activations_keep_neurons(self.X_te, ordering[:n])
            scores_te = linear_probe.evaluate_probe(probe_selected, X_te_selected, self.y_te, idx_to_class=self.idx2label_te)
            return ordering, scores_tr, scores_te

In [8]:
all_paths.pop(6) # с PronType просто падает какая то ошибка в данных, поэтому удаляю эту категорию

['/home/senya/Документы/project/large_data_en_ewt/data_PronType/activations_te.json',
 '/home/senya/Документы/project/large_data_en_ewt/data_PronType/activations_train.json',
 '/home/senya/Документы/project/large_data_en_ewt/data_PronType/datatest_PronType.txt',
 '/home/senya/Документы/project/large_data_en_ewt/data_PronType/datatrain_PronType.txt',
 '/home/senya/Документы/project/large_data_en_ewt/data_PronType/labeltest_PronType.txt',
 '/home/senya/Документы/project/large_data_en_ewt/data_PronType/labeltrain_PronType.txt']

In [9]:
# datatest[2], labeltest[4]
# datatrain[3], labeltrain[5]
ordered_neurons = {}
scores = {}
for path in all_paths:
    cat = Experiment(path[3], path[5], path[2], path[4])    
    cat_name = re.search(r'(?<=_)[a-zA-Z]+(?=.txt)', path[3])[0]
    neurons, scores_tr, scores_te = cat.nranking()
    ordered_neurons[cat_name] = neurons
    scores[cat_name] = [scores_tr, scores_te] 

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Number/activations_train.json...
2600 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Number/activations_te.json...
900 13.0
Number of tokens:  2600
length of source dictionary:  8479
length of target dictionary:  2
2600
Total instances: 2600
['2', 'Hot', 'Admin', '49', '"Set', 'What', 'yep', '"Don', 'they', 'Stay', 'During', 'Disatisfied', 'Louisiana', 'Is', '14', '--', 'Like', 'A', 'Than', 'Budgies']
Number of samples:  2600
Stats: Labels with their frequencies in the final set
Sing 1300
Plur 1300
Number of tokens:  900
length of source dictionary:  3757
length of target dictionary:  2
900
Total instances: 900
['What', 'Mercedes', 'they', 'During', 'Is', 'Absoul', 'Like', '"Twinkle', 'A', '"However', 'sincere', 'My', 'many', 'They', 'weather', 'Analyst', 'Are', 'Read', 'service', 'While']
Number of samples:  900
Stats: Labels with their frequencies in the final s

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0186


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0138


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0122


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0114


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0109


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0106


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0105


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0105


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0106


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0108


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.87


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.78


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Tense/activations_train.json...
2600 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Tense/activations_te.json...
900 13.0
Number of tokens:  2600
length of source dictionary:  8418
length of target dictionary:  2
2600
Total instances: 2600
['2', 'Hot', 'Admin', '49', 'looks', 'What', 'yep', '1562', 'they', 'following', 'Use', 'During', 'Is', 'Louisiana', '--', 'Like', 'Owner', 'Tina', 'A', 'Than']
Number of samples:  2600
Stats: Labels with their frequencies in the final set
Pres 1300
Past 1300
Number of tokens:  900
length of source dictionary:  3777
length of target dictionary:  2
900
Total instances: 900
['Technically', 'What', 'Mercedes', 'they', 'During', 'Is', 'Like', 'A', 'Hamas', 'Daniel', 'Starting', 'Wendi', 'sincere', 'My', 'many', 'They', 'Sand', 'Are', 'Hancocks', 'Mahmoud']
Number of samples:  900
Stats: Labels with their frequencies in the final set

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0199


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0137


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0120


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0113


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0108


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0106


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0104


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0102


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0101


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0100


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.95


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.86


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_VerbForm/activations_train.json...
2188 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_VerbForm/activations_te.json...
779 13.0
Number of tokens:  2188
length of source dictionary:  7524
length of target dictionary:  4
2188
Total instances: 2188
['Outside', '2', 'What', 'SHE', 'Microsoft', 'Stay', 'they', 'Giving', 'following', 'Use', 'During', 'Louisiana', 'Beth', 'Is', 'Like', 'A', 'Irony', 'Than', 'Sometimes', 'NEXT']
Number of samples:  2188
Stats: Labels with their frequencies in the final set
Fin 650
Part 650
Inf 650
Ger 238
Number of tokens:  779
length of source dictionary:  3262
length of target dictionary:  4
779
Total instances: 779
['2', 'What', 'Definately', 'Microsoft', 'they', 'During', 'Wei', 'Is', '"Islamic', 'Absoul', 'Like', 'A', 'Hamas', 'Starting', 'Wendi', 'My', 'They', '"well', 'Are', 'moving']
Number of samples:  779
Stats: Labels with thei

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0437


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0305


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0274


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0263


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0259


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0256


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0243


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0228


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0220


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0214


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.88


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.25


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_NumForm/activations_train.json...
1467 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_NumForm/activations_te.json...
435 13.0
Number of tokens:  1467
length of source dictionary:  6210
length of target dictionary:  3
1467
Total instances: 1467
['2', 'Outside', 'Frank', 'What', 'yep', 'During', 'Disatisfied', 'Beth', 'Todd', '14', '--', 'A', 'Joe_Lardy', 'Sometimes', 'NEXT', '40', 'Sir', 'Investigators', '1561', '312']
Number of samples:  1467
Stats: Labels with their frequencies in the final set
Word 593
Roman 7
Digit 867
Number of tokens:  435
length of source dictionary:  2544
length of target dictionary:  3
435
Total instances: 435
['ps', 'i', 'Lunar', 'so', 'ive', 'THOMAS', '08', '"You', 'What', 'Delhi', 'Microsoft', 'Carve', 'why', 'Slope', 'Armed', 'Sheridan', 'Wei', 'Intercept', 'See', '09']
Number of samples:  435
Stats: Labels with their frequencies in th

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0263


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0164


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0134


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0120


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0114


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0111


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0110


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0109


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0109


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0106


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.95


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.78


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Person/activations_train.json...
2601 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Person/activations_te.json...
862 13.0
Number of tokens:  2601
length of source dictionary:  7763
length of target dictionary:  3
2601
Total instances: 2601
['2', 'Admin', '"Set', 'What', 'Microsoft', 'figure', '1562', 'following', 'they', 'Louisiana', 'Beth', 'Is', '--', 'Like', 'A', 'Sometimes', 'NEXT', '**', '1561', 'Offer']
Number of samples:  2601
Stats: Labels with their frequencies in the final set
3 867
1 867
2 867
Number of tokens:  862
length of source dictionary:  3389
length of target dictionary:  3
862
Total instances: 862
['2', 'Lied', 'Superior', 'What', 'Delhi', 'they', 'Use', 'Ash', 'Wei', 'Is', 'Absoul', 'Like', '"Twinkle', 'A', 'Hamas', 'Starting', '"However', 'sincere', '**', 'My']
Number of samples:  862
Stats: Labels with their frequencies in the final set
3 

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0211


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0131


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0114


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0106


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0102


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0100


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0099


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0098


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0098


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0097


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.96


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.86


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_ExtPos/activations_train.json...
143 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_ExtPos/activations_te.json...
37 13.0
Number of tokens:  143
length of source dictionary:  1308
length of target dictionary:  5
143
Total instances: 143
['Wire', 'What', '"grey', 'they', 'Wii', 'Well', 'Once', 'Please', 'To', 'My', 'They', 'There', 'of', 'His', 'She', 'It', '"According', 'Employees', 'Since', '"Came']
Number of samples:  143
Stats: Labels with their frequencies in the final set
ADV 73
CCONJ 13
PRON 11
SCONJ 4
ADP 42
Number of tokens:  37
length of source dictionary:  429
length of target dictionary:  5
37
Total instances: 37
['The', 'I', 'As', 'Cities', 'If', 'go', 'Its', 'Airfare', 'But', 'Well', 'Argentinian', 'All', 'Now', 'They', 'Therefore', 'There', 'You', 'This', '"""', 'According']
Number of samples:  37
Stats: Labels with their frequencies in the final set

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0536


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0276


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0213


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0154


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0128


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0119


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0114


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0108


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0104


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0100


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 1.00


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.59


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Mood/activations_train.json...
1571 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Mood/activations_te.json...
481 13.0
Number of tokens:  1571
length of source dictionary:  5559
length of target dictionary:  3
1571
Total instances: 1571
['2', '"Set', 'What', 'Microsoft', 'figure', 'Stay', 'they', '"Don', 'Use', 'During', 'Louisiana', 'Is', '--', 'A', 'Offer', 'My', 'Treat', 'They', 'close', 'Ignore']
Number of samples:  1571
Stats: Labels with their frequencies in the final set
Ind 867
Sub 5
Imp 699
Number of tokens:  481
length of source dictionary:  2332
length of target dictionary:  3
481
Total instances: 481
['buy', 'i', 'American', 'so', 'Talk', 'don', 'Ginny', 'Ben', 'What', 'Make', 'give', 'Carve', 'why', 'they', 'Lets', 'Use', 'Find', 'Work', 'Wei', 'Is']
Number of samples:  481
Stats: Labels with their frequencies in the final set
Ind 300
Sub 2
Imp 179
T

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0140


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0078


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0067


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0062


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0058


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0054


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0051


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0049


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0048


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0048


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.99


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.95


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Gender/activations_train.json...
1936 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Gender/activations_te.json...
482 13.0
Number of tokens:  1936
length of source dictionary:  6506
length of target dictionary:  3
1936
Total instances: 1936
['Outside', '49', 'looks', 'What', 'SHE', '1562', 'they', '"Her', 'During', 'Is', 'Like', 'Owner', 'Tina', 'A', 'Budgies', 'Investigators', '1561', 'My', 'They', 'close']
Number of samples:  1936
Stats: Labels with their frequencies in the final set
Fem 278
Masc 791
Neut 867
Number of tokens:  482
length of source dictionary:  2420
length of target dictionary:  3
482
Total instances: 482
['i', 'North', 'Lied', 'Technically', 'Superior', 'What', 'Make', 'why', 'Find', 'During', 'Is', 'Truly', 'could', 'A', 'Well', 'Daniel', 'no', 'Please', 'Wendi', 'Luckily']
Number of samples:  482
Stats: Labels with their frequencies in the f

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0281


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0185


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0146


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0131


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0125


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0122


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0120


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0119


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0120


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0121


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.96


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.62


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Case/activations_train.json...
2291 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Case/activations_te.json...
683 13.0
Number of tokens:  2291
length of source dictionary:  7587
length of target dictionary:  3
2291
Total instances: 2291
['Fresh', '2', 'Torrey', 'Hot', '49', '"Set', 'What', 'SHE', 'Microsoft', '1562', 'they', '"Her', 'Use', 'following', 'During', 'Is', 'Beth', 'Louisiana', '14', 'Like']
Number of samples:  2291
Stats: Labels with their frequencies in the final set
Gen 867
Acc 557
Nom 867
Number of tokens:  683
length of source dictionary:  3002
length of target dictionary:  3
683
Total instances: 683
['buy', 'i', 'ps', 'North', 'Lied', 'Talk', 'ive', 'Superior', 'Abbas', '"You', 'What', 'Make', 'Carve', '"Authorised', 'why', 'they', 'Lets', 'give', 'Find', 'Destiny']
Number of samples:  683
Stats: Labels with their frequencies in the final set
Gen

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0318


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0217


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0192


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0180


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0170


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0164


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0161


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0159


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0159


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0160


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.87


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.74


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_NumType/activations_train.json...
913 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_NumType/activations_te.json...
281 13.0
Number of tokens:  913
length of source dictionary:  4538
length of target dictionary:  4
913
Total instances: 913
['Outside', '2', 'What', 'yep', 'Stay', 'Thursday', 'A', 'Joe_Lardy', 'NEXT', '40', '1561', 'Jean', 'My', 'They', '42', '"30', 'FYI', 'seems', 'Messages', '"According']
Number of samples:  913
Stats: Labels with their frequencies in the final set
Mult 41
Ord 203
Card 650
Frac 19
Number of tokens:  281
length of source dictionary:  1853
length of target dictionary:  4
281
Total instances: 281
['i', 'ps', 'Technically', '08', 'don', '"You', 'What', 'why', 'Armed', 'Wei', 'Is', 'Intercept', 'by', 'could', 'A', 'Once', 'HAS', 'Please', 'Valero', 'My']
Number of samples:  281
Stats: Labels with their frequencies in the final set
Mult

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0373


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0217


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0161


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0142


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0129


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0123


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0118


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0115


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0113


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0112


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.96


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.76


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Definite/activations_train.json...
2600 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Definite/activations_te.json...
900 13.0
Number of tokens:  2600
length of source dictionary:  8671
length of target dictionary:  2
2600
Total instances: 2600
['2', 'Gives', 'Outside', 'Hot', '"Set', 'What', 'they', 'Giving', 'Use', 'During', 'Is', '14', '--', '"Bilbray', 'Like', 'Travelers', 'Tina', 'A', 'Sometimes', 'Birth']
Number of samples:  2600
Stats: Labels with their frequencies in the final set
Def 1300
Ind 1300
Number of tokens:  900
length of source dictionary:  4070
length of target dictionary:  2
900
Total instances: 900
['2', 'Technically', 'THOMAS', 'What', 'Delhi', 'they', 'Ash', 'Wei', 'Is', 'Like', '"Twinkle', 'A', 'Starting', 'Wendi', '"However', 'many', 'My', 'They', 'Catriona', 'Mahmoud']
Number of samples:  900
Stats: Labels with their frequencies in the f

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0246


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0182


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0165


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0155


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0148


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0143


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0139


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0136


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0134


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0132


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.87


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.72


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Degree/activations_train.json...
1471 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_ewt/data_Degree/activations_te.json...
483 13.0
Number of tokens:  1471
length of source dictionary:  5925
length of target dictionary:  3
1471
Total instances: 1471
['2', 'Admin', '"Set', 'What', 'they', 'During', 'Is', 'Beth', 'Todd', '14', 'Owner', 'A', 'Irony', 'Sometimes', '"Does', 'Budgies', '1561', 'My', 'They', '130']
Number of samples:  1471
Stats: Labels with their frequencies in the final set
Pos 867
Cmp 348
Sup 256
Number of tokens:  483
length of source dictionary:  2509
length of target dictionary:  3
483
Total instances: 483
['buy', 'i', '2', 'Lunar', 'so', 'Hot', 'Technically', 'don', 'Superior', '"You', 'What', 'U', 'out', 'Falluja', 'they', 'Lets', 'Monotheism', 'Natasha', 'Susanna', 'Is']
Number of samples:  483
Stats: Labels with their frequencies in the final set
Pos 3

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0391


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0262


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0218


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0202


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0191


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0185


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0181


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0179


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0179


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0181


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.84


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.60


  0%|          | 0/101 [00:00<?, ?it/s]

In [10]:
with open('scores.pkl', 'wb') as f:
    pickle.dump(scores, f)
with open('neurons.pkl', 'wb') as f:
    pickle.dump(ordered_neurons, f)

In [11]:
with open('scores.pkl', 'rb') as f:
    scores = pickle.load(f)
with open('neurons.pkl', 'rb') as f:
    ordered_neurons = pickle.load(f)

In [12]:
scores

{'Number': [{'__OVERALL__': 0.8657692307692307,
   'Sing': 0.7361538461538462,
   'Plur': 0.9953846153846154},
  {'__OVERALL__': 0.7811111111111111,
   'Sing': 0.6044444444444445,
   'Plur': 0.9577777777777777}],
 'Tense': [{'__OVERALL__': 0.9515384615384616,
   'Pres': 0.9253846153846154,
   'Past': 0.9776923076923076},
  {'__OVERALL__': 0.8588888888888889,
   'Pres': 0.8177777777777778,
   'Past': 0.9}],
 'VerbForm': [{'__OVERALL__': 0.8807129798903108,
   'Fin': 0.7861538461538462,
   'Part': 0.9523076923076923,
   'Inf': 0.963076923076923,
   'Ger': 0.7184873949579832},
  {'__OVERALL__': 0.2490372272143774,
   'Fin': 0.49333333333333335,
   'Ger': 0.5,
   'Part': 0.1288888888888889,
   'Inf': 0.008888888888888889}],
 'NumForm': [{'__OVERALL__': 0.9488752556237219,
   'Word': 0.984822934232715,
   'Roman': 0.5714285714285714,
   'Digit': 0.9273356401384083},
  {'__OVERALL__': 0.7839080459770115,
   'Word': 0.7251908396946565,
   'Roman': 0.0,
   'Digit': 0.82}],
 'Person': [{'__OVER

In [13]:
for k, v in ordered_neurons.items():
    print(k, v[:100])

Number [9707, 7618, 4775, 7976, 8939, 6744, 7632, 7889, 5778, 9109, 7512, 8035, 4645, 5543, 4007, 7208, 2536, 9867, 6028, 5864, 8878, 8788, 4853, 5976, 9082, 9154, 5413, 6181, 5096, 8110, 7408, 4113, 5010, 4628, 9556, 8314, 7739, 3612, 7998, 9761, 8099, 6148, 7267, 5093, 6311, 5640, 5034, 7435, 7838, 8335, 6544, 8657, 9048, 5497, 7677, 7326, 6850, 6275, 7043, 8581, 8231, 9099, 6831, 7807, 2292, 8439, 5304, 5593, 3514, 8280, 2652, 2206, 9151, 8282, 6854, 7526, 9093, 4907, 6381, 3575, 4282, 3964, 8094, 4127, 6178, 6340, 6150, 4614, 3837, 5802, 8971, 6796, 5935, 5009, 2077, 7412, 8407, 6072, 4346, 6268]
Tense [970, 1187, 7302, 9678, 8175, 6175, 8633, 9692, 7325, 8479, 6500, 6155, 9101, 8493, 5583, 4815, 4047, 8686, 1017, 7381, 9624, 7865, 6557, 9247, 7747, 6212, 4517, 6534, 8871, 8841, 8777, 8044, 9037, 7026, 9395, 800, 7109, 8070, 9034, 9291, 6378, 8943, 6320, 9201, 8626, 9848, 4670, 7711, 7841, 7906, 5541, 6470, 6471, 6602, 6795, 8266, 4842, 5969, 2675, 6548, 7027, 6710, 8856, 7357, 963