In [1]:
import os
import re
from pathlib import Path
import pickle
from IPython.display import clear_output
import neurox.data.loader as data_loader
import neurox.interpretation.utils as utils
import neurox.interpretation.ablation as ablation
import neurox.interpretation.linear_probe as linear_probe
import neurox.data.extraction.transformers_extractor as transformers_extractor

In [2]:
path_project = Path(os.getcwd()).parents[0]
path_project = str(path_project)

In [3]:
data_path = path_project+'/large_data_en_gum'

In [4]:
directories = [] #тут делаем обход, чтобы получить имена каталогов
for root,dirs,files in os.walk(data_path):
    for directory in dirs:
        directories.append(os.path.join(root, directory))

In [5]:
all_paths = []
for directory in directories: #тут делаем обход каталагов, чтобы получить имена файлов
    paths = []
    for (root,dirs,files) in os.walk(directory, topdown=True):
        for file in files:
            paths.append(os.path.join(root, file))
        all_paths.append(sorted(paths))

In [6]:
def load_sentence_data(source_path, labels_path, activations): 
    
    #тут немного переписали функцию потому что в библиотеке ошибка!!!

    tokens = {"source": [], "target": []}

    with open(source_path) as source_fp:
        for line_idx, line in enumerate(source_fp):
            line_tokens = line.strip().split() #вот тут переписано
            tokens["source"].append(line_tokens) #и тут

    with open(labels_path) as labels_fp:
        for line in labels_fp:
            line_tokens = line.strip().split()
            tokens["target"].append(line_tokens)

    assert len(tokens["source"]) == len(tokens["target"]), (
        "Number of lines do not match (source: %d, target: %d)!"
        % (len(tokens["source"]), len(tokens["target"]))
    )

    assert len(activations) == len(tokens["source"]), (
        "Number of lines do not match (activations: %d, source: %d)!"
        % (len(activations), len(tokens["source"]))
    )

    
    for idx, activation in enumerate(activations):
        assert activation.shape[0] == len(tokens["source"][idx])

    return tokens

In [7]:
class Experiment:
    
    
    def __init__(self, path_trdata, path_trlabel, path_tedata, path_telabel):
        
        #некрасиво инициализирована куча переменных для функций
        
        self.path_trdata = path_trdata
        self.path_trlabel = path_trlabel
        self.path_tedata = path_tedata
        self.path_telabel = path_telabel
        self.category = re.search(r'(?<=_)[a-zA-Z]+(?=.txt)', path_trdata)[0]
        self.dataset = re.search(r'(?<=_)[a-zA-Z]+_[a-zA-Z]+(?=\/)', path_trdata)[0]
        
        self.path = path_project+f'/large_data_{self.dataset}/data_{self.category}'
        
        self.activations_tr, self.num_layers = data_loader.load_activations(self.path+'/activations_train.json', 768)
        self.activations_te, self.num_layers = data_loader.load_activations(self.path+'/activations_te.json', 768)
        
        self.tokens_tr = load_sentence_data(self.path_trdata, self.path_trlabel, self.activations_tr)
        self.tokens_te = load_sentence_data(self.path_tedata, self.path_telabel, self.activations_te)
        
        self.X_tr, self.y_tr, self.mapping_tr = utils.create_tensors(self.tokens_tr, self.activations_tr, 'Nom')
        self.label2idx_tr, self.idx2label_tr, self.src2idx_tr, self.idx2src_tr = self.mapping_tr
        
        self.X_te, self.y_te, self.mapping_te = utils.create_tensors(self.tokens_te, self.activations_te, 'Nom')
        self.label2idx_te, self.idx2label_te, self.src2idx_te, self.idx2src_te = self.mapping_te
        
        
    def run_classification(self):#just пробинг
        probe = linear_probe.train_logistic_regression_probe(self.X_tr, self.y_tr, lambda_l1=0.001, lambda_l2=0.001)
        scores_tr = linear_probe.evaluate_probe(probe, self.X_tr, self.y_tr, idx_to_class=self.idx2label_tr)
        scores_te = linear_probe.evaluate_probe(probe, self.X_te, self.y_te, idx_to_class=self.idx2label_te)
        return probe, scores_tr, scores_te
    
    
    def lairwise(self, n): #пробинг по слоям! можно выбрать
        layer_0_X_tr = ablation.filter_activations_by_layers(self.X_tr, [n], 13)
        probe_layer_0 = linear_probe.train_logistic_regression_probe(layer_0_X_tr, self.y_tr, lambda_l1=0.001, lambda_l2=0.001)
        scores_tr = linear_probe.evaluate_probe(probe_layer_0, layer_0_X_tr, self.y_tr, idx_to_class=self.idx2label_tr)
        
        layer_0_X_te = ablation.filter_activations_by_layers(self.X_te, [n], 13)
        scores_te = linear_probe.evaluate_probe(probe_layer_0, layer_0_X_te, self.y_te, idx_to_class=self.idx2label_te)
        return scores_tr, scores_te
    
    
    def nranking(self, n=100, k = 'ordering'): #тут топ нейроны
        probe, scores_tr, scores_te = self.run_classification()
        ordering, cutoffs = linear_probe.get_neuron_ordering(probe, self.label2idx_tr)
        if k == 'ordering':
            return ordering, scores_tr, scores_te
        elif k == 'train':
            X_tr_selected = ablation.filter_activations_keep_neurons(self.X_tr, ordering[:n])
            probe_selected = linear_probe.train_logistic_regression_probe(X_tr_selected, self.y_tr, lambda_l1=0.001, lambda_l2=0.001)
            scores_tr = linear_probe.evaluate_probe(probe_selected, X_tr_selected, self.y_tr, idx_to_class=self.idx2label_tr)
            X_te_selected = ablation.filter_activations_keep_neurons(self.X_te, ordering[:n])
            scores_te = linear_probe.evaluate_probe(probe_selected, X_te_selected, self.y_te, idx_to_class=self.idx2label_te)
            return ordering, scores_tr, scores_te

In [8]:
# datatest[2], labeltest[4]
# datatrain[3], labeltrain[5]
ordered_neurons = {}
scores = {}
for path in all_paths:
    cat = Experiment(path[3], path[5], path[2], path[4])    
    cat_name = re.search(r'(?<=_)[a-zA-Z]+(?=.txt)', path[3])[0]
    neurons, scores_tr, scores_te = cat.nranking()
    ordered_neurons[cat_name] = neurons
    scores[cat_name] = [scores_tr, scores_te] 

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Number/activations_train.json...
2553 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Number/activations_te.json...
882 13.0
Number of tokens:  2553
length of source dictionary:  8554
length of target dictionary:  2
2553
Total instances: 2553
['Significant', 'Prior', 'Some', 'Previous', 'Leave', 'Beyond', 'On', 'Sunday', 'Besides', 'Arrogant', 'Hackers', 'Sensations', 'Brampton', 'Italian', 'Achieving', 'Makes', 'Street', '"We', 'Europe', '"Do']
Number of samples:  2553
Stats: Labels with their frequencies in the final set
Sing 1300
Plur 1253
Number of tokens:  882
length of source dictionary:  4076
length of target dictionary:  2
882
Total instances: 882
['Southerners', 'elderly', 'Even', 'Some', 'Leave', 'Space', 'Second', 'Use', 'Later', 'On', 'Village', 'Besides', 'Mandatory', 'More', 'Uh', 'Sociologists', 'These', 'Closing', 'North', 'My']
Number of samples:  

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0212


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0152


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0136


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0127


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0121


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0118


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0116


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0117


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0118


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0120


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.91


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.79


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Tense/activations_train.json...
2600 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Tense/activations_te.json...
900 13.0
Number of tokens:  2600
length of source dictionary:  8735
length of target dictionary:  2
2600
Total instances: 2600
['Significant', 'Prior', 'Some', 'Previous', 'Beyond', 'On', 'Besides', 'Arrogant', 'Note', 'Hackers', 'Sensations', 'Brampton', 'Late', 'Achieving', 'Makes', 'Street', 'Peace', 'Facebook', '"Also', '"We']
Number of samples:  2600
Stats: Labels with their frequencies in the final set
Pres 1300
Past 1300
Number of tokens:  900
length of source dictionary:  4224
length of target dictionary:  2
900
Total instances: 900
['Southerners', 'Even', 'Some', 'Leave', 'Space', 'Second', 'On', 'Later', 'Village', 'Besides', 'Mandatory', 'More', 'These', 'Winter', 'it', 'My', 'North', 'Virginia', 'Whilst', 'Late']
Number of samples:  900
Stat

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0200


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0150


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0129


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0124


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0122


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0120


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0115


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0111


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0108


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0105


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.95


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.82


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_VerbForm/activations_train.json...
2032 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_VerbForm/activations_te.json...
700 13.0
Number of tokens:  2032
length of source dictionary:  7352
length of target dictionary:  4
2032
Total instances: 2032
['Significant', 'Some', 'Previous', 'On', 'Arrogant', 'Hackers', 'Dance', 'Late', 'Achieving', 'Facebook', '"Also', '"We', 'God', 'Europe', 'Next', 'Data', 'Recent', 'Germany', 'Discuss', 'Why']
Number of samples:  2032
Stats: Labels with their frequencies in the final set
Inf 650
Part 650
Ger 82
Fin 650
Number of tokens:  700
length of source dictionary:  3315
length of target dictionary:  4
700
Total instances: 700
['Southerners', 'elderly', 'Even', '"However', 'Some', 'Space', 'Second', 'Use', 'On', 'Besides', '"So', 'Mandatory', 'Sociologists', 'These', 'Closing', 'it', 'My', 'North', 'Virginia', 'Whilst']
Number of sa

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0415


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0278


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0242


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0228


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0218


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0216


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0220


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0229


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0247


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0253


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.86


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.66


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_NumForm/activations_train.json...
1178 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_NumForm/activations_te.json...
363 13.0
Number of tokens:  1178
length of source dictionary:  5879
length of target dictionary:  3
1178
Total instances: 1178
['City', 'Some', 'Previous', 'On', 'Sunday', 'Besides', 'Late', 'Achieving', 'Siméon', 'Peace', 'Makes', '"Also', '"We', 'Previously', 'Next', 'Stardust', 'Non', 'Why', 'Fillmore', 'Jack']
Number of samples:  1178
Stats: Labels with their frequencies in the final set
Word 471
Digit 703
Roman 4
Number of tokens:  363
length of source dictionary:  2611
length of target dictionary:  3
363
Total instances: 363
['Even', 'Some', 'Space', 'Second', 'Later', 'On', 'Press', 'Mandatory', 'More', 'Sociologists', 'These', 'it', 'My', 'Late', 'Dvořák', 'East', 'No', 'neuroscientists', '"In', 'Everyone']
Number of samples:  363
Stats: Lab

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0282


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0157


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0129


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0118


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0115


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0115


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0117


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0120


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0121


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0122


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.88


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.69


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Person/activations_train.json...
2557 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Person/activations_te.json...
824 13.0
Number of tokens:  2557
length of source dictionary:  7745
length of target dictionary:  3
2557
Total instances: 2557
['Re', 'Some', 'Leave', 'Beyond', 'On', 'Besides', 'Note', 'Sensations', 'Dance', 'Brampton', 'Late', 'Spiral', 'Makes', 'Team', 'Facebook', '"Also', '"We', 'God', 'Europe', 'Arrange']
Number of samples:  2557
Stats: Labels with their frequencies in the final set
1 867
2 823
3 867
Number of tokens:  824
length of source dictionary:  3358
length of target dictionary:  3
824
Total instances: 824
['elderly', 'Even', 'Some', 'Leave', 'Space', 'Second', 'Use', 'On', 'Later', 'Press', 'Run', 'Besides', 'More', 'Uh', 'These', 'Winter', 'it', 'My', 'Dvořák', '"Josh']
Number of samples:  824
Stats: Labels with their frequencies in the 

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0195


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0136


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0117


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0111


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0107


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0106


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0107


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0109


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0112


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0116


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.94


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.86


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_PronType/activations_train.json...
1243 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_PronType/activations_te.json...
438 13.0
Number of tokens:  1243
length of source dictionary:  5221
length of target dictionary:  9
1243
Total instances: 1243
['Significant', 'Some', 'Leave', 'On', 'Besides', 'Dance', '"Such', 'Achieving', 'Spiral', 'Street', 'Peace', '"We', 'Anneal', 'Atwood', 'Germany', 'Hey', 'Discuss', 'Why', 'Stardust', 'Non']
Number of samples:  1243
Stats: Labels with their frequencies in the final set
Neg 7
Dem 260
Tot 90
Rel 79
Int 210
Prs 260
Emp 2
Ind 75
Art 260
Number of tokens:  438
length of source dictionary:  2495
length of target dictionary:  9
438
Total instances: 438
['Some', 'Leave', 'Space', 'Second', 'Use', 'On', 'Besides', 'Mandatory', '"So', 'These', 'My', 'Whilst', 'Libertarians', 'Sure', '"Josh', 'No', 'Find', 'Cyclones', '"In', 'Everyo

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0567


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0322


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0271


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0241


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0224


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0212


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0207


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0201


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0195


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0193


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.97


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.68


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Mood/activations_train.json...
1732 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Mood/activations_te.json...
527 13.0
Number of tokens:  1732
length of source dictionary:  6601
length of target dictionary:  2
1732
Total instances: 1732
['Significant', 'Re', 'Some', 'Leave', 'On', 'Arrogant', 'Note', 'Hackers', 'Dance', '"Such', 'Late', 'Italian', 'Peace', 'Street', 'Team', 'Facebook', 'Spiral', '"We', '"Do', 'Arrange']
Number of samples:  1732
Stats: Labels with their frequencies in the final set
Imp 432
Ind 1300
Number of tokens:  527
length of source dictionary:  2878
length of target dictionary:  2
527
Total instances: 527
['Even', 'Some', 'Leave', 'Second', 'Use', 'On', 'Press', 'Run', 'Mandatory', '"So', 'Besides', 'Uh', 'These', 'it', 'My', 'Late', 'Libertarians', '"Josh', 'No', 'neuroscientists']
Number of samples:  527
Stats: Labels with their frequencie

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0091


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0063


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0052


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0043


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0039


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0037


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0038


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0038


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0038


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0039


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.97


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.95


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Gender/activations_train.json...
1413 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Gender/activations_te.json...
445 13.0
Number of tokens:  1413
length of source dictionary:  5894
length of target dictionary:  4
1413
Total instances: 1413
['Some', 'On', 'Late', 'Achieving', 'Siméon', 'Makes', '"Do', 'Anneal', 'Previously', 'Next', 'Atwood', 'Recent', 'Why', 'Fillmore', 'Incidentally', 'Built', 'Keep', 'Zenghelis', 'If', 'Did']
Number of samples:  1413
Stats: Labels with their frequencies in the final set
Masc 472
Fem 289
Neut 650
Fem,Masc 2
Number of tokens:  445
length of source dictionary:  2593
length of target dictionary:  3
445
Total instances: 445
['"However', 'Space', 'On', '"So', 'More', 'Uh', 'it', 'My', 'Late', 'Dvořák', '"Josh', 'No', 'Cyclones', '"In', 'Everyone', 'Cyclone', 'Although', 'As', 'Congress', 'Columbia']
Number of samples:  445
Stats: La

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0276


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0174


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0133


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0116


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0107


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0100


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0095


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0092


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0090


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0088


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.98


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.51


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Case/activations_train.json...
1815 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Case/activations_te.json...
574 13.0
Number of tokens:  1815
length of source dictionary:  6908
length of target dictionary:  3
1815
Total instances: 1815
['Re', 'Some', 'On', 'Arrogant', 'Sensations', 'Late', 'Achieving', 'Siméon', 'Makes', 'Street', 'Team', 'Spiral', '"We', 'God', 'Previously', 'Anneal', '"Do', 'Atwood', 'Recent', 'Why']
Number of samples:  1815
Stats: Labels with their frequencies in the final set
Gen 645
Acc 303
Nom 867
Number of tokens:  574
length of source dictionary:  2960
length of target dictionary:  3
574
Total instances: 574
['Southerners', 'Even', '"However', 'Some', 'Space', 'Second', 'On', 'Village', 'Besides', '"So', 'These', 'My', 'Late', 'Dvořák', '"Josh', 'No', 'God', '"We', 'Find', 'Cyclones']
Number of samples:  574
Stats: Labels with their freq

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0325


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0219


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0191


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0182


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0171


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0159


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0154


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0150


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0145


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0143


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.93


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.72


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_NumType/activations_train.json...
919 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_NumType/activations_te.json...
317 13.0
Number of tokens:  919
length of source dictionary:  4979
length of target dictionary:  4
919
Total instances: 919
['City', 'Previous', 'Some', 'On', 'Sunday', 'Siméon', '"Also', '"We', 'Next', 'Why', 'Non', 'Fillmore', '1', '9', 'Incidentally', 'Move', 'Built', 'Canadians', 'Builders', 'Keep']
Number of samples:  919
Stats: Labels with their frequencies in the final set
Ord 209
Frac 39
Card 650
Mult 21
Number of tokens:  317
length of source dictionary:  2378
length of target dictionary:  4
317
Total instances: 317
['Even', 'Some', 'Second', 'Later', 'On', 'Press', 'Mandatory', 'More', 'Sociologists', 'These', 'My', 'Late', 'Dvořák', 'No', 'neuroscientists', '"In', 'Everyone', 'Implementing', 'Vava', 'Protection']
Number of samples:  317
St

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0357


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0200


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0155


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0140


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0130


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0127


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0126


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0134


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0138


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0164


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.82


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.72


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Definite/activations_train.json...
2417 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Definite/activations_te.json...
814 13.0
Number of tokens:  2417
length of source dictionary:  8968
length of target dictionary:  2
2417
Total instances: 2417
['City', 'Prior', 'Some', 'On', 'Besides', 'Arrogant', 'Dance', '"Such', 'Brampton', 'Achieving', 'Makes', 'Facebook', 'Street', '"We', 'Europe', 'Previously', 'Arrange', 'Next', 'Atwood', 'Data']
Number of samples:  2417
Stats: Labels with their frequencies in the final set
Def 1300
Ind 1117
Number of tokens:  814
length of source dictionary:  4136
length of target dictionary:  2
814
Total instances: 814
['Even', '"However', 'Some', 'Leave', 'Second', 'Use', 'Later', 'On', 'Besides', 'More', 'Uh', 'Sociologists', 'These', 'Michiel', 'Closing', 'North', 'My', 'it', 'Late', 'Whilst']
Number of samples:  814
Stats: Labels wi

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0247


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0187


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0166


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0162


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0164


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0171


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0177


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0171


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0135


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0123


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.94


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.75


  0%|          | 0/101 [00:00<?, ?it/s]

Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Degree/activations_train.json...
1121 13.0
Loading json activations from /home/senya/Документы/project/large_data_en_gum/data_Degree/activations_te.json...
382 13.0
Number of tokens:  1121
length of source dictionary:  5542
length of target dictionary:  3
1121
Total instances: 1121
['Re', 'Some', 'Previous', 'On', 'Arrogant', 'Late', 'Makes', 'Street', '"We', 'Why', '1', 'Move', 'Spread', 'Built', 'Ballet', 'Brock', 'La', 'Keep', 'History', 'If']
Number of samples:  1121
Stats: Labels with their frequencies in the final set
Cmp 151
Pos 867
Sup 103
Number of tokens:  382
length of source dictionary:  2502
length of target dictionary:  3
382
Total instances: 382
['Some', 'Space', 'Use', 'On', 'Very', 'Later', 'Run', 'Mandatory', 'More', 'Sociologists', 'These', 'Winter', 'Closing', 'My', 'Dvořák', 'No', 'neuroscientists', '"We', 'Cyclones', '"In']
Number of samples:  382
Stats: Labels with their frequencie

epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0321


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0200


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0197


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0184


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0162


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0142


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0129


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0123


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0120


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0117


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.94


Evaluating: 0it [00:00, ?it/s]

Score (accuracy) of the probe: 0.77


  0%|          | 0/101 [00:00<?, ?it/s]

In [9]:
with open('scores_gum.pkl', 'wb') as f:
    pickle.dump(scores, f)
with open('neurons_gum.pkl', 'wb') as f:
    pickle.dump(ordered_neurons, f)