In [None]:
#!pip install neurox
#!pip install torch

In [1]:
import os
import sys
import unittest
from pathlib import Path
from IPython.display import clear_output

In [2]:
sys.path.append('../src')
from data_classes import get_file_names, ConvertSample, GetEmbeddings
from probing_classes import get_converted_filenames, Experiment, Trainer

In [3]:
data_path = f'{str(Path(os.getcwd()).parents[0])}/data/data_en_ewt'
file1 = data_path+'/en_ewt_Case.csv'
file2 = data_path+'/en_ewt_Definite.csv'
large_data_path = f'{str(Path(os.getcwd()).parents[0])}/data/large_data_en_ewt'

class TestConverter(unittest.TestCase): #tests ConvertSample
    
    def test_get_file_names(self):
        # asserts we have 2 files in data_path (.csv files)
        a = get_file_names(data_path)
        b = 2
        self.assertEqual(len(a), b)
        
    def setUp(self):
        # creates splitter -- train, test, control task
        self.splitter = ConvertSample(file1, train_size=3, test_size=3)
        self.splitter_ = ConvertSample(file2, train_size=3, test_size=3)
        
    def test_read(self):
        # asserts read func works as supposed
        a = len(self.splitter.read())
        b = 9226
        self.assertEqual(a, b)
        
    def test_sampler(self):
        # asserts we have same labels in train and test
        train, test = self.splitter.stupid_sampler()
        self.assertEqual(set(train.values()), set(test.values()))
        
    def test_permute(self):
        # asserts train and control task are different dicts
        train, test = self.splitter.stupid_sampler()    
        b = self.splitter.using_shuffle(train)
        self.assertEqual(train.keys(), b.keys())
    
    def test_create_dicts(self):
        # asserts train and control task are different dicts
        dict_train, dict_test, dict_task = self.splitter.create_dicts()
        self.assertNotEqual(dict_train, dict_task)
                         
    def test_create_paths(self):
        # asserts we actually create paths
        self.assertIsNotNone(self.splitter.create_paths())
    
    def test_writter(self):
        # asserts we get paths
        self.assertIsNotNone(self.splitter.writer())
        self.assertIsNotNone(self.splitter_.writer())

In [4]:
unittest.main(argv=[''], verbosity=2, exit=False)

test_create_dicts (__main__.TestConverter) ... ok
test_create_paths (__main__.TestConverter) ... ok
test_get_file_names (__main__.TestConverter) ... ok
test_permute (__main__.TestConverter) ... ok
test_read (__main__.TestConverter) ... ok
test_sampler (__main__.TestConverter) ... ok
test_writter (__main__.TestConverter) ... ok

----------------------------------------------------------------------
Ran 7 tests in 0.248s

OK


<unittest.main.TestProgram at 0x7f921252e520>

In [5]:
#get embeddings
path_to_file = large_data_path+'/data_Case'
case = sorted(get_file_names(path_to_file))[2:]
emb_case = GetEmbeddings(case[1], case[0])
emb_case.jsons('bert-base-uncased')
clear_output(wait=False)

In [6]:
class TestError(unittest.TestCase):
    def test1(self):
        # asserts that func raises Error
        with self.assertRaises(IndexError):
            get_converted_filenames(large_data_path)

In [7]:
unittest.main(argv=[''], verbosity=2, exit=False)

test_create_dicts (__main__.TestConverter) ... ok
test_create_paths (__main__.TestConverter) ... ok
test_get_file_names (__main__.TestConverter) ... ok
test_permute (__main__.TestConverter) ... ok
test_read (__main__.TestConverter) ... ok
test_sampler (__main__.TestConverter) ... ok
test_writter (__main__.TestConverter) ... ok
test1 (__main__.TestError) ... ok

----------------------------------------------------------------------
Ran 8 tests in 0.150s

OK


<unittest.main.TestProgram at 0x7f9212539130>

In [8]:
#get embeddings
path_to_file = large_data_path+'/data_Definite'
defin = sorted(get_file_names(path_to_file))[2:]
emb_defin = GetEmbeddings(defin[1], defin[0])
emb_defin.jsons('bert-base-uncased')
clear_output(wait=False)

In [9]:
true, control = get_converted_filenames(large_data_path)

In [10]:
class TestExperiment(unittest.TestCase): # tests Experiment
    
    def test_get_converted_filenames(self):
        # asserts large_data_path has 2 directories
        a = get_converted_filenames(large_data_path)
        b = 2
        self.assertEqual(len(a), b)
        
    def setUp(self):
        # creates Experiment object
        self.exp = Experiment(*true[0])
        
    def test_Exp_attributes(self):
        # checks if attributes are extractable
        self.assertEqual(self.exp.dataset, 'en_ewt')
        self.assertEqual(self.exp.category, 'Case')
        
    def test_data_size(self):
        # asserts we return a tuple of size 3
        self.assertEqual(len(self.exp.data_size()), 3)
        
        
class TestTrainer(unittest.TestCase): # tests Trainer
    
    def setUp(self):
        # creates Trainer object for one and multiple categories
        self.train_one = Trainer(true[0])
        self.train_two = Trainer(true)
    
    def test_type_str(self):
        # asserts differentiates type
        self.assertTrue(self.train_one.type is str)
        
    def test_type_list(self):
        self.assertTrue(self.train_two.type is list)

In [11]:
unittest.main(argv=[''], verbosity=2, exit=False)

test_create_dicts (__main__.TestConverter) ... ok
test_create_paths (__main__.TestConverter) ... ok
test_get_file_names (__main__.TestConverter) ... ok
test_permute (__main__.TestConverter) ... ok
test_read (__main__.TestConverter) ... ok
test_sampler (__main__.TestConverter) ... ok
test_writter (__main__.TestConverter) ... ok
test1 (__main__.TestError) ... FAIL
test_Exp_attributes (__main__.TestExperiment) ... ok
test_data_size (__main__.TestExperiment) ... 

Loading json activations from /home/senya/Документы/project/data/large_data_en_ewt/data_Case/activations_train.json...
3 13.0
Loading json activations from /home/senya/Документы/project/data/large_data_en_ewt/data_Case/activations_te.json...
3 13.0
Number of tokens:  3
length of source dictionary:  59
length of target dictionary:  3
3
Total instances: 3
['To', 'Thanks', 'I']
Number of samples:  3
Stats: Labels with their frequencies in the final set
Gen 1
Nom 1
Acc 1
Number of tokens:  3
length of source dictionary:  30
length of target dictionary:  3
3
Total instances: 3
['Please', 'Sounds', 'We']
Number of samples:  3
Stats: Labels with their frequencies in the final set
Gen 1
Nom 1
Acc 1
Loading json activations from /home/senya/Документы/project/data/large_data_en_ewt/data_Case/activations_train.json...


ok
test_get_converted_filenames (__main__.TestExperiment) ... 

3 13.0
Loading json activations from /home/senya/Документы/project/data/large_data_en_ewt/data_Case/activations_te.json...
3 13.0
Number of tokens:  3
length of source dictionary:  59
length of target dictionary:  3
3
Total instances: 3
['To', 'Thanks', 'I']
Number of samples:  3
Stats: Labels with their frequencies in the final set
Gen 1
Nom 1
Acc 1
Number of tokens:  3
length of source dictionary:  30
length of target dictionary:  3
3
Total instances: 3
['Please', 'Sounds', 'We']
Number of samples:  3
Stats: Labels with their frequencies in the final set
Gen 1
Nom 1
Acc 1
Loading json activations from /home/senya/Документы/project/data/large_data_en_ewt/data_Case/activations_train.json...
3 13.0
Loading json activations from /home/senya/Документы/project/data/large_data_en_ewt/data_Case/activations_te.json...


ok
test_type_list (__main__.TestTrainer) ... ok
test_type_str (__main__.TestTrainer) ... ok

FAIL: test1 (__main__.TestError)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_191318/75263631.py", line 5, in test1
    get_converted_filenames(large_data_path)
AssertionError: IndexError not raised

----------------------------------------------------------------------
Ran 13 tests in 0.751s

FAILED (failures=1)


3 13.0
Number of tokens:  3
length of source dictionary:  59
length of target dictionary:  3
3
Total instances: 3
['To', 'Thanks', 'I']
Number of samples:  3
Stats: Labels with their frequencies in the final set
Gen 1
Nom 1
Acc 1
Number of tokens:  3
length of source dictionary:  30
length of target dictionary:  3
3
Total instances: 3
['Please', 'Sounds', 'We']
Number of samples:  3
Stats: Labels with their frequencies in the final set
Gen 1
Nom 1
Acc 1


<unittest.main.TestProgram at 0x7f92124572b0>