### Importing Dependencies and Loading Data

In [1]:
import pickle

def load_set(directory):
    """Load the pickled texts and labels of one dataset split.

    Reads ``texts.pkl`` and ``labels.pkl`` from ``directory``.

    Args:
        directory: Folder holding the two pickle files (e.g. ``'train'``).

    Returns:
        A ``(processed_texts, labels)`` tuple with the unpickled objects.

    Raises:
        FileNotFoundError: If either pickle file is missing. The message
            points the user to the preprocessing notebook.
    """
    try:
        # NOTE: pickle.load can execute arbitrary code while deserializing;
        # only point this at files you produced yourself.
        with open(f"{directory}/texts.pkl", "rb") as fp:
            processed_texts = pickle.load(fp)

        with open(f"{directory}/labels.pkl", "rb") as fp:
            labels = pickle.load(fp)
    except FileNotFoundError as err:
        # Fail with an actionable message instead of falling through to the
        # return statement with unbound local variables.
        raise FileNotFoundError(
            f'{directory} files not found. Please run the preprocess.ipynb before!'
        ) from err

    return processed_texts, labels

In [2]:
# Load the pickled (texts, labels) pair of each split directory via load_set.
processed_texts, labels = load_set('train')
processed_val_texts, val_labels = load_set('val')
processed_test_texts, test_labels = load_set('test')

#### Vectorization

In [3]:
import spacy

# Load the 'en_core_web_trf' pipeline with the listed components disabled.
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner', 'tagger', 'attribute_ruler', 'lemmatizer'])
# Display the names of the components that remain active in the pipeline.
nlp.pipe_names

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


['transformer']

In [4]:
import pickle
from alive_progress import alive_it

def vectorize_set(texts, directory, limit=1000):
    """Vectorize texts with the global ``nlp`` pipeline and pickle the result.

    The first ``limit`` texts are streamed through ``nlp.pipe``; for every
    resulting doc the first entry of ``doc._.trf_data.all_outputs`` is kept.
    The collected list is written to ``{directory}/v_texts.pkl`` (the
    directory is created if it does not exist) and is also returned.

    Args:
        texts: Sliceable sequence of texts to vectorize.
        directory: Output folder; also used in the progress message.
        limit: Maximum number of leading texts to process. Defaults to 1000;
            pass ``None`` to process every text.

    Returns:
        A list with one entry per processed text.
    """
    # Local import: pathlib is not part of the notebook's import cell.
    from pathlib import Path

    print(f'Preprocessing {directory} data')

    # texts[:None] is the full sequence, so limit=None disables the cap.
    vectorized_texts = [doc._.trf_data.all_outputs[0] for doc in nlp.pipe(texts[:limit])]

    out_dir = Path(directory)
    out_dir.mkdir(parents=True, exist_ok=True)

    with open(out_dir / "v_texts.pkl", "wb") as fp:
        pickle.dump(vectorized_texts, fp)

    return vectorized_texts

In [5]:
# Vectorize the training texts; vectorize_set also pickles the result to 'train/v_texts.pkl'.
vectorized_texts = vectorize_set(processed_texts, 'train')

Preprocessing train data


In [6]:
# Vectorize the validation texts; vectorize_set also pickles the result to 'val/v_texts.pkl'.
vectorized_val_texts = vectorize_set(processed_val_texts, 'val')

Preprocessing val data


In [None]:
# Vectorize the test texts; vectorize_set also pickles the result to 'test/v_texts.pkl'.
vectorized_test_texts = vectorize_set(processed_test_texts, 'test')