### Importing Dependencies

In [1]:
import pickle

def load_set(directory):
    try:
        with open(f"{directory}/texts.pkl", "rb") as fp:
            processed_texts = pickle.load(fp)
        
        with open(f"{directory}/labels.pkl", "rb") as fp:
            labels = pickle.load(fp)
    
    except:
        print(f'{directory} files not found. Please run the preprocess.ipynb before!')
    
    return processed_texts, labels

In [2]:
processed_texts, labels = load_set('train')
processed_val_texts, val_labels = load_set('val')
processed_test_texts, test_labels = load_set('test')

#### Vectorization

In [3]:
import spacy
from thinc.api import set_gpu_allocator, require_gpu

set_gpu_allocator("pytorch")
require_gpu(0)

nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner', 'tagger', 'attribute_ruler', 'lemmatizer'])
nlp.pipe_names

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


['transformer']

In [4]:
import pickle
from tqdm import tqdm

def vectorize_set(texts, directory):
    print(f'Preprocessing {directory} data')

    docs = (doc for doc in nlp.pipe(texts))
    vectorized_texts = []
    for doc in tqdm(docs, total=len(texts), ncols=80):
        vectorized_texts.append(doc._.trf_data.all_outputs[0])

    from pathlib import Path
    Path(f"{directory}").mkdir(parents=True, exist_ok=True)

    with open(f"{directory}/v_texts.pkl", "wb") as fp:
        pickle.dump(vectorized_texts, fp)
    

    return vectorized_texts

In [5]:
vectorized_texts = vectorize_set(processed_texts, 'train')

Preprocessing train data


 29%|██████████▏                        | 54336/185658 [05:58<14:27, 151.37it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 94.00 MiB. GPU 

In [None]:
vectorized_val_texts = vectorize_set(processed_val_texts, 'val')

NameError: name 'processed_val_texts' is not defined

In [None]:
vectorized_test_texts = vectorize_set(processed_test_texts, 'test')

Preprocessing test data
