## Make Document Embeddings

**NOTE:** This notebook was run on a Paperspace Gradient instance.

In [None]:
import os

import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer

from utils.data import load_data
from utils.embeddings import (
    make_embeddings,
    save_embeddings
)

from tqdm import tqdm
# Hook tqdm into pandas so that `progress_apply`-style methods become available.
# NOTE(review): nothing in the visible cells calls them directly — presumably the
# `utils.embeddings` helpers rely on this registration; confirm before removing.
tqdm.pandas()

## Load the data
**NOTE:** This notebook was run on a Paperspace Gradient instance.
- The training data is fetched from the Gradient dataset.

In [None]:
# Relative path of the training CSV that is handed to the project's loader.
DATA = 'training_data/training_data.csv'

# Read the data through the project helper.
df = load_data(DATA)

# Report the size of the dataset: total rows (labelled as sentences) and the
# number of distinct `full_id` values (labelled as posts/comments).
n_sentences = len(df)
n_documents = len(df["full_id"].unique())
print(f'N sentences: {n_sentences}')
print(f'N unique posts/comments: {n_documents}')

### Inspect data:

In [None]:
# Show the first few rows as a quick sanity check of the loaded data before encoding.
df.head()

## Make Embeddings
- Choose the SentenceTransformer model by setting the `EMBEDDING_MODEL` parameter: here, I use **[all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)**.
- The embeddings will automatically be saved to the `embeddings` subfolder.
- If a saved embeddings file for the chosen model already exists, encoding is skipped.

In [None]:
%%time

# Name of the SentenceTransformer model to encode with; change this to try another model.
EMBEDDING_MODEL = 'all-mpnet-base-v2'
# Each model gets its own cache file, so switching models never overwrites earlier results.
EMBEDDING_MODEL_PATH = os.path.join('embeddings', f'{EMBEDDING_MODEL}.pickle')

if not os.path.exists(EMBEDDING_MODEL_PATH):
    # Make sure the output folder exists *before* the expensive encoding step, so a
    # missing directory cannot cause the save to fail after the work is already done.
    # This is a no-op when the folder is already present.
    os.makedirs(os.path.dirname(EMBEDDING_MODEL_PATH), exist_ok=True)

    print(f'No {EMBEDDING_MODEL} embeddings found: making embeddings')
    embeddings = make_embeddings(df=df, doc_column='text', model=EMBEDDING_MODEL)

    print(f'Saving {EMBEDDING_MODEL} embeddings')
    save_embeddings(embeddings, EMBEDDING_MODEL_PATH)
else:
    # The cache file already exists: nothing is encoded and nothing is loaded here,
    # so `embeddings` is NOT defined in the kernel on this path.
    print(f'Found {EMBEDDING_MODEL} at {EMBEDDING_MODEL_PATH}: skipping encoding.')

print('Done!')

`---Complete---`