In [1]:
import json
import zlib

import numpy as np
import pandas as pd
import requests

In [2]:
PARQUET_DIR = 'hello_dolly.parquet' # name of directory where we will save formatted data
DATA_URL = "https://github.com/databrickslabs/dolly/blob/master/data/databricks-dolly-15k.jsonl?raw=true"

# Download the JSON Lines file and parse one JSON record per non-empty line.
rows = []
# The context manager releases the streamed connection even if parsing fails part-way;
# the timeout keeps a stalled server from hanging the notebook indefinitely.
with requests.get(DATA_URL, stream=True, timeout=60) as resp:
    # Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
    resp.raise_for_status()
    for line in resp.iter_lines():
        if line: # filter out keep-alive new lines
            decoded_line = line.decode('utf-8')
            rows.append(json.loads(decoded_line))

# One DataFrame row per parsed JSON record.
dolly_data = pd.DataFrame(rows)

Which text columns contain empty (missing) values?

In [3]:
def fraction_present(df, col):
    """Return the fraction of entries in ``df[col]`` that hold non-empty data.

    An entry counts as present when it is a sized object (for example a
    string) whose length is greater than zero. Entries that have no length,
    such as ``None`` or ``NaN``, are counted as absent rather than raising
    ``TypeError``.

    Parameters
    ----------
    df : object indexable by column name (e.g. a pandas DataFrame)
        Container holding the column to inspect.
    col : hashable
        Key of the column in ``df``.

    Returns
    -------
    float
        Share of present entries, between 0 and 1; ``nan`` when the column
        has no entries at all.
    """
    present = [hasattr(x, '__len__') and len(x) > 0 for x in df[col]]
    if not present:
        # Return nan explicitly so an empty column does not trigger NumPy's
        # "mean of empty slice" warning.
        return np.nan
    return np.mean(present)

# Names of the columns that are checked for empty entries.
text_cols = ['instruction', 'context', 'response']

print("Fraction of each text column containing data:")
for text_col in text_cols:
    # Compute the share first, then report it next to the column name.
    share = fraction_present(dolly_data, text_col)
    print(text_col, ': ', share)

Fraction of each text column containing data:
instruction :  1.0
context :  0.3062475023311576
response :  1.0


Compute embeddings for instruction and response columns:

In [4]:
from sentence_transformers import SentenceTransformer

# Build the encoder once; the same instance is reused for both text columns.
sentxformer = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode each source column and store the float32 result as plain Python lists
# in the matching "*_vector" column.
for source_col, vector_col in (
    ('instruction', 'instruction_vector'),
    ('response', 'response_vector'),
):
    embeddings = sentxformer.encode(dolly_data[source_col].values)
    dolly_data[vector_col] = np.float32(embeddings).tolist()

# Save the featurized dataset

In [5]:
# Partition dataset
# Built-in hash() on str is salted per interpreter process (unless PYTHONHASHSEED is
# fixed), so it would assign different buckets after every kernel restart. CRC32 of
# the UTF-8 bytes gives the same bucket for the same instruction on every run.
NUM_BUCKETS = 10
dolly_data['bucket'] = [
    zlib.crc32(k.encode('utf-8')) % NUM_BUCKETS for k in dolly_data['instruction']
]

# Writes one sub-directory per distinct bucket value under PARQUET_DIR.
# NOTE(review): re-running this cell against an existing PARQUET_DIR may add new
# files next to the old ones rather than replace them — confirm, and clear the
# directory before re-running if duplicates matter.
dolly_data.to_parquet(PARQUET_DIR, partition_cols=['bucket'])