In [1]:
import json
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def load_mock_data(file_path):
    """Read a JSON file and return its parsed contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces
        for the file: dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; pin the encoding instead of relying
    # on the platform/locale default, which is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Load the table records and the table-level metadata from the mock JSON files.
# Both paths are relative, so they resolve against the kernel's working directory.
data = load_mock_data('mock_data/table_records.json')
metadata = load_mock_data('mock_data/metadata.json')

In [3]:
# Initialize the sentence-embedding model from the 'all-MiniLM-L6-v2' checkpoint.
# The shapes displayed at the end of the notebook show it yields 384-dimensional vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
# Serialize data records
def serialize_records(data):
    """Render every row of a column-oriented table as one labelled text line.

    Args:
        data: Mapping from column name to an index-addressable sequence.
            The columns 'StationID', 'Date', 'Temperature', 'Humidity' and
            'WindSpeed' are read; the row count is taken from 'StationID'.

    Returns:
        A list of strings, one per row, each of the form
        "StationID: ..., Date: ..., Temperature: ..., Humidity: ..., WindSpeed: ...".
    """
    row_count = len(data['StationID'])
    return [
        f"StationID: {data['StationID'][row]}, Date: {data['Date'][row]}, Temperature: {data['Temperature'][row]}, Humidity: {data['Humidity'][row]}, WindSpeed: {data['WindSpeed'][row]}"
        for row in range(row_count)
    ]

# Despite the name, data_str is a list holding one serialized string per record.
data_str = serialize_records(data)

In [10]:
data_str

['StationID: WS001, Date: 2024-02-14, Temperature: 25, Humidity: 80, WindSpeed: 15',
 'StationID: WS002, Date: 2024-02-14, Temperature: 22, Humidity: 85, WindSpeed: 10']

In [11]:
# Serialize metadata: flatten every string-valued field into "key: value"
# fragments separated by single spaces; non-string values are left out.
metadata_str = ' '.join(
    f"{name}: {text}"
    for name, text in metadata.items()
    if isinstance(text, str)
)


In [12]:
metadata_str

"TableName: WeatherStationData DataLineage: Derived from raw sensor readings aggregated daily. UpdateHistory: Last updated on 2024-02-14. EarliestData: Data collection started in 2020. TableDescription: Daily weather measurements from various stations. LevelOfDetail: Measurements are aggregated at the daily level. GeoTimeGranularity: Each record represents one day's data per station, global coverage. UsageData: 1000 queries last month. HighLevelTask: Can be used for climate change research, local weather forecasting."

In [13]:
# Embed data records: data_str is a list of strings, so the result is a 2-D
# array with one row per record (shape (2, 384) for the two mock records).
data_embeddings = model.encode(data_str)

In [14]:
# Embed metadata: a single string goes in, so the result is one 1-D vector
# (shape (384,)), not a 2-D batch like the record embeddings.
# NOTE(review): the model presumably truncates text beyond its maximum sequence
# length — confirm that longer metadata strings are not silently cut off.
metadata_embeddings = model.encode(metadata_str)

In [16]:
data_embeddings.shape

(2, 384)

In [17]:
metadata_embeddings.shape

(384,)