In [2]:
!pip install -q openai h5py

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
import os
import json
import openai
import time
import random
import multiprocessing
import h5py
from openai import OpenAI

# Initialize OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable rather than
# being hard-coded: a secret committed to source is readable by anyone with
# access to the file. Any key that was previously embedded here should be
# considered leaked and revoked.
API_KEY = os.environ.get('OPENAI_API_KEY')
openai.api_key = API_KEY

# Function to process a JSON file and extract the relevant data
def extract_data(file_path):
    """Flatten one transcript JSON file into per-speaker utterance records.

    The file must contain a list of documents; each document has a ``TURNS``
    list whose items provide the ``NAMES``, ``UTTERANCES`` and ``NUMBER`` keys.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of dicts with the keys ``name``, ``utterance``,
        ``turn_number`` and ``file_path`` -- one dict per (turn, name) pair,
        in the order they appear in the file.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    extracted_data = []
    for document in data:
        for turn in document['TURNS']:
            names = turn['NAMES']
            if not names:
                # No speaker -> no record for this turn.
                continue
            # These values are identical for every speaker of the turn, so
            # build them once instead of once per name.
            utterance = ' '.join(turn['UTTERANCES'])
            turn_number = turn['NUMBER']
            extracted_data.extend(
                {
                    'name': name,
                    'utterance': utterance,
                    'turn_number': turn_number,
                    'file_path': file_path,
                }
                for name in names
            )
    return extracted_data

# Wrapper function to enable multiprocessing
def extract_data_wrapper(args: tuple):
    """Unpack an argument tuple and forward it to ``extract_data``.

    ``load_data`` hands ``Pool.map`` a list of one-element tuples
    ``(file_path,)``; this wrapper expands each tuple into positional
    arguments and returns whatever ``extract_data`` returns.
    """
    return extract_data(*args)

# Load data from JSON files in the specified folder and its subdirectories using multiprocessing
def load_data(folder_path, num_workers=4):
    """Collect utterances and their metadata from every ``.json`` file under a folder.

    The folder is walked recursively, each JSON file is parsed in a worker
    pool of ``num_workers`` processes, and the per-file records are flattened
    into two parallel lists.

    Args:
        folder_path: Root directory to search.
        num_workers: Size of the multiprocessing pool.

    Returns:
        ``(all_texts, all_metadata)`` where ``all_texts[i]`` is an utterance
        string and ``all_metadata[i]`` is a dict with the keys ``name``,
        ``turn_number`` and ``file_path`` describing it.
    """
    json_files = [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith('.json')
    ]

    # Pool.map passes one object per task, hence the one-element tuples.
    job_args = [(path,) for path in json_files]
    with multiprocessing.Pool(num_workers) as pool:
        per_file_records = pool.map(extract_data_wrapper, job_args)

    all_texts = []
    all_metadata = []
    for records in per_file_records:
        for record in records:
            all_texts.append(record['utterance'])
            all_metadata.append({
                'name': record['name'],
                'turn_number': record['turn_number'],
                'file_path': record['file_path'],
            })

    print(f"Processed {len(json_files)} files and extracted {len(all_texts)} utterances.")
    return all_texts, all_metadata

# Define the Embedding System
class EmbeddingSystem:
    """Embed texts with an OpenAI embedding model, fanning out over processes.

    Attributes are set in ``__init__``: ``client`` is the ``OpenAI`` client
    built from the given API key and ``model`` is the embedding model name
    passed to every request.
    """

    # Texts handed to each worker process.
    CHUNK_SIZE = 5
    # Consecutive connection failures tolerated per text before giving up,
    # so that a persistent outage cannot hang a worker forever.
    MAX_CONNECTION_RETRIES = 5

    def __init__(self, api_key, embedding_model):
        """Create the API client and remember which model to use."""
        self.client = OpenAI(api_key=api_key)
        self.model = embedding_model

    def create_embeddings(self, texts, num_workers=4):
        """Embed ``texts`` and return the embeddings in input order.

        One process is started per group of ``CHUNK_SIZE`` texts and all of
        them run concurrently.

        Args:
            texts: Sequence of strings to embed.
            num_workers: Accepted for interface compatibility; the number of
                processes is determined by ``len(texts)`` and ``CHUNK_SIZE``,
                not by this value.

        Returns:
            A flat list of embeddings. Texts whose request failed permanently
            are absent from the list (see ``worker``), so the result can be
            shorter than ``texts``.
        """
        if not texts:
            return []

        # The context manager shuts the manager process down when finished.
        with multiprocessing.Manager() as manager:
            return_dict = manager.dict()
            jobs = []
            for index, start in enumerate(range(0, len(texts), self.CHUNK_SIZE)):
                texts_chunk = texts[start:start + self.CHUNK_SIZE]
                p = multiprocessing.Process(
                    target=self.worker,
                    args=(texts_chunk, return_dict, index),
                )
                jobs.append(p)
                p.start()

            for proc in jobs:
                proc.join()

            # Copy the results out before the manager (and its proxies) go away.
            results = dict(return_dict.items())

        # Workers finish in arbitrary order, so the shared dict's own ordering
        # reflects completion time. Reassemble by chunk index to keep the
        # embeddings aligned with the input texts.
        embeddings = []
        for index in sorted(results):
            embeddings.extend(results[index])
        return embeddings

    def worker(self, texts, return_dict, index):
        """Embed ``texts`` one by one and store the list under ``return_dict[index]``.

        Rate-limit errors are retried after a random 1-60 s pause. Connection
        errors are retried after 2 s, at most ``MAX_CONNECTION_RETRIES`` times.
        Any other error is printed and the text is skipped.
        """
        embeddings = []
        for text in texts:
            connection_failures = 0
            while True:
                try:
                    embedding = self.get_embedding(text)
                    embeddings.append(embedding)
                    break
                except openai.RateLimitError:
                    # Randomized wait so parallel workers do not retry in lockstep.
                    wait_time = random.uniform(1, 60)
                    time.sleep(wait_time)
                except openai.APIConnectionError as e:
                    # Must precede APIError: APIConnectionError is a subclass
                    # of it and would otherwise never reach this handler.
                    print(f"Failed to connect to OpenAI API: {e}")
                    connection_failures += 1
                    if connection_failures >= self.MAX_CONNECTION_RETRIES:
                        break
                    time.sleep(2)
                except openai.APIError as e:
                    print(f"OpenAI API returned an API Error: {e}")
                    break
                except Exception as e:
                    print(f"Unexpected error in process {index}: {e}")
                    break
        # NOTE(review): a skipped text leaves no placeholder, so the caller
        # cannot tell which input is missing -- confirm this is acceptable.
        return_dict[index] = embeddings

    def get_embedding(self, text):
        """Return the embedding vector for a single text.

        Newlines are replaced by spaces before the request is sent.
        """
        text = text.replace("\n", " ")
        return self.client.embeddings.create(input=[text], model=self.model).data[0].embedding

def _append_rows(dataset, rows):
    """Grow ``dataset`` along axis 0 by ``len(rows)`` and write ``rows`` at the end."""
    dataset.resize(dataset.shape[0] + len(rows), axis=0)
    dataset[-len(rows):] = rows


def save_embeddings(embeddings, metadata, filename, model_name):
    """Append embeddings and their metadata to an HDF5 file.

    On the first call the datasets ``embeddings``, ``names``,
    ``turn_numbers``, ``file_paths`` and ``model_names`` are created as
    resizable along axis 0; later calls extend them.

    Args:
        embeddings: Sequence of equal-length embedding vectors.
        metadata: Sequence of dicts with the keys ``name``, ``turn_number``
            and ``file_path``; ``metadata[i]`` describes ``embeddings[i]``.
        filename: Path of the HDF5 file, opened in append mode.
        model_name: Model name stored alongside every row.

    Raises:
        ValueError: If ``embeddings`` and ``metadata`` differ in length.
            Writing them anyway would shift every later row out of alignment.
    """
    if len(embeddings) != len(metadata):
        raise ValueError(
            f"Cannot save {len(embeddings)} embeddings with {len(metadata)} metadata rows: "
            "lengths must match"
        )
    if len(embeddings) == 0:
        # Nothing to write; embeddings[0] and the [-0:] slice below would misbehave.
        return

    text_columns = {
        'names': [meta['name'].encode('utf8') for meta in metadata],
        'file_paths': [meta['file_path'].encode('utf8') for meta in metadata],
        'model_names': [model_name.encode('utf8') for _ in metadata],
    }
    turn_numbers = [meta['turn_number'] for meta in metadata]

    with h5py.File(filename, 'a') as f:
        if 'embeddings' not in f:
            f.create_dataset('embeddings', data=embeddings, maxshape=(None, len(embeddings[0])))
            # Variable-length strings: a dtype inferred from the first batch
            # would be fixed-width and silently truncate longer values
            # appended by later batches.
            text_dtype = h5py.string_dtype(encoding='utf-8')
            for key, values in text_columns.items():
                f.create_dataset(key, data=values, dtype=text_dtype, maxshape=(None,))
            f.create_dataset('turn_numbers', data=turn_numbers, maxshape=(None,))
        else:
            _append_rows(f['embeddings'], embeddings)
            for key, values in text_columns.items():
                _append_rows(f[key], values)
            _append_rows(f['turn_numbers'], turn_numbers)

# Worker function for multiprocessing
def embedding_worker(texts_chunk, metadata_chunk, model_config, api_key, chunk_index, output_file, lock):
    """Embed one chunk of texts and append the result to the output file.

    Args:
        texts_chunk: Texts to embed.
        metadata_chunk: Metadata dicts parallel to ``texts_chunk``.
        model_config: Dict whose ``'name'`` entry is the embedding model.
        api_key: OpenAI API key used to build the client.
        chunk_index: Position of this chunk; not used inside the function.
        output_file: HDF5 file the embeddings are appended to.
        lock: Context-manager lock held while writing to ``output_file``.
    """
    system = EmbeddingSystem(api_key=api_key, embedding_model=model_config['name'])
    vectors = system.create_embeddings(texts_chunk)

    # Only one process may append to the HDF5 file at a time.
    with lock:
        save_embeddings(vectors, metadata_chunk, output_file, model_config['name'])

def _join_jobs(jobs):
    """Wait for each ``(chunk_index, process)`` pair; return indices whose process failed."""
    failed = []
    for chunk_index, job in jobs:
        job.join()
        if job.exitcode != 0:
            failed.append(chunk_index)
    return failed


# Main function to execute the script
def main():
    """Embed every utterance under the data folder and store the results in HDF5.

    The texts are split into chunks of 512; chunks are processed by
    ``embedding_worker`` processes in batches of ``num_workers``, each batch
    being joined before the next one starts. Chunks whose worker exits with a
    non-zero status are reported at the end.
    """
    folder_path = '/workspace/slice-monorepo/cl_cr3/aligneddata/c=3'
    embedding_model = 'text-embedding-3-small'  # Replace with the desired OpenAI model
    num_workers = 2
    output_file = 'utterance_embeddings.h5'
    chunk_size = 512  # Batch size for processing

    # Load data
    print("Loading data...")
    texts, metadata = load_data(folder_path, num_workers=num_workers)
    total_texts = len(texts)

    # Divide the work into chunks
    chunked_texts = [texts[i:i + chunk_size] for i in range(0, total_texts, chunk_size)]
    chunked_metadata = [metadata[i:i + chunk_size] for i in range(0, total_texts, chunk_size)]
    # Count the chunks actually produced; floor division undercounts whenever
    # the last chunk is only partially filled.
    total_chunks = len(chunked_texts)

    failed_chunks = []
    # The context manager shuts the manager process down once all workers are done.
    with multiprocessing.Manager() as manager:
        lock = manager.Lock()
        jobs = []

        # Run a set number of workers at a time
        for chunk_index, (texts_chunk, metadata_chunk) in enumerate(zip(chunked_texts, chunked_metadata)):
            print(f"Chunk: {chunk_index + 1}/{total_chunks}")
            p = multiprocessing.Process(
                target=embedding_worker,
                args=(texts_chunk, metadata_chunk, {'name': embedding_model}, API_KEY, chunk_index, output_file, lock),
            )
            jobs.append((chunk_index, p))
            p.start()

            # Ensure no more than num_workers are running at once
            if len(jobs) >= num_workers:
                failed_chunks.extend(_join_jobs(jobs))
                jobs = []  # Reset jobs list for the next set of workers

        # Wait for remaining jobs to finish
        failed_chunks.extend(_join_jobs(jobs))

    if failed_chunks:
        # A crashed worker would otherwise go unnoticed and leave a gap in the file.
        print(f"Warning: worker processes for chunks {failed_chunks} exited with an error; "
              f"their embeddings may be missing from {output_file}")
    print(f"All chunks processed and saved to {output_file}")

if __name__ == '__main__':
    main()


In [3]:
import h5py

def view_h5_file(file_path, num_entries=5):
    """Print the first rows stored in an embeddings HDF5 file.

    Args:
        file_path: Path of the HDF5 file to open read-only. It must contain
            the datasets ``embeddings``, ``names``, ``turn_numbers``,
            ``file_paths`` and ``model_names``.
        num_entries: Maximum number of rows to print. Fewer rows are printed
            when the file holds fewer.
    """
    with h5py.File(file_path, 'r') as f:
        # len(f) is the number of top-level members in the file, not the row count.
        print(f"size: {len(f)}")
        # Display the first few entries in the embeddings dataset
        embeddings = f['embeddings'][:num_entries]
        names = [name.decode('utf8') for name in f['names'][:num_entries]]
        turn_numbers = f['turn_numbers'][:num_entries]
        file_paths = [path.decode('utf8') for path in f['file_paths'][:num_entries]]
        model_names = [model.decode('utf8') for model in f['model_names'][:num_entries]]

        print("First few entries in the HDF5 file:")
        # zip stops at the shortest column, so a file with fewer than
        # num_entries rows no longer raises IndexError.
        rows = zip(names, turn_numbers, file_paths, model_names, embeddings)
        for entry_number, (name, turn_number, path, model, embedding) in enumerate(rows, start=1):
            print(f"Entry {entry_number}:")
            print(f"  Name: {name}")
            print(f"  Turn Number: {turn_number}")
            print(f"  File Path: {path}")
            print(f"  Model Name: {model}")
            print(f"  Embedding: {embedding}")
            print(f"  Embedding Size: {len(embedding)}")
            print()

if __name__ == "__main__":
    file_path = 'utterance_embeddings.h5'  # Update with your actual file path
    view_h5_file(file_path)


size: 5
First few entries in the HDF5 file:
Entry 1:
  Name: MARISHA
  Turn Number: 496
  File Path: /workspace/slice-monorepo/cl_cr3/aligneddata/c=3/C1E003_3_0.json
  Model Name: text-embedding-3-small
  Embedding: [-0.00624625 -0.01879092 -0.01577863 ... -0.02597606 -0.02259865
  0.01091464]
  Embedding Size: 1536

Entry 2:
  Name: LAURA
  Turn Number: 497
  File Path: /workspace/slice-monorepo/cl_cr3/aligneddata/c=3/C1E003_3_0.json
  Model Name: text-embedding-3-small
  Embedding: [-1.59449628e-05 -4.13936675e-02 -2.04442795e-02 ... -1.60119496e-02
 -1.74262542e-02  1.21604940e-02]
  Embedding Size: 1536

Entry 3:
  Name: SAM
  Turn Number: 498
  File Path: /workspace/slice-monorepo/cl_cr3/aligneddata/c=3/C1E003_3_0.json
  Model Name: text-embedding-3-small
  Embedding: [ 0.00522397  0.00704117 -0.0155793  ... -0.01513947  0.01615803
 -0.00351672]
  Embedding Size: 1536

Entry 4:
  Name: LAURA
  Turn Number: 499
  File Path: /workspace/slice-monorepo/cl_cr3/aligneddata/c=3/C1E003_3_

In [5]:
!pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m141.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m505.5/505.5 kB[0m [31m148.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.4/345.4 kB[0m [31m120.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytz, tzdata, pandas
Successfully inst

In [6]:
import h5py
import numpy as np
import umap
import matplotlib.pyplot as plt
import pandas as pd

# Function to load embeddings and metadata from the HDF5 file
def load_embeddings(file_path):
    """Read all embeddings and metadata columns from an HDF5 file.

    Args:
        file_path: Path of the HDF5 file to open read-only.

    Returns:
        ``(embeddings, metadata)`` where ``embeddings`` is the full
        ``embeddings`` dataset and ``metadata`` is a dict with the keys
        ``names``, ``turn_numbers``, ``file_paths`` and ``model_names``.
        The three text columns are decoded from UTF-8 into lists of ``str``;
        ``turn_numbers`` is returned as read.
    """
    with h5py.File(file_path, 'r') as h5:
        vectors = h5['embeddings'][:]
        raw_names = h5['names'][:]
        turn_numbers = h5['turn_numbers'][:]
        raw_paths = h5['file_paths'][:]
        raw_models = h5['model_names'][:]

    def decode_all(values):
        # Stored values are byte strings; turn them into text.
        return [value.decode('utf8') for value in values]

    metadata = dict(
        names=decode_all(raw_names),
        turn_numbers=turn_numbers,
        file_paths=decode_all(raw_paths),
        model_names=decode_all(raw_models),
    )
    return vectors, metadata

# Function to compute and print standard statistics on the embeddings
def compute_statistics(embeddings):
    """Print ``DataFrame.describe()`` for the embeddings, one column per dimension.

    Args:
        embeddings: 2-D array-like accepted by ``pandas.DataFrame``.
    """
    summary = pd.DataFrame(embeddings).describe()
    print("Embedding Statistics:", summary, sep="\n")

def _category_codes(labels, palette_size=10):
    """Map each label to a reproducible colour index in ``range(palette_size)``."""
    index_of = {label: position for position, label in enumerate(sorted(set(labels)))}
    return [index_of[label] % palette_size for label in labels]


def _plot_projection(points, colors, cmap, colorbar_label, title):
    """Draw one 2-D scatter plot of ``points`` coloured by ``colors`` and show it."""
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(points[:, 0], points[:, 1], c=colors, cmap=cmap, s=5)
    plt.colorbar(scatter, label=colorbar_label)
    plt.title(title)
    plt.xlabel('UMAP 1')
    plt.ylabel('UMAP 2')
    plt.show()


# Function to visualize embeddings using UMAP
def visualize_embeddings(embeddings, metadata):
    """Project the embeddings to 2-D with UMAP and show three scatter plots.

    The same projection is coloured by turn number, by speaker name and by
    source file path.

    Args:
        embeddings: 2-D array of embedding vectors.
        metadata: Dict with the keys ``turn_numbers``, ``names`` and
            ``file_paths``, each parallel to ``embeddings``.
    """
    # Memory-efficient UMAP
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', low_memory=True)
    umap_embeddings = reducer.fit_transform(embeddings)

    _plot_projection(
        umap_embeddings,
        metadata['turn_numbers'],
        'Spectral',
        'Turn Numbers',
        'UMAP projection of the embeddings',
    )

    # Categories get colour indices from their sorted position rather than
    # from hash(): string hashes are randomised per interpreter run, which
    # made the colours change from one execution to the next.
    _plot_projection(
        umap_embeddings,
        _category_codes(metadata['names']),
        'tab10',
        'Names',
        'UMAP projection of the embeddings by Names',
    )

    _plot_projection(
        umap_embeddings,
        _category_codes(metadata['file_paths']),
        'tab10',
        'File Paths',
        'UMAP projection of the embeddings by File Paths',
    )

# Main function
def main(embeddings_path):
    """Load stored embeddings, print their statistics, then plot them.

    Args:
        embeddings_path: Path of the HDF5 file containing the embeddings.
    """
    loaded = load_embeddings(embeddings_path)
    vectors, info = loaded

    # Summary statistics first, UMAP plots second.
    compute_statistics(vectors)
    visualize_embeddings(vectors, info)

if __name__ == "__main__":
    # Path to the HDF5 file containing embeddings
    embeddings_path = 'utterance_embeddings.h5'
    main(embeddings_path)


ModuleNotFoundError: No module named 'cuml'