In [None]:
pip install openai sentence-transformers scikit-learn numpy matplotlib hdbscan umap-learn tensorflow

In [5]:
##############################

##     Repeated BATCH     ##

##############################

import json
import requests

API_KEY = 'sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP'

def upload_file(file_path, purpose):
    url = "https://api.openai.com/v1/files"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
    }
    files = {
        'file': open(file_path, 'rb'),
        'purpose': (None, purpose),
    }
    response = requests.post(url, headers=headers, files=files)
    return response.json()

def create_batch(input_file_id, endpoint, completion_window):
    url = "https://api.openai.com/v1/batches"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "input_file_id": input_file_id,
        "endpoint": endpoint,
        "completion_window": completion_window,
    }
    response = requests.post(url, headers=headers, json=data)
    return response.json()

def create_jsonl_file(prompt, n, file_path):
    data = [{
        "custom_id": f"request-{i+1}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-3.5-turbo",
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": 1000
        }
    } for i in range(n)]
    
    with open(file_path, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

def main():
    prompt = "I want a script that generates an analysis of some text data. Pull the text data from any source of text it doesn't matter what. Get the text data from a source you know of."
    n = 100  # number of jobs
    file_path = "batch_test.jsonl"
    endpoint = "/v1/chat/completions"
    completion_window = "24h"

    # Step 1: Create the file
    create_jsonl_file(prompt, n, file_path)
    
    # Step 2: Upload the file
    upload_response = upload_file(file_path, "batch")
    input_file_id = upload_response['id']
    print(f"File uploaded. File ID: {input_file_id}")

    # Step 3: Create the batch job
    batch_response = create_batch(input_file_id, endpoint, completion_window)
    batch_id = batch_response['id']
    print(f"Batch job created. Batch ID: {batch_id}")

if __name__ == "__main__":
    main()


File uploaded. File ID: file-9jyr72r96jaxWQCY1TN3vGNU
Batch job created. Batch ID: batch_RreSlmceawjRlFaBLSP6Bv5K


In [25]:
##############################

##     HP Sweep BATCH     ##

##############################

import json
import requests
import itertools
import random

API_KEY = 'sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP'

def upload_file(file_path, purpose):
    url = "https://api.openai.com/v1/files"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
    }
    files = {
        'file': open(file_path, 'rb'),
        'purpose': (None, purpose),
    }
    response = requests.post(url, headers=headers, files=files)
    return response.json()

def create_batch(input_file_id, endpoint, completion_window):
    url = "https://api.openai.com/v1/batches"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "input_file_id": input_file_id,
        "endpoint": endpoint,
        "completion_window": completion_window,
    }
    response = requests.post(url, headers=headers, json=data)
    return response.json()

def create_jsonl_file(prompts, parameter_combinations, file_path):
    data = []
    for i, (prompt, params) in enumerate(zip(prompts, parameter_combinations)):
        request_data = {
            "custom_id": f"request-{i+1}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": params[0],
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                "max_tokens": params[1],
                "temperature": params[2],
                "top_p": params[3],
                "frequency_penalty": params[4],
                "presence_penalty": params[5],
                "stop": params[6],
                "user": params[7],
                "logprobs": True,
                "top_logprobs": 5
            }
        }
        data.append(request_data)

    with open(file_path, 'w') as f:
        for item in data:
            f.write(json.dumps(item) + '\n')

def generate_parameter_combinations(models, max_tokens_list, temperatures, top_ps, frequency_penalties, presence_penalties, stops, users):
    combinations = list(itertools.product(models, max_tokens_list, temperatures, top_ps, frequency_penalties, presence_penalties, stops, users))
    random.shuffle(combinations)  # Randomly shuffle to ensure random selection
    return combinations

def main():
    prompt = "I want a script that generates an analysis of some text data. Pull the text data from any source of text it doesn't matter what. Get the text data from a source you know of."
    total_combinations = 2000  # Total number of unique parameter combinations to generate
    n = 200  # number of jobs to use from the generated combinations
    file_path = "200_batch_test.jsonl"
    endpoint = "/v1/chat/completions"
    completion_window = "24h"

    # Define the parameter ranges for the sweep
    models = ["gpt-3.5-turbo"]
    max_tokens_list = [200, 300, 500, 700]
    temperatures = [0.5, 0.7, 0.9, 1.0]
    top_ps = [0.8, 0.9, 1.0]
    frequency_penalties = [0, 0.5, 1.0]
    presence_penalties = [0, 0.5, 1.0]
    stops = [["\n"], [".", "?", "!"], None]
    users = ["user_charles"]

    # Generate a large pool of parameter combinations
    parameter_combinations = generate_parameter_combinations(models, max_tokens_list, temperatures, top_ps, frequency_penalties, presence_penalties, stops, users)

    # Select `n` random combinations from the pool
    selected_combinations = random.sample(parameter_combinations, n)
    prompts = [prompt] * len(selected_combinations)

    # Step 1: Create the file with the selected parameter combinations
    create_jsonl_file(prompts, selected_combinations, file_path)
    
    # Step 2: Upload the file
    upload_response = upload_file(file_path, "batch")
    input_file_id = upload_response['id']
    print(f"File uploaded. File ID: {input_file_id}")

    # Step 3: Create the batch job
    batch_response = create_batch(input_file_id, endpoint, completion_window)
    batch_id = batch_response['id']
    print(f"Batch job created. Batch ID: {batch_id}")

if __name__ == "__main__":
    main()


File uploaded. File ID: file-Px7HiajPuZtpdeFUXCpDcYaL
Batch job created. Batch ID: batch_TYryq4oxmpN6MxoOtMQZRVLO


In [41]:
##############################

##     Status All Batch    ##

##############################

import requests
import json
from datetime import datetime

API_KEY = 'sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP'

def list_batches(limit=20, after=None):
    url = "https://api.openai.com/v1/batches"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    params = {
        "limit": limit
    }
    if after:
        params["after"] = after

    response = requests.get(url, headers=headers, params=params)
    return response.json()

def get_status_label(status):
    if status == "completed":
        return "Finished"
    elif status == "failed":
        return "Fail"
    elif status == "cancelling":
        return "canceling"
    elif status == "cancelled":
        return "canceled"
    else:
        return "In Progress"

def main():
    limit = 100  # Adjust the limit as needed
    list_response = list_batches(limit=limit)

    print("Batch statuses:")
    for batch in list_response['data']:
        batch_id = batch['id']
        created_at = datetime.utcfromtimestamp(batch['created_at']).strftime('%Y-%m-%d %H:%M:%S')
        status_label = get_status_label(batch['status'])
        print(f"{batch_id}:{created_at}::{status_label}")

if __name__ == "__main__":
    main()


Batch statuses:
batch_TYryq4oxmpN6MxoOtMQZRVLO:2024-05-21 17:43:28::In Progress
batch_QsQq1inCRe7658QDxPfPop1W:2024-05-21 17:32:09::canceled
batch_RreSlmceawjRlFaBLSP6Bv5K:2024-05-21 16:56:52::canceling
batch_RcRye5IdrkCDNwNiNZO4FWa1:2024-05-21 16:54:17::Fail
batch_GBx1zUqapiftq7vgL1VTltPH:2024-05-21 16:52:03::Fail


In [39]:
##############################

##     Status 1 BATCH     ##

##############################

import requests
import json

API_KEY = 'sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP'

def retrieve_batch(batch_id):
    url = f"https://api.openai.com/v1/batches/{batch_id}"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    response = requests.get(url, headers=headers)
    return response.json()

def main():
    batch_id = "batch_RreSlmceawjRlFaBLSP6Bv5K"

    # Retrieve the job status and info
    batch_status = retrieve_batch(batch_id)
    print("Batch status and information:")
    print(json.dumps(batch_status, indent=2))

if __name__ == "__main__":
    main()


Batch status and information:
{
  "id": "batch_RreSlmceawjRlFaBLSP6Bv5K",
  "object": "batch",
  "endpoint": "/v1/chat/completions",
  "errors": null,
  "input_file_id": "file-9jyr72r96jaxWQCY1TN3vGNU",
  "completion_window": "24h",
  "status": "in_progress",
  "output_file_id": null,
  "error_file_id": null,
  "created_at": 1716310612,
  "in_progress_at": 1716310613,
  "expires_at": 1716397012,
  "finalizing_at": null,
  "completed_at": null,
  "failed_at": null,
  "expired_at": null,
  "cancelling_at": null,
  "cancelled_at": null,
  "request_counts": {
    "total": 10,
    "completed": 0,
    "failed": 0
  },
  "metadata": null
}


In [40]:
##############################

##     Cancel BATCH     ##

##############################

import requests

API_KEY = 'sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP'

def cancel_batch(batch_id):
    url = f"https://api.openai.com/v1/batches/{batch_id}/cancel"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    response = requests.post(url, headers=headers)
    return response.json()

def main():
    batch_id = "batch_RreSlmceawjRlFaBLSP6Bv5K"  # replace with the actual batch ID

    # Cancel the batch
    cancel_response = cancel_batch(batch_id)
    print("Cancel batch response:")
    print(cancel_response)

if __name__ == "__main__":
    main()


Cancel batch response:
{'id': 'batch_RreSlmceawjRlFaBLSP6Bv5K', 'object': 'batch', 'endpoint': '/v1/chat/completions', 'errors': None, 'input_file_id': 'file-9jyr72r96jaxWQCY1TN3vGNU', 'completion_window': '24h', 'status': 'cancelling', 'output_file_id': None, 'error_file_id': None, 'created_at': 1716310612, 'in_progress_at': 1716310613, 'expires_at': 1716397012, 'finalizing_at': None, 'completed_at': None, 'failed_at': None, 'expired_at': None, 'cancelling_at': 1716316970, 'cancelled_at': None, 'request_counts': {'total': 10, 'completed': 0, 'failed': 0}, 'metadata': None}


In [28]:
##############################

##     RETRIEVE BATCH     ##

##############################

import requests
import json

API_KEY = 'sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP'

def retrieve_batch(batch_id):
    url = f"https://api.openai.com/v1/batches/{batch_id}"
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    response = requests.get(url, headers=headers)
    return response.json()

def main():
    batch_id = "your_batch_id_here"  # replace with the actual batch ID

    # Step 4: Retrieve the job
    final_status = retrieve_batch(batch_id)
    print(json.dumps(final_status, indent=2))

if __name__ == "__main__":
    main()


{
  "error": {
    "message": "Invalid 'batch_id': 'your_batch_id_here'. Expected an ID that begins with 'batch'.",
    "type": "invalid_request_error",
    "param": "batch_id",
    "code": "invalid_value"
  }
}


In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct').to(device)

def single_layer_pool(hidden_states, layer_index):
    """
    Extract embeddings from a specific layer.
    """
    selected_layer = hidden_states[layer_index]
    return selected_layer.mean(dim=1)  # Average over the token dimension to get a single vector per sequence

def embed_text(input_texts, max_length=500, layer_index=-1, scaling_factor=1.0):
    try:
        # Tokenize the input texts with truncation and without padding
        batch_dict = tokenizer(input_texts, max_length=max_length, padding=False, truncation=True, return_tensors='pt').to(device)
        
        # Check the length of the tokenized input and ignore if it's less than 100
        token_length = batch_dict['input_ids'].shape[1]
        if token_length < 100:
            return None
        
        # Set output_hidden_states to True
        outputs = model(**batch_dict, output_hidden_states=True)
        
        # Use the new pooling method with specified layer
        embeddings = single_layer_pool(outputs.hidden_states, layer_index)
        
        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        
        # Scale embeddings
        embeddings = embeddings * scaling_factor
        
        # Convert the embeddings to lists
        embedding_lists = embeddings.cpu().tolist()
        
        return embedding_lists
    
    except Exception as e:
        print(f"An error occurred: {e}")
    
    finally:
        try:
            del batch_dict
            del outputs
            del embeddings
            torch.cuda.empty_cache()
        except:
            pass

max_input_text_len = 500  # Set a specific maximum length for truncation
min_input_text_len = 100  # Minimum length threshold
layer_index = -1  # Use the last layer by default
scaling_factor = 1.0  # Scaling factor to amplify the embedding values

file_path = './breakingchat.txt'
user_list, chatgpt_list = parse_conversations(file_path)

print("Analyzing User Texts")
embedding_user_list = []
for text in user_list:
    embeddings = embed_text(text, max_length=max_input_text_len, layer_index=layer_index, scaling_factor=scaling_factor)
    if embeddings is not None:
        embedding_user_list.append(embeddings)

print("Analyzing ChatGPT Texts")
embedding_chatgpt_list = []
for text in chatgpt_list:
    embeddings = embed_text(text, max_length=max_input_text_len, layer_index=layer_index, scaling_factor=scaling_factor)
    if embeddings is not None:
        embedding_chatgpt_list.append(embeddings)

# Convert lists of lists to numpy arrays and reshape to remove the extra dimension
embedding_user_array = np.squeeze(np.array(embedding_user_list), axis=1)
embedding_chatgpt_array = np.squeeze(np.array(embedding_chatgpt_list), axis=1)

# Calculate variances
user_variance = np.var(embedding_user_array, axis=0).mean()
chatgpt_variance = np.var(embedding_chatgpt_array, axis=0).mean()

print(f"Mean Variance in User Embeddings: {user_variance}")
print(f"Mean Variance in ChatGPT Embeddings: {chatgpt_variance}")

# Assume the following functions and imports are provided
import tensorflow as tf

def train_autoencoder(input_data, latent_dim, num_epochs):
    encoder_input = tf.keras.Input(shape=(input_data.shape[-1],))
    
    # Encoder configuration
    encoded = tf.keras.layers.Dense(1024, activation='relu')(encoder_input)
    encoded = tf.keras.layers.BatchNormalization()(encoded)
    encoded = tf.keras.layers.LeakyReLU()(encoded)
    encoded = tf.keras.layers.Dense(512, activation='relu')(encoded)
    encoded = tf.keras.layers.Dropout(0.3)(encoded)
    encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
    encoded = tf.keras.layers.BatchNormalization()(encoded)
    encoded = tf.keras.layers.LeakyReLU()(encoded)
    encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)

    encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

    # Decoder configuration
    decoder_input = tf.keras.Input(shape=(latent_dim,))
    decoded = tf.keras.layers.Dense(256, activation='relu')(decoder_input)
    decoded = tf.keras.layers.BatchNormalization()(decoded)
    decoded = tf.keras.layers.LeakyReLU()(decoded)
    decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
    decoded = tf.keras.layers.Dropout(0.3)(decoded)
    decoded = tf.keras.layers.Dense(1024, activation='relu')(decoded)
    decoded = tf.keras.layers.BatchNormalization()(decoded)
    decoded = tf.keras.layers.LeakyReLU()(decoded)
    decoded_output = tf.keras.layers.Dense(input_data.shape[-1], activation='sigmoid')(decoded)

    decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

    # Autoencoder connecting encoder and decoder
    autoencoder_input = encoder_input
    encoded_embedding = encoder(autoencoder_input)
    decoded_embedding = decoder(encoded_embedding)
    autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')

    # Move the model to GPU if available
    with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):
        # Train the autoencoder
        autoencoder.fit(input_data, input_data, epochs=num_epochs, batch_size=32, verbose=1)
    
    return encoder

def visualize_2d_grid(encoded_2d_grid, grid_size):
    # Applying Gaussian filter to smooth the grids
    smoothed_grids = np.array([gaussian_filter(grid, sigma=2) for grid in encoded_2d_grid])

    # Setting up the figure and 3D axis
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    
    # Creating a meshgrid for the x and y coordinates
    X, Y = np.meshgrid(range(grid_size), range(grid_size))
    
    # Plotting each grid with an increasing offset in Z
    offset = 0.0
    for i in range(smoothed_grids.shape[0]):
        Z = smoothed_grids[i] + i * offset  # Offset each grid
        ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

    # Setting labels and title
    ax.set_title('3D Visualization of Smoothed Text Embeddings')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Embedding Value')

    # Display the plot
    plt.show()

# Free up GPU memory by deleting the model
del model
torch.cuda.empty_cache()

# Train the autoencoder and visualize embeddings
grid_size = 50
latent_dim = grid_size * grid_size
num_epochs = 500

combined_input = embedding_chatgpt_array
encoder = train_autoencoder(combined_input, latent_dim, num_epochs)

# Predict using the trained encoder
encoded_2d_grid = encoder.predict(combined_input).reshape(-1, grid_size, grid_size)
print("Variance in Encoded Outputs for ChatGPT Data:", np.var(encoded_2d_grid, axis=0).mean())
visualize_2d_grid(encoded_2d_grid, grid_size)

# Now predict using user data
combined_input = embedding_user_array
encoded_2d_grid = encoder.predict(combined_input).reshape(-1, grid_size, grid_size)
print("Variance in Encoded Outputs for User Data:", np.var(encoded_2d_grid, axis=0).mean())
visualize_2d_grid(encoded_2d_grid, grid_size)


In [None]:
embedding_user_list = []
embedding_chatgpt_list = []
max_input_text_len = 1000
for text in user_list:
    embedding_user_list.append(embed_text(text[0:max_input_text_len]))
for text in chatgpt_list:
    embedding_chatgpt_list.append(embed_text(text[0:max_input_text_len]))

In [None]:
from sentence_transformers import SentenceTransformer

def get_embeddings(texts):
    model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings
# Note: You might need to limit the number of texts processed at once if the list is very large
embedding_user_list = get_embeddings(user_list[:max_input_text_len])
embedding_chatgpt_list = get_embeddings(chatgpt_list[:max_input_text_len])
import numpy as np

# Calculate variance for each set of embeddin gs
user_variance = np.var(embedding_user_list, axis=0).mean()
chatgpt_variance = np.var(embedding_chatgpt_list, axis=0).mean()

print(f"Mean Variance in User Embeddings: {user_variance}")
print(f"Mean Variance in ChatGPT Embeddings: {chatgpt_variance}")

from sentence_transformers import SentenceTransformer

def get_embeddings(texts):
    model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    embeddings = model.encode(texts, show_progress_bar=True)
    return embeddings

# Assuming `user_list` and `chatgpt_list` contain your texts
embedding_user_array = get_embeddings(user_list[:max_input_text_len])
embedding_chatgpt_array = get_embeddings(chatgpt_list[:max_input_text_len])

# Ensure both arrays are of the same shape
if embedding_user_array.shape[1] != embedding_chatgpt_array.shape[1]:
    raise ValueError("Embedding dimensions do not match and cannot be concatenated.")

# Here we use only ChatGPT array for training as an example
combined_input = embedding_chatgpt_array

grid_size = 30
latent_dim = grid_size * grid_size

# Train the autoencoder
encoder = train_autoencoder(combined_input, latent_dim, 30)

# Predict using the trained encoder
encoded_2d_grid = encoder.predict(combined_input).reshape(-1, grid_size, grid_size)
print("Variance in Encoded Outputs for ChatGPT Data:", np.var(encoded_2d_grid, axis=0).mean())
visualize_2d_grid(encoded_2d_grid, grid_size)

# Now predict using user data
combined_input = embedding_user_array
encoded_2d_grid = encoder.predict(combined_input).reshape(-1, grid_size, grid_size)
print("Variance in Encoded Outputs for User Data:", np.var(encoded_2d_grid, axis=0).mean())
visualize_2d_grid(encoded_2d_grid, grid_size)



In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def single_layer_pool(hidden_states, layer_index):
    """
    Extract embeddings from a specific layer.
    """
    selected_layer = hidden_states[layer_index]
    return selected_layer.mean(dim=1)  # Average over the token dimension to get a single vector per sequence

def embed_text(input_texts, max_length=500, layer_index=-1, scaling_factor=1.0):
    try:
        # Tokenize the input texts with truncation and without padding
        batch_dict = tokenizer(input_texts, max_length=max_length, padding=False, truncation=True, return_tensors='pt').to(device)
        
        # Check the length of the tokenized input and ignore if it's less than 100
        token_length = batch_dict['input_ids'].shape[1]
        if token_length < 100:
            return None
        
        # Set output_hidden_states to True
        outputs = model(**batch_dict, output_hidden_states=True)
        
        # Use the new pooling method with specified layer
        embeddings = single_layer_pool(outputs.hidden_states, layer_index)
        
        # Normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)
        
        # Scale embeddings
        embeddings = embeddings * scaling_factor
        
        # Convert the embeddings to lists
        embedding_lists = embeddings.cpu().tolist()
        
        return embedding_lists
    
    except Exception as e:
        print(f"An error occurred: {e}")
    
    finally:
        try:
            del batch_dict
            del outputs
            del embeddings
            torch.cuda.empty_cache()
        except:
            pass

max_input_text_len = 500  # Set a specific maximum length for truncation
min_input_text_len = 100  # Minimum length threshold
layer_index = -1  # Use the last layer by default
scaling_factor = 1000.0  # Scaling factor to amplify the embedding values

print("Analyzing User Texts")
embedding_user_list = []
for text in user_list:
    embeddings = embed_text(text, max_length=max_input_text_len, layer_index=layer_index, scaling_factor=scaling_factor)
    if embeddings is not None:
        embedding_user_list.append(embeddings)

print("Analyzing ChatGPT Texts")
embedding_chatgpt_list = []
for text in chatgpt_list:
    embeddings = embed_text(text, max_length=max_input_text_len, layer_index=layer_index, scaling_factor=scaling_factor)
    if embeddings is not None:
        embedding_chatgpt_list.append(embeddings)

# Convert lists of lists to numpy arrays
embedding_user_array = np.array(embedding_user_list)
embedding_chatgpt_array = np.array(embedding_chatgpt_list)

# Calculate variances
user_variance = np.var(embedding_user_array, axis=0).mean()
chatgpt_variance = np.var(embedding_chatgpt_array, axis=0).mean()

print(f"Mean Variance in User Embeddings: {user_variance}")
print(f"Mean Variance in ChatGPT Embeddings: {chatgpt_variance}")


In [None]:
import openai
import numpy as np
from sentence_transformers import SentenceTransformer
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter

# Initialize OpenAI client
client = openai.OpenAI(api_key="sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP")
MODEL = "gpt-4o"

# Function to get responses from GPT-4o
def get_responses(prompt, n, max_tokens=1000, temperature=0.7, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0):
    responses = []
    log_probs = []
    for _ in range(n):
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            logprobs=True
        )
        responses.append(response.choices[0].message.content)
        log_probs.append(response.choices[0].logprobs)
    return responses, log_probs

# Function to get embeddings for the responses
def get_embeddings(responses):
    model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    embeddings = model.encode(responses)
    return embeddings

# Function to convert log probabilities to probabilities using softmax
def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()

def analyze_log_probs(log_probs):
    probability_map = []
    for choice in log_probs:
        # Access the 'content' attribute which is the list of 'ChatCompletionTokenLogprob'
        tokens = [token_logprob.token for token_logprob in choice.content]
        token_logprobs = [token_logprob.logprob for token_logprob in choice.content]

        # Convert log probabilities to probabilities using softmax
        token_probs = softmax(token_logprobs)

        probability_map.append({
            'tokens': tokens,
            'token_probs': token_probs,
            'token_logprobs': token_logprobs  # Store raw log probabilities for plotting
        })
    
    return probability_map

# Function to train an autoencoder
def train_autoencoder(input_data, latent_dim):
    encoder_input = tf.keras.Input(shape=(input_data.shape[-1],))
    encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
    encoded = tf.keras.layers.Dropout(0.5)(encoded)
    encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
    encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
    encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)

    encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

    decoder_input = tf.keras.Input(shape=(latent_dim,))
    decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
    decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
    decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
    decoded_output = tf.keras.layers.Dense(input_data.shape[-1], activation='sigmoid')(decoded)

    decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

    autoencoder_input = encoder_input
    encoded_embedding = encoder(autoencoder_input)
    decoded_embedding = decoder(encoded_embedding)

    autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')

    autoencoder.fit(input_data, input_data, epochs=10, batch_size=16, verbose=1)
    return encoder

# Function to visualize 2D grid in 3D with smoothing
def visualize_2d_grid(encoded_2d_grid, grid_size):
    smoothed_grids = np.array([gaussian_filter(grid, sigma=1) for grid in encoded_2d_grid])

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    X, Y = np.meshgrid(range(grid_size), range(grid_size))
    for i in range(smoothed_grids.shape[0]):
        Z = smoothed_grids[i]
        ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

    ax.set_title('3D Visualization of Smoothed Text Embeddings')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Embedding Value')
    plt.show()

def extract_token_probs(data):
    # Initialize an empty list to store the token probabilities
    token_probs_list = []
    
    # Iterate over each entry in the data list
    for entry in data:
        # Check if 'token_probs' key exists in the dictionary
        if 'token_probs' in entry:
            # Append the numpy array of token probabilities to the list
            token_probs_list.append(entry['token_probs'])
    
    # Return the list of token probabilities
    return token_probs_list
    
def pad_token_probs(probabilities, pad_value=0):
    # Find the maximum length of the token probabilities list
    max_length = max(len(probs) for probs in probabilities)
    # Pad each probabilities list to the maximum length
    padded_probabilities = np.array([np.pad(probs, (0, max_length - len(probs)), 'constant', constant_values=pad_value) for probs in probabilities])
    return padded_probabilities


prompt = "I want a script that generates an analysis of some text data. Pull the text data from any source of text it doesn't matter what."
n = 10
responses, log_probs = get_responses(prompt, n)
embeddings = get_embeddings(responses)

probability_map = analyze_log_probs(log_probs)
token_probs_list = extract_token_probs(probability_map)

token_probabilities = pad_token_probs([entry['token_probs'] for entry in probability_map])

# Combine embeddings and token probabilities
combined_input = np.hstack((embeddings, token_probabilities))

grid_size = 30
latent_dim = grid_size * grid_size
encoder = train_autoencoder(combined_input, latent_dim)

encoded_2d_grid = encoder(combined_input).numpy().reshape(-1, grid_size, grid_size)
visualize_2d_grid(encoded_2d_grid, grid_size)


In [None]:
# Function to pad the token probabilities to the same length
def pad_token_probs(probabilities, pad_value=0):
    # Find the maximum length of the token probabilities list
    max_length = max(len(probs) for probs in probabilities)
    # Pad each probabilities list to the maximum length
    padded_probabilities = np.array([np.pad(probs, (0, max_length - len(probs)), 'constant', constant_values=pad_value) for probs in probabilities])
    return padded_probabilities

# Usage in your main workflow
token_probabilities = pad_token_probs([entry['token_probs'] for entry in probability_map])

combined_input = np.hstack((embeddings, token_probabilities))

In [None]:
def train_autoencoder(input_data, latent_dim):
    encoder_input = tf.keras.Input(shape=(input_data.shape[-1],))
    encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
    encoded = tf.keras.layers.Dropout(0.5)(encoded)
    encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
    encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
    encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)

    encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

    decoder_input = tf.keras.Input(shape=(latent_dim,))
    decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
    decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
    decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
    decoded_output = tf.keras.layers.Dense(input_data.shape[-1], activation='sigmoid')(decoded)

    decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

    autoencoder_input = encoder_input
    encoded_embedding = encoder(autoencoder_input)
    decoded_embedding = decoder(encoded_embedding)

    autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')

    autoencoder.fit(input_data, input_data, epochs=50, batch_size=16, verbose=1)
    return encoder
# Function to visualize 2D grid in 3D with smoothing
def visualize_2d_grid(encoded_2d_grid, grid_size):
    smoothed_grids = np.array([gaussian_filter(grid, sigma=5) for grid in encoded_2d_grid])

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    X, Y = np.meshgrid(range(grid_size), range(grid_size))
    for i in range(smoothed_grids.shape[0]):
        Z = smoothed_grids[i]
        ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

    ax.set_title('3D Visualization of Smoothed Text Embeddings')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Embedding Value')
    plt.show()
grid_size = 100
latent_dim = grid_size * grid_size
encoder = train_autoencoder(combined_input, latent_dim)
encoded_2d_grid = encoder(combined_input).numpy().reshape(-1, grid_size, grid_size)
visualize_2d_grid(encoded_2d_grid, grid_size)

In [None]:
def train_autoencoder(input_data, latent_dim):
    encoder_input = tf.keras.Input(shape=(input_data.shape[-1],))
    encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
    encoded = tf.keras.layers.Dropout(0.5)(encoded)
    encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
    encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
    encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)

    encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

    decoder_input = tf.keras.Input(shape=(latent_dim,))
    decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
    decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
    decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
    decoded_output = tf.keras.layers.Dense(input_data.shape[-1], activation='sigmoid')(decoded)

    decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

    autoencoder_input = encoder_input
    encoded_embedding = encoder(autoencoder_input)
    decoded_embedding = decoder(encoded_embedding)

    autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')

    autoencoder.fit(input_data, input_data, epochs=30, batch_size=16, verbose=1)
    return encoder
# Function to visualize 2D grid in 3D with smoothing
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter
from mpl_toolkits.mplot3d import Axes3D

def visualize_2d_grid(encoded_2d_grid, grid_size):
    # Applying Gaussian filter to smooth the grids
    smoothed_grids = np.array([gaussian_filter(grid, sigma=1) for grid in encoded_2d_grid])

    # Setting up the figure and 3D axis
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    
    # Creating a meshgrid for the x and y coordinates
    X, Y = np.meshgrid(range(grid_size), range(grid_size))
    
    # Plotting each grid with an increasing offset in Z
    offset = 0.5
    for i in range(smoothed_grids.shape[0]):
        Z = smoothed_grids[i] + i * offset  # Offset each grid
        ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

    # Setting labels and title
    ax.set_title('3D Visualization of Smoothed Text Embeddings')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Embedding Value')

    # Display the plot
    plt.show()

grid_size = 30
latent_dim = grid_size * grid_size
encoder = train_autoencoder(combined_input, latent_dim)
encoded_2d_grid = encoder(combined_input).numpy().reshape(-1, grid_size, grid_size)
visualize_2d_grid(encoded_2d_grid, grid_size)

In [None]:
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt').to(device)

# Set output_hidden_states to True
outputs = model(**batch_dict, output_hidden_states=True)
# Use the new pooling method
embeddings = multi_layer_pool(outputs.hidden_states, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)

# Convert the embeddings to lists
embedding_lists = embeddings.cpu().tolist()
for i, embedding in enumerate(embedding_lists):
    print(f'Embedding for input {i+1}: {embedding}')

# To print the similarity scores
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print("Similarity scores:", scores.tolist())


In [None]:
import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Check if GPU is available and set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = [
    get_detailed_instruct(task, 'how much protein should a female eat'),
    get_detailed_instruct(task, 'summit define')
]
# No need to add instruction for retrieval documents
documents = [
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
]
input_texts = queries + documents

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-mistral-7b-instruct')
model = AutoModel.from_pretrained('intfloat/e5-mistral-7b-instruct').to(device)


In [None]:
def multi_layer_pool(hidden_states: list, attention_mask: Tensor, num_layers: int = 1) -> Tensor:
    """
    Extract embeddings from the last 'num_layers' layers and concatenate them.
    """
    # Get the last num_layers layers
    all_layers = hidden_states[-num_layers:]  
    # Concatenate them on the embedding dimension
    concatenated_layers = torch.cat(all_layers, dim=-1)  

    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return concatenated_layers[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = concatenated_layers.shape[0]
        return concatenated_layers[torch.arange(batch_size, device=concatenated_layers.device), sequence_lengths]

max_length = 4096
# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors='pt').to(device)

# Set output_hidden_states to True
outputs = model(**batch_dict, output_hidden_states=True)
# Use the new pooling method
embeddings = multi_layer_pool(outputs.hidden_states, batch_dict['attention_mask'])

# normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)

# Convert the embeddings to lists
embedding_lists = embeddings.cpu().tolist()

print(len(embedding_lists[0]))
print(embedding_lists[0][0:100])

# To print the similarity scores
scores = (embeddings[:2] @ embeddings[2:].T) * 100
print("Similarity scores:", scores.tolist())

In [None]:

import openai
client = openai.OpenAI(api_key="sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP")
MODEL = "gpt-4o"

def get_full_response(prompt, n=1, max_tokens=1000, temperature=0.7, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0):
    for _ in range(n):
        response = client.chat.completions.create(
            model="gpt-4-turbo",  # Replace with your model ID as needed
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            logprobs=True,  # Enable logprobs
            top_logprobs=10  # Specify number of top log probabilities to return
        )

        # Print the entire response object
        return response

# Example usage
response = get_full_response("Tell me a joke", n=1)

import re

def extract_response_info(response):
    # Extract the conversation response
    response_match = re.search(r'content="(.*?)"', str(response), re.DOTALL)
    response_text = response_match.group(1) if response_match else ""

    # Extract the logprobs of the response
    logprobs_match = re.findall(r'logprob=(-?\d+\.\d+)', str(response))
    logprobs = [float(logprob) for logprob in logprobs_match]

    return response_text, logprobs
    
reply, logprobs= extract_response_info(str(response))

In [None]:
def extract_chat_completion_data(response):
    # Data structure to hold the results
    data = {
        "Response Content": "",
        "Logprobs": [],
        "Top Logprob Words": [],
        "Top Logprob Values": []
    }
    
    # Assume the first choice for simplification; adapt as needed for multiple choices
    if response.choices:
        choice = response.choices[0]
        data["Response Content"] = choice.message.content
        
        # Extract token logprob information
        for token_logprob in choice.logprobs.content:
            # Append the logprob of the current token to the list
            data["Logprobs"].append(token_logprob.logprob)
            
            # For collecting top logprob words and their values
            top_words = []
            top_values = []
            
            # Extract top logprob details
            for top_logprob in token_logprob.top_logprobs:
                top_words.append(top_logprob.token)
                top_values.append(top_logprob.logprob)
            
            # Append each token's top logprob words and values
            data["Top Logprob Words"].append(top_words)
            data["Top Logprob Values"].append(top_values)

    return data

# Example usage:
# Assuming you have a response object from the OpenAI API
# print(extract_chat_completion_data(response))

# Example usage:
# Assuming you have a response object from the OpenAI API
print(extract_chat_completion_data(response))


In [None]:
print(type(response))
print(response)


In [None]:
def parse_conversations(file_path):
    with open(file_path, 'r') as file:
        content = file.read()

    conversations = content.split('\n\n')
    user_messages = []
    chatgpt_messages = []

    current_label = None
    current_message = []

    for conversation in conversations:
        lines = conversation.strip().split('\n')
        if len(lines) >= 1:
            if lines[0] == '#USER':
                if current_label == '##ChatGPT':
                    chatgpt_messages.append(' '.join(current_message))
                current_label = '#USER'
                current_message = lines[1:]
            elif lines[0] == '##ChatGPT':
                if current_label == '#USER':
                    user_messages.append(' '.join(current_message))
                current_label = '##ChatGPT'
                current_message = lines[1:]
            else:
                current_message.extend(lines)

    if current_label == '#USER':
        user_messages.append(' '.join(current_message))
    elif current_label == '##ChatGPT':
        chatgpt_messages.append(' '.join(current_message))

    return user_messages, chatgpt_messages

# Example usage
file_path = './breakingchat.txt'
user_list, chatgpt_list = parse_conversations(file_path)

"""
print("User Messages:")
for message in user_list:
    print(message)
    print()

print("ChatGPT Messages:")
for message in chatgpt_list:
    print(message)
    print()
"""

In [None]:
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################
####################################################################################################################################


In [None]:
import openai
import numpy as np
from sentence_transformers import SentenceTransformer
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter

# Initialize OpenAI client
client = openai.OpenAI(api_key="sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP")
MODEL = "gpt-4o"

# Function to get responses from GPT-4o
def get_responses(prompt, n, max_tokens=100, temperature=0.7, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0):
    responses = []
    log_probs = []
    for _ in range(n):
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            logprobs=True
        )
        responses.append(response.choices[0].message.content)
        log_probs.append(response.choices[0].logprobs)
    return responses, log_probs

# Function to get embeddings for the responses
def get_embeddings(responses):
    model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    embeddings = model.encode(responses)
    return embeddings

# Function to convert log probabilities to probabilities using softmax
def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()

def analyze_log_probs(log_probs):
    probability_map = []
    for choice in log_probs:
        tokens = [token_logprob.token for token_logprob in choice.content]
        token_logprobs = [token_logprob.logprob for token_logprob in choice.content]
        token_probs = softmax(token_logprobs)
        probability_map.append({
            'tokens': tokens,
            'token_probs': token_probs,
            'token_logprobs': token_logprobs
        })
    return probability_map

# Function to train an autoencoder
def train_autoencoder(input_data, latent_dim):
    encoder_input = tf.keras.Input(shape=(input_data.shape[-1],))
    encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
    encoded = tf.keras.layers.Dropout(0.5)(encoded)
    encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
    encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
    encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)
    encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')
    decoder_input = tf.keras.Input(shape=(latent_dim,))
    decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
    decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
    decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
    decoded_output = tf.keras.layers.Dense(input_data.shape[-1], activation='sigmoid')(decoded)
    decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')
    autoencoder_input = encoder_input
    encoded_embedding = encoder(autoencoder_input)
    decoded_embedding = decoder(encoded_embedding)
    autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')
    autoencoder.fit(input_data, input_data, epochs=20, batch_size=8, verbose=1)
    return encoder

# Function to visualize 2D grid in 3D with smoothing
def visualize_2d_grid(encoded_2d_grid, grid_size):
    smoothed_grids = np.array([gaussian_filter(grid, sigma=1) for grid in encoded_2d_grid])
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    X, Y = np.meshgrid(range(grid_size), range(grid_size))
    for i in range(smoothed_grids.shape[0]):
        Z = smoothed_grids[i]
        ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)
    ax.set_title('3D Visualization of Smoothed Text Embeddings')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Embedding Value')
    plt.show()

# Main function to orchestrate the workflow
def main():
    prompt = "make me a short unique poem about aliens fighting on venus over earths iranian goat population gaining sentience"
    n = 20
    responses, log_probs = get_responses(prompt, n)
    embeddings = get_embeddings(responses)
    probability_map = analyze_log_probs(log_probs)
    token_probs_list = np.array([entry['token_probs'] for entry in probability_map])
    combined_input = np.hstack((embeddings, token_probs_list))
    grid_size = 30
    latent_dim = grid_size * grid_size
    encoder = train_autoencoder(combined_input, latent_dim)
    encoded_2d_grid = encoder(combined_input).numpy().reshape(-1, grid_size, grid_size)
    visualize_2d_grid(encoded_2d_grid, grid_size)

if __name__ == "__main__":
    main()


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Prepare the Text
sample_text = "Natural language processing (NLP) involves the interaction between computers and humans through language. It enables machines to read, understand, and derive meaning from human languages."

# Step 2: Vectorize the Text using Universal Sentence Encoder from TensorFlow Hub
# Load Universal Sentence Encoder
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype=tf.string, trainable=False)

# Convert the sample text into an embedding
text_embedding = embedding_layer([sample_text])
print("Original Embedding Shape:", text_embedding.shape)

# Step 3: Build and Train an Autoencoder Model to reduce dimensionality to a 2D array
# Define the encoder model
encoder_input = tf.keras.Input(shape=(text_embedding.shape[-1],))
encoded = tf.keras.layers.Dense(128, activation='relu')(encoder_input)
encoded = tf.keras.layers.Dense(64, activation='relu')(encoded)
encoded = tf.keras.layers.Dense(32, activation='relu')(encoded)
encoded_output = tf.keras.layers.Dense(2 * 2, activation='linear')(encoded)  # 2x2 array output

encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

# Define the decoder model
decoder_input = tf.keras.Input(shape=(2 * 2,))
decoded = tf.keras.layers.Dense(32, activation='relu')(decoder_input)
decoded = tf.keras.layers.Dense(64, activation='relu')(decoded)
decoded = tf.keras.layers.Dense(128, activation='relu')(decoded)
decoded_output = tf.keras.layers.Dense(text_embedding.shape[-1], activation='sigmoid')(decoded)

decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

# Combine encoder and decoder into an autoencoder model
autoencoder_input = encoder_input
encoded_embedding = encoder(autoencoder_input)
decoded_embedding = decoder(encoded_embedding)

autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder
# Here, we use the text embedding itself as the target (autoencoder learns to reconstruct its input)
autoencoder.fit(text_embedding, text_embedding, epochs=100, batch_size=1, verbose=0)

# Predict the 2D coordinates
encoded_2d_array = encoder(text_embedding)
print("2D Array for the Input Text:", encoded_2d_array.numpy().reshape(2, 2))

# Step 4: Visualize the 2D Array Result
encoded_2d_array_reshaped = encoded_2d_array.numpy().reshape(2, 2)

plt.figure(figsize=(6, 6))
plt.imshow(encoded_2d_array_reshaped, cmap='viridis', interpolation='nearest')
plt.colorbar()
plt.title('2D Array Embedding of the Text')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.grid(True)
plt.show()


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import fetch_20newsgroups

# Step 1: Prepare Multiple Text Samples
newsgroups = fetch_20newsgroups(subset='train')
texts = newsgroups.data[:1000]  # Limit to 1000 samples for this example

# Step 2: Convert Text Samples into Embeddings using Universal Sentence Encoder
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype=tf.string, trainable=False)

text_embeddings = embedding_layer(texts)
print("Original Embedding Shape:", text_embeddings.shape)

# Step 3: Build and Train an Autoencoder to reduce dimensionality to a large 2D grid
grid_size = 30  # 30x30 grid
latent_dim = grid_size * grid_size

# Define the encoder model
encoder_input = tf.keras.Input(shape=(text_embeddings.shape[-1],))
encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)  # Larger 2D grid output

encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

# Define the decoder model
decoder_input = tf.keras.Input(shape=(latent_dim,))
decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
decoded_output = tf.keras.layers.Dense(text_embeddings.shape[-1], activation='sigmoid')(decoded)

decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

# Combine encoder and decoder into an autoencoder model
autoencoder_input = encoder_input
encoded_embedding = encoder(autoencoder_input)
decoded_embedding = decoder(encoded_embedding)

autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder
autoencoder.fit(text_embeddings, text_embeddings, epochs=50, batch_size=32, verbose=1)

# Step 4: Visualize the 2D Grid in 3D
encoded_2d_grid = encoder(text_embeddings).numpy().reshape(-1, grid_size, grid_size)

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

X, Y = np.meshgrid(range(grid_size), range(grid_size))
for i in range(encoded_2d_grid.shape[0]):
    Z = encoded_2d_grid[i]
    ax.plot_surface(X, Y, Z, cmap='viridis')

ax.set_title('3D Visualization of Text Embeddings')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_zlabel('Embedding Value')
plt.show()


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import fetch_20newsgroups
from scipy.ndimage import gaussian_filter

# Step 1: Prepare Multiple Text Samples
newsgroups = fetch_20newsgroups(subset='train')
texts = newsgroups.data[:1000]  # Limit to 1000 samples for this example

# Step 2: Convert Text Samples into Embeddings using Universal Sentence Encoder
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype=tf.string, trainable=False)

text_embeddings = embedding_layer(texts)
print("Original Embedding Shape:", text_embeddings.shape)

# Step 3: Build and Train an Autoencoder to reduce dimensionality to a large 2D grid
grid_size = 30  # 30x30 grid
latent_dim = grid_size * grid_size

# Define the encoder model
encoder_input = tf.keras.Input(shape=(text_embeddings.shape[-1],))
encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)  # Larger 2D grid output

encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

# Define the decoder model
decoder_input = tf.keras.Input(shape=(latent_dim,))
decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
decoded_output = tf.keras.layers.Dense(text_embeddings.shape[-1], activation='sigmoid')(decoded)

decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

# Combine encoder and decoder into an autoencoder model
autoencoder_input = encoder_input
encoded_embedding = encoder(autoencoder_input)
decoded_embedding = decoder(encoded_embedding)

autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder
autoencoder.fit(text_embeddings, text_embeddings, epochs=50, batch_size=32, verbose=1)

# Step 4: Visualize the 2D Grid in 3D with Smoothing
encoded_2d_grid = encoder(text_embeddings).numpy().reshape(-1, grid_size, grid_size)

# Apply Gaussian smoothing to each sample's 2D grid
smoothed_grids = np.array([gaussian_filter(grid, sigma=1) for grid in encoded_2d_grid])

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

X, Y = np.meshgrid(range(grid_size), range(grid_size))
for i in range(smoothed_grids.shape[0]):
    Z = smoothed_grids[i]
    ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

ax.set_title('3D Visualization of Smoothed Text Embeddings')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_zlabel('Embedding Value')
plt.show()


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import fetch_20newsgroups
from scipy.ndimage import gaussian_filter

# Step 1: Prepare Multiple Text Samples
newsgroups = fetch_20newsgroups(subset='train')
texts = newsgroups.data[:1000]  # Limit to 1000 samples for this example

# Step 2: Convert Text Samples into Embeddings using Universal Sentence Encoder
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype=tf.string, trainable=False)

text_embeddings = embedding_layer(texts)
print("Original Embedding Shape:", text_embeddings.shape)

# Step 3: Build and Train an Autoencoder to reduce dimensionality to a large 2D grid
grid_size = 30  # 30x30 grid
latent_dim = grid_size * grid_size

encoder_input = tf.keras.Input(shape=(text_embeddings.shape[-1],))
encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
encoded = tf.keras.layers.Dropout(0.5)(encoded)  # Adding dropout
encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)

encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

# Define the decoder model
decoder_input = tf.keras.Input(shape=(latent_dim,))
decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
decoded_output = tf.keras.layers.Dense(text_embeddings.shape[-1], activation='sigmoid')(decoded)

decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

# Combine encoder and decoder into an autoencoder model
autoencoder_input = encoder_input
encoded_embedding = encoder(autoencoder_input)
decoded_embedding = decoder(encoded_embedding)

autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')

autoencoder.fit(text_embeddings, text_embeddings, epochs=10, batch_size=16, verbose=1)

# Step 4: Visualize the 2D Grid in 3D with Smoothing
encoded_2d_grid = encoder(text_embeddings).numpy().reshape(-1, grid_size, grid_size)

# Apply Gaussian smoothing to each sample's 2D grid
smoothed_grids = np.array([gaussian_filter(grid, sigma=1) for grid in encoded_2d_grid])

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

X, Y = np.meshgrid(range(grid_size), range(grid_size))
for i in range(smoothed_grids.shape[0]):
    Z = smoothed_grids[i]
    ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

ax.set_title('3D Visualization of Smoothed Text Embeddings')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_zlabel('Embedding Value')
plt.show()


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import fetch_20newsgroups
from scipy.ndimage import gaussian_filter

# Step 1: Prepare Multiple Text Samples
newsgroups = fetch_20newsgroups(subset='train')
texts = newsgroups.data[:10]  # Limit to 1000 samples for this example

# Step 2: Convert Text Samples into Embeddings using Universal Sentence Encoder
embedding_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4", input_shape=[], dtype=tf.string, trainable=False)

text_embeddings = embedding_layer(texts)
print("Original Embedding Shape:", text_embeddings.shape)

# Step 3: Build and Train an Autoencoder to reduce dimensionality to a large 2D grid
grid_size = 10  # 30x30 grid
latent_dim = grid_size * grid_size

# Define the encoder model
encoder_input = tf.keras.Input(shape=(text_embeddings.shape[-1],))
encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)  # Larger 2D grid output

encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

# Define the decoder model
decoder_input = tf.keras.Input(shape=(latent_dim,))
decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
decoded_output = tf.keras.layers.Dense(text_embeddings.shape[-1], activation='sigmoid')(decoded)

decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

# Combine encoder and decoder into an autoencoder model
autoencoder_input = encoder_input
encoded_embedding = encoder(autoencoder_input)
decoded_embedding = decoder(encoded_embedding)

autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder
autoencoder.fit(text_embeddings, text_embeddings, epochs=10, batch_size=16, verbose=1)

# Step 4: Visualize the 2D Grid in 3D with Smoothing
encoded_2d_grid = encoder(text_embeddings).numpy().reshape(-1, grid_size, grid_size)

# Apply Gaussian smoothing to each sample's 2D grid
smoothed_grids = np.array([gaussian_filter(grid, sigma=5) for grid in encoded_2d_grid])

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

X, Y = np.meshgrid(range(grid_size), range(grid_size))
for i in range(smoothed_grids.shape[0]):
    Z = smoothed_grids[i]
    ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

ax.set_title('3D Visualization of Smoothed Text Embeddings')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_zlabel('Embedding Value')
plt.show()


In [None]:
pip install openai sentence-transformers scikit-learn numpy matplotlib hdbscan umap-learn


In [None]:
import openai
import os
import numpy as np
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import umap
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans

client = openai.OpenAI(api_key="sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP")
MODEL = "gpt-4o"

# Function to get responses from GPT-4o
def get_responses(prompt, n, max_tokens=100, temperature=0.7, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0):
    responses = []
    log_probs = []
    for _ in range(n):
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            logprobs=True
        )
        responses.append(response.choices[0].message.content)
        log_probs.append(response.choices[0].logprobs)
    return responses, log_probs


# Function to get embeddings for the responses
def get_embeddings(responses):
    model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    embeddings = model.encode(responses)
    return embeddings

# Function to cluster embeddings (HDBSCAN or K-Means)
def cluster_embeddings(embeddings, method='hdbscan'):
    if method == 'hdbscan':
        clusterer = HDBSCAN(min_cluster_size=15, cluster_selection_method='leaf')
        cluster_labels = clusterer.fit_predict(embeddings)
    else:
        clusterer = KMeans(n_clusters=5)
        cluster_labels = clusterer.fit_predict(embeddings)
    return cluster_labels

# Function to visualize embeddings using UMAP
def visualize_embeddings(embeddings, labels):
    n_neighbors = min(15, len(embeddings) - 1)
    reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors)
    reduced_embeddings = reducer.fit_transform(embeddings)
    plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap='Spectral')
    plt.colorbar()
    plt.show()

def plot_raw_logits(probability_map):
    for entry in probability_map:
        tokens = entry['tokens']
        log_probs = entry['token_logprobs']  # Assuming this key will be added to the dictionary
        
        # Ensure we have valid data
        if tokens and log_probs and len(tokens) == len(log_probs):
            plt.figure(figsize=(10, 5))
            plt.bar(tokens, log_probs)
            plt.xlabel('Tokens')
            plt.ylabel('Log Probabilities')
            plt.title('Raw Log Probabilities of Tokens')
            plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility
            plt.show()


def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()

def analyze_log_probs(log_probs):
    probability_map = []
    for choice in log_probs:
        # Access the 'content' attribute which is the list of 'ChatCompletionTokenLogprob'
        tokens = [token_logprob.token for token_logprob in choice.content]
        token_logprobs = [token_logprob.logprob for token_logprob in choice.content]

        # Convert log probabilities to probabilities using softmax
        token_probs = softmax(token_logprobs)

        probability_map.append({
            'tokens': tokens,
            'token_probs': token_probs,
            'token_logprobs': token_logprobs  # Store raw log probabilities for plotting
        })
    
    return probability_map

def plot_probabilities(probability_map):
    for entry in probability_map:
        tokens = entry['tokens']
        token_probs = entry['token_probs']
        
        # Ensure we have valid data
        if tokens and token_probs is not None and len(tokens) == len(token_probs):
            plt.figure(figsize=(10, 5))
            plt.bar(tokens, token_probs)
            plt.xlabel('Tokens')
            plt.ylabel('Probabilities')
            plt.title('Token Probabilities')
            plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility
            plt.show()


# Main function to orchestrate the workflow
def main():
    prompt = "make me a short unique poem about aliens fighting on venus over earths iranian goat population gaining sentience"
    n = 10
    
    responses, log_probs = get_responses(
    prompt=prompt,
    n=5,
    max_tokens=100,
    temperature=2,
    top_p=0.3,
    frequency_penalty=0.1,
    presence_penalty=0.3
    )

    embeddings = get_embeddings(responses)
    
    # Choose clustering method ('hdbscan' or 'kmeans')
    clustering_method = 'hdbscan'
    cluster_labels = cluster_embeddings(embeddings, method=clustering_method)
    
    #print("Clustering Results:")
    #for i, response in enumerate(responses):
        #print(f"Cluster {cluster_labels[i]}: {response}")
    
    visualize_embeddings(embeddings, cluster_labels)

    print(log_probs)
    
    probability_map = analyze_log_probs(log_probs)
    print(probability_map)
    plot_probabilities(probability_map)
    plot_raw_logits(probability_map)

if __name__ == "__main__":
    main()


In [None]:
import openai
import numpy as np
from sentence_transformers import SentenceTransformer
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter

# Initialize OpenAI client
client = openai.OpenAI(api_key="sk-proj-7MAfZbOm9lPY28pubTiRT3BlbkFJGgn73o5e6sVCjoTfoFAP")
MODEL = "gpt-4o"

# Function to get responses from GPT-4o
def get_responses(prompt, n, max_tokens=100, temperature=0.7, top_p=1.0, frequency_penalty=0.0, presence_penalty=0.0):
    responses = []
    log_probs = []
    for _ in range(n):
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            logprobs=True
        )
        responses.append(response.choices[0].message.content)
        log_probs.append(response.choices[0].logprobs)
    return responses, log_probs

# Function to get embeddings for the responses
def get_embeddings(responses):
    model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2')
    embeddings = model.encode(responses)
    return embeddings

# Function to convert log probabilities to probabilities using softmax
def softmax(logits):
    exp_logits = np.exp(logits - np.max(logits))
    return exp_logits / exp_logits.sum()

def analyze_log_probs(log_probs):
    probability_map = []
    for choice in log_probs:
        # Access the 'content' attribute which is the list of 'ChatCompletionTokenLogprob'
        tokens = [token_logprob.token for token_logprob in choice.content]
        token_logprobs = [token_logprob.logprob for token_logprob in choice.content]

        # Convert log probabilities to probabilities using softmax
        token_probs = softmax(token_logprobs)

        probability_map.append({
            'tokens': tokens,
            'token_probs': token_probs,
            'token_logprobs': token_logprobs  # Store raw log probabilities for plotting
        })
    
    return probability_map

# Function to train an autoencoder
def train_autoencoder(input_data, latent_dim):
    encoder_input = tf.keras.Input(shape=(input_data.shape[-1],))
    encoded = tf.keras.layers.Dense(512, activation='relu')(encoder_input)
    encoded = tf.keras.layers.Dropout(0.5)(encoded)
    encoded = tf.keras.layers.Dense(256, activation='relu')(encoded)
    encoded = tf.keras.layers.Dense(128, activation='relu')(encoded)
    encoded_output = tf.keras.layers.Dense(latent_dim, activation='linear')(encoded)

    encoder = tf.keras.Model(encoder_input, encoded_output, name='encoder')

    decoder_input = tf.keras.Input(shape=(latent_dim,))
    decoded = tf.keras.layers.Dense(128, activation='relu')(decoder_input)
    decoded = tf.keras.layers.Dense(256, activation='relu')(decoded)
    decoded = tf.keras.layers.Dense(512, activation='relu')(decoded)
    decoded_output = tf.keras.layers.Dense(input_data.shape[-1], activation='sigmoid')(decoded)

    decoder = tf.keras.Model(decoder_input, decoded_output, name='decoder')

    autoencoder_input = encoder_input
    encoded_embedding = encoder(autoencoder_input)
    decoded_embedding = decoder(encoded_embedding)

    autoencoder = tf.keras.Model(autoencoder_input, decoded_embedding, name='autoencoder')
    autoencoder.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mean_squared_error')

    autoencoder.fit(input_data, input_data, epochs=10, batch_size=16, verbose=1)
    return encoder

# Function to visualize 2D grid in 3D with smoothing
def visualize_2d_grid(encoded_2d_grid, grid_size):
    smoothed_grids = np.array([gaussian_filter(grid, sigma=1) for grid in encoded_2d_grid])

    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')
    X, Y = np.meshgrid(range(grid_size), range(grid_size))
    for i in range(smoothed_grids.shape[0]):
        Z = smoothed_grids[i]
        ax.plot_surface(X, Y, Z, cmap='viridis', alpha=0.7)

    ax.set_title('3D Visualization of Smoothed Text Embeddings')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Embedding Value')
    plt.show()

def extract_token_probs(data):
    # Initialize an empty list to store the token probabilities
    token_probs_list = []
    
    # Iterate over each entry in the data list
    for entry in data:
        # Check if 'token_probs' key exists in the dictionary
        if 'token_probs' in entry:
            # Append the numpy array of token probabilities to the list
            token_probs_list.append(entry['token_probs'])
    
    # Return the list of token probabilities
    return token_probs_list

# Main function to orchestrate the workflow
def main():
    prompt = "make me a short unique poem about aliens fighting on venus over earths iranian goat population gaining sentience"
    n = 1
    responses, log_probs = get_responses(prompt, n)
    embeddings = get_embeddings(responses)

    probability_map = analyze_log_probs(log_probs)
    token_probs_list = extract_token_probs(probability_map)
    
    # Extract token probabilities from the probability map
    token_probabilities = np.array([entry['token_probs'] for entry in probability_map])
    
    # Combine embeddings and token probabilities
    combined_input = np.hstack((embeddings, token_probabilities))

    grid_size = 30
    latent_dim = grid_size * grid_size
    encoder = train_autoencoder(combined_input, latent_dim)

    encoded_2d_grid = encoder(combined_input).numpy().reshape(-1, grid_size, grid_size)
    visualize_2d_grid(encoded_2d_grid, grid_size)

if __name__ == "__main__":
    main()
