In [1]:
import json
import os
import sys
import time

import numpy as np
from dotenv import load_dotenv
from openai import AzureOpenAI

In [None]:


def load_azure_openai_client():
    """Build an Azure OpenAI client from environment configuration.

    Environment variables are first loaded from a ``.env`` file via
    ``load_dotenv()``. ``ENDPOINT_URL`` and ``DEPLOYMENT_NAME`` fall back to
    placeholder values when unset; ``AZURE_OPENAI_API_KEY`` has no fallback and
    is passed through as ``None`` if missing.

    Returns:
        tuple: ``(client, deployment)`` where ``client`` is the constructed
        ``AzureOpenAI`` instance and ``deployment`` is the deployment name.
    """
    # Make values from a local .env file visible through os.getenv.
    load_dotenv()

    deployment = os.getenv("DEPLOYMENT_NAME", "your-deployment-name")
    client_options = {
        "azure_endpoint": os.getenv(
            "ENDPOINT_URL", "https://your-azure-openai-endpoint.openai.azure.com/"
        ),
        "api_version": "2024-05-01-preview",
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
    }

    return AzureOpenAI(**client_options), deployment

def calculatePerplexity_gpt3(prompt, client, deployment, max_retries=5, retry_delay=1.0):
    """Request token log-probabilities for ``prompt`` and derive a perplexity.

    The prompt (with NUL characters removed) is sent as a single user message
    to ``client.chat.completions.create`` with ``logprobs=True``. The
    log-probabilities found in the first choice are averaged and the
    perplexity is computed as ``exp(-mean_logprob)``.

    NOTE(review): the request asks for ``max_tokens=0``; verify that the
    service still returns token log-probabilities for this configuration,
    otherwise every result will be NaN.

    Args:
        prompt: Text to score.
        client: Object exposing ``chat.completions.create(...)``.
        deployment: Deployment/model name passed as ``model``.
        max_retries: Maximum number of request attempts (must be >= 1).
        retry_delay: Base delay in seconds between attempts; it doubles after
            each failed attempt.

    Returns:
        tuple: ``(perplexity, all_prob, mean_logprob)`` where ``all_prob`` is
        the list of non-``None`` token log-probabilities. When no
        log-probabilities are returned, ``perplexity`` and ``mean_logprob``
        are NaN and ``all_prob`` is empty.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1.
        Exception: The last API error, re-raised once all attempts failed.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    prompt = prompt.replace('\x00', '')
    responses = None
    for attempt in range(1, max_retries + 1):
        try:
            responses = client.chat.completions.create(
                model=deployment,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": prompt
                            }
                        ]
                    },
                ],
                max_tokens=0,
                logprobs=True,
                top_logprobs=5,
                temperature=1.0,
                top_p=0.95,
                frequency_penalty=0,
                presence_penalty=0,
                stop=None,
                stream=False
            )
            break
        except Exception as e:
            print(f"API request failed: {e}")
            # Give up instead of looping forever on persistent failures
            # (bad credentials, invalid request, ...).
            if attempt == max_retries:
                raise
            # Exponential backoff so repeated failures do not hammer the API.
            time.sleep(retry_delay * 2 ** (attempt - 1))

    # The logprobs object (or its content) may be absent; treat that as "no tokens".
    data = responses.choices[0].logprobs if responses.choices else None
    content = getattr(data, "content", None) or []
    all_prob = [token_logprob.logprob for token_logprob in content if token_logprob.logprob is not None]
    if not all_prob:
        # Avoid np.mean([]) and its RuntimeWarning; report an undefined score explicitly.
        return float("nan"), all_prob, float("nan")

    mean_logprob = np.mean(all_prob)
    p1 = np.exp(-mean_logprob)
    return p1, all_prob, mean_logprob

def read_and_chunk_file(file_path, chunk_size=1000):
    """Read a UTF-8 text file and split it into fixed-size character chunks.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Maximum number of characters per chunk (must be >= 1).

    Returns:
        list[str]: Consecutive slices of the file's text, each ``chunk_size``
        characters long except possibly the last. An empty file yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Without this check a
            negative size would silently produce no chunks at all.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Slice by character count; chunks are not aligned to words or lines.
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

def process_file(file_path, client, deployment):
    """Score every chunk of a file and collect the per-chunk results.

    The file is split with ``read_and_chunk_file`` (default chunk size) and
    each chunk is passed to ``calculatePerplexity_gpt3``. A progress line is
    printed for every chunk.

    Args:
        file_path: Path of the text file to score.
        client: Client object forwarded to ``calculatePerplexity_gpt3``.
        deployment: Deployment name forwarded to ``calculatePerplexity_gpt3``.

    Returns:
        list[dict]: One dict per chunk with the keys ``'chunk'`` (index),
        ``'perplexity'`` and ``'mean_logprob'``.
    """
    scores = []
    for idx, chunk in enumerate(read_and_chunk_file(file_path)):
        # The raw per-token log-probabilities are not needed here.
        p1, _, mean_logprob = calculatePerplexity_gpt3(chunk, client, deployment)
        record = {'chunk': idx, 'perplexity': p1, 'mean_logprob': mean_logprob}
        scores.append(record)
        print(f"File: {file_path}, Chunk {idx}: Perplexity={p1}, Mean Log Probability={mean_logprob}")
    return scores

def main(argv=None):
    """Score one or more text files and write the results to ``all_scores.json``.

    Args:
        argv: Optional list of file paths. Defaults to ``sys.argv[1:]``.
            NOTE(review): inside a notebook kernel ``sys.argv`` normally holds
            the kernel's launch arguments rather than user file paths, so
            pass the paths explicitly there, e.g. ``main(["a.txt"])``.

    Behavior:
        Exits with status 1 (after printing usage) when no paths are given.
        Paths that are not regular files are reported and skipped. The scores
        of all processed files are written, keyed by path, to
        ``all_scores.json`` in the current working directory.
    """
    if argv is None:
        argv = sys.argv[1:]
    if len(argv) < 1:
        print("Usage: python run2.py <path_to_text_file1> [<path_to_text_file2> ...]")
        sys.exit(1)

    client, deployment = load_azure_openai_client()
    all_scores = {}
    for file_path in argv:
        # isfile (not exists) so a directory is skipped instead of crashing in open().
        if not os.path.isfile(file_path):
            print(f"File not found: {file_path}")
            continue
        all_scores[file_path] = process_file(file_path, client, deployment)

    # Save all scores to a JSON file.
    with open('all_scores.json', 'w', encoding='utf-8') as outfile:
        json.dump(all_scores, outfile, indent=4)

# Run the scoring entry point only when this module is executed as the main program.
if __name__ == "__main__":
    main()