In [None]:
!pip install -q openai

In [None]:
import os

# SECURITY: never commit the key to the notebook. It is read from the
# OPENAI_API_KEY environment variable instead (empty string when unset).
# Any key that was previously stored in this cell should be treated as leaked
# and revoked.
API_KEY = os.environ.get('OPENAI_API_KEY', '')

In [None]:
You are a dataset conversion bot. You are to generate a set of question response pairs that explain the attached data chunk.

In [None]:
import os
import json
import openai
import re
import uuid
import time
import random
import multiprocessing
from openai import OpenAI

# Function to process a JSON file and extract summary chunks with metadata
def extract_summary_chunks_with_metadata(file_path):
    """Load one JSON file and return its summary chunks with metadata.

    The file is expected to contain a JSON array of documents. Documents
    without a ``'CHUNK'`` key are skipped.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of dicts with the keys ``'chunk'`` (the document's ``CHUNK``
        value), ``'metadata'`` (its ``ALIGNMENT`` value) and ``'filename'``
        (the base name of ``file_path``).

    Raises:
        KeyError: If a document has ``'CHUNK'`` but no ``'ALIGNMENT'``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    filename = os.path.basename(file_path)
    return [
        {
            'chunk': document['CHUNK'],
            'metadata': document['ALIGNMENT'],
            'filename': filename,
        }
        for document in data
        if 'CHUNK' in document
    ]

# Function to process all JSON files in a folder and extract summary chunks with metadata
def process_folder(folder_path):
    """Collect the summary chunks of every ``.json`` file under a folder.

    The folder is walked recursively with ``os.walk``; each file whose name
    ends in ``.json`` is passed to ``extract_summary_chunks_with_metadata``
    and the returned chunks are concatenated in visiting order.

    Args:
        folder_path: Root directory to search.

    Returns:
        A single list holding the chunk records of all JSON files found.
    """
    collected = []
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        json_paths = (
            os.path.join(current_dir, name)
            for name in filenames
            if name.endswith('.json')
        )
        for json_path in json_paths:
            collected += extract_summary_chunks_with_metadata(json_path)
    return collected

# Function to generate question/answer text for a summary chunk using an OpenAI chat model (default: gpt-3.5-turbo)
def generate_responses(api_key, input_text, model_config, max_retries=10):
    """Request question/answer text for one chunk from the OpenAI chat API.

    Args:
        api_key: OpenAI API key used to build the client.
        input_text: Text sent as the user message.
        model_config: Mapping with optional ``'name'``, ``'max_tokens'``,
            ``'temperature'`` and ``'top_p'`` entries. Missing entries fall
            back to ``"gpt-3.5-turbo"``, ``300``, ``0.7`` and ``0.9``.
        max_retries: How many times a rate-limited or connection-failed
            request is retried before giving up.

    Returns:
        A list containing one record with the keys ``'uuid'``,
        ``'response_content'`` and ``'configuration'`` on success, or an
        empty list when the request could not be completed.
    """
    client = OpenAI(api_key=api_key)

    # Resolve the configuration once; the same values are sent to the API and
    # recorded alongside the response.
    model_name = model_config.get('name', "gpt-3.5-turbo")
    max_tokens = model_config.get('max_tokens', 300)
    temperature = model_config.get('temperature', 0.7)
    top_p = model_config.get('top_p', 0.9)

    responses = []
    retries_left = max_retries

    while True:
        try:
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": "You are a dataset conversion bot. You are to generate a set of question response pairs that explain the attached data chunk."},
                    {"role": "user", "content": input_text}
                ],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p
            )

            responses.append({
                'uuid': str(uuid.uuid4()),
                'response_content': response.choices[0].message.content,
                'configuration': {
                    'max_tokens': max_tokens,
                    'temperature': temperature,
                    'top_p': top_p,
                    'model': model_name
                }
            })
            break
        except openai.RateLimitError as e:
            if retries_left <= 0:
                print(f"Rate limit hit. Giving up after {max_retries} retries. Error: {e}")
                break
            retries_left -= 1
            # Random wait so parallel workers do not all retry at the same moment.
            wait_time = random.uniform(1, 300)
            print(f"Rate limit hit. Waiting for {wait_time} seconds. Error: {e}")
            time.sleep(wait_time)
        # APIConnectionError is a subclass of APIError, so it must be handled
        # BEFORE the generic APIError clause or this retry is never reached.
        except openai.APIConnectionError as e:
            print(f"Failed to connect to OpenAI API: {e}")
            if retries_left <= 0:
                print(f"Giving up after {max_retries} retries.")
                break
            retries_left -= 1
            time.sleep(2)
        except openai.APIError as e:
            print(f"OpenAI API returned an API Error: {e}")
            break
        except Exception as e:
            # Best-effort boundary: report and return what we have.
            print(f"Unexpected error: {e}")
            break

    return responses

# Function to parse question-answer pairs from the response content
def parse_qa_pairs(response_content):
    """Extract question/answer pairs from a model response.

    The response is split into paragraphs on blank lines. In each paragraph
    the text between the first ``Q:`` and the next ``A:`` is the question,
    and everything after that ``A:`` is the answer, so an answer may itself
    contain ``A:`` (for example ``"A:B = 1:2"``) without being cut short.
    Paragraphs without a ``Q:`` followed by an ``A:`` are ignored.

    Args:
        response_content: Raw response text; ``None`` or an empty string
            yields no pairs.

    Returns:
        A list of ``{'question': str, 'answer': str}`` dicts in response order.
    """
    if not response_content:
        return []

    qa_pairs = []
    for pair in response_content.split('\n\n'):
        # partition() splits on the first marker only, unlike split()[1],
        # which dropped everything after a second 'A:' in the answer.
        _, q_marker, after_question_marker = pair.partition('Q:')
        if not q_marker:
            continue
        question, a_marker, answer = after_question_marker.partition('A:')
        if not a_marker:
            # 'A:' is missing or only occurs before 'Q:' - not a usable pair.
            continue
        qa_pairs.append({'question': question.strip(), 'answer': answer.strip()})
    return qa_pairs

# Worker function for multiprocessing
def worker(chunk_data_list, model_config, api_key, return_list, progress_dict, lock):
    """Generate and parse Q/A pairs for a batch of chunks.

    For every chunk record, ``generate_responses`` is called on its text and
    each returned response is parsed with ``parse_qa_pairs``; one result
    record per response is appended to ``return_list``. After each chunk the
    shared progress counter is incremented and reported while holding ``lock``.

    Args:
        chunk_data_list: Iterable of dicts with ``'chunk'``, ``'metadata'``
            and ``'filename'`` keys.
        model_config: Model settings forwarded to ``generate_responses``.
        api_key: API key forwarded to ``generate_responses``.
        return_list: List-like object that collects the result records.
        progress_dict: Mapping with ``'processed_chunks'`` and
            ``'total_chunks'`` entries.
        lock: Context-manager lock guarding ``progress_dict`` updates.
    """
    for item in chunk_data_list:
        text = item['chunk']
        for reply in generate_responses(api_key, text, model_config):
            return_list.append({
                'chunk': text,
                'metadata': item['metadata'],
                'filename': item['filename'],
                'qa_pairs': parse_qa_pairs(reply['response_content']),
                'inference_metadata': reply['configuration'],
            })

        # Report progress for this chunk; the lock keeps the
        # read-modify-write of the counter and the print together.
        with lock:
            done = progress_dict['processed_chunks'] + 1
            progress_dict['processed_chunks'] = done
            print(f"Processed {done} / {progress_dict['total_chunks']} chunks")

# Main function to execute the script
def main(api_key=None, folder_path='/workspace/slice-monorepo/cl_cr3/aligneddata',
         output_file='generated_questions.json', num_workers=20, chunk_size=5):
    """Generate Q/A pairs for every summary chunk and save them as JSON.

    Args:
        api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
            environment variable is used.
        folder_path: Folder containing the JSON files to process.
        output_file: Path of the JSON file the results are written to.
        num_workers: Maximum number of worker processes running at once.
        chunk_size: Number of summary chunks handed to each worker process.

    Raises:
        RuntimeError: If no API key is supplied and ``OPENAI_API_KEY`` is unset.
        ValueError: If ``num_workers`` or ``chunk_size`` is not positive.
    """
    # SECURITY: the key must not be hard-coded in source. A key that was
    # previously committed here should be treated as leaked and revoked.
    api_key = api_key or os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError(
            "OpenAI API key not provided: pass api_key or set the "
            "OPENAI_API_KEY environment variable."
        )
    if num_workers < 1 or chunk_size < 1:
        raise ValueError("num_workers and chunk_size must be positive integers.")

    # Extract summary chunks from all JSON files in the folder
    print("Extracting summary chunks from JSON files...")
    summary_chunks = process_folder(folder_path)
    total_chunks = len(summary_chunks)
    print(f"Extracted {total_chunks} summary chunks.")

    # Configuration for OpenAI model
    model_config = {
        "name": "gpt-3.5-turbo",
        "max_tokens": 300,
        "temperature": 0.7,
        "top_p": 0.9
    }

    # Divide the work into batches of `chunk_size` chunks, one batch per process
    chunked_data = [summary_chunks[i:i + chunk_size] for i in range(0, total_chunks, chunk_size)]

    # The context manager shuts the manager's server process down when done.
    with multiprocessing.Manager() as manager:
        return_list = manager.list()
        progress_dict = manager.dict({'processed_chunks': 0, 'total_chunks': total_chunks})
        lock = manager.Lock()

        # Run at most `num_workers` processes at a time and wait for the whole
        # wave to finish before starting the next one.
        for start in range(0, len(chunked_data), num_workers):
            jobs = [
                multiprocessing.Process(
                    target=worker,
                    args=(chunk_data_list, model_config, api_key, return_list, progress_dict, lock),
                )
                for chunk_data_list in chunked_data[start:start + num_workers]
            ]
            for job in jobs:
                job.start()
            for job in jobs:
                job.join()

        # Copy the results out of the proxy before the manager goes away
        generated_questions = list(return_list)

    # Save the generated questions to a JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(generated_questions, f, indent=4)

    print(f"Generated questions saved to {output_file}")

# Run the pipeline only when this code is executed as the main module
# (script or notebook cell), not when it is imported.
if __name__ == '__main__':
    main()


In [None]:
import json
import os
import re

def process_json_file(file_path):
    """Compute transcript metrics for one JSON file.

    The file is expected to hold a JSON array of documents (chunks), each
    with a ``'TURNS'`` list whose items carry ``'NAMES'``, ``'NUMBER'`` and
    ``'UTTERANCES'`` entries.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A tuple ``(total_chunks, total_turns, total_words, unique_names,
        name_utterances)`` where ``unique_names`` is the set of all names in
        the file and ``name_utterances`` maps each name to the list of turn
        ``NUMBER`` values in which it appears, in file order.

    Raises:
        KeyError: If a document or turn lacks one of the expected keys.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding='utf-8') as file:
        data = json.load(file)

    total_chunks = len(data)
    total_turns = 0
    total_words = 0
    unique_names = set()
    name_utterances = {}

    for document in data:
        turns = document['TURNS']
        total_turns += len(turns)

        for turn in turns:
            names = turn['NAMES']
            unique_names.update(names)

            # Record the turn number under every name attached to the turn
            for name in names:
                name_utterances.setdefault(name, []).append(turn['NUMBER'])

            # Words are counted by whitespace splitting each utterance
            total_words += sum(len(utterance.split()) for utterance in turn['UTTERANCES'])

    return total_chunks, total_turns, total_words, unique_names, name_utterances

def _new_metrics():
    """Return an empty accumulator for one scope (overall or one campaign)."""
    return {
        'total_files': 0,
        'total_chunks': 0,
        'total_turns': 0,
        'total_words': 0,
        'unique_names': set(),
        'common_names': None,  # None until the first file has been added
        'name_utterances': {},
    }


def _add_file_metrics(metrics, chunks, turns, words, names, file_name_utterances):
    """Fold the metrics of one processed file into ``metrics`` in place."""
    metrics['total_files'] += 1
    metrics['total_chunks'] += chunks
    metrics['total_turns'] += turns
    metrics['total_words'] += words
    metrics['unique_names'].update(names)

    # Common names are the intersection of the name sets of all files so far
    if metrics['common_names'] is None:
        metrics['common_names'] = set(names)
    else:
        metrics['common_names'].intersection_update(names)

    for name, utterances in file_name_utterances.items():
        metrics['name_utterances'].setdefault(name, []).extend(utterances)


def _finalize_metrics(metrics):
    """Add the derived averages to ``metrics``; empty inputs yield 0.0."""
    files = metrics['total_files']
    turns = metrics['total_turns']
    metrics['avg_chunks_per_file'] = metrics['total_chunks'] / files if files else 0.0
    metrics['avg_turns_per_file'] = metrics['total_turns'] / files if files else 0.0
    # NOTE(review): this is total words divided by total TURNS, although the
    # key and the printed label say "per utterance" - confirm which is intended.
    metrics['avg_words_per_utterance'] = metrics['total_words'] / turns if turns else 0.0

    if metrics['common_names'] is None:
        metrics['common_names'] = set()

    # NOTE(review): turn numbers are concatenated across chunks and files, so
    # a step can be negative if NUMBER restarts - confirm this is intended.
    avg_steps = {}
    for name in sorted(metrics['common_names']):
        numbers = metrics['name_utterances'].get(name, [])
        steps = [later - earlier for earlier, later in zip(numbers, numbers[1:])]
        if steps:  # a name seen only once has no gaps to average
            avg_steps[name] = sum(steps) / len(steps)
    metrics['avg_steps_between_utterances'] = avg_steps


def _print_metrics(title, metrics):
    """Print one metrics report under the given title line."""
    print(title)
    print(f"Total Files: {metrics['total_files']}")
    print(f"Total Chunks: {metrics['total_chunks']}")
    print(f"Total Turns: {metrics['total_turns']}")
    print(f"Total Words: {metrics['total_words']}")
    print(f"Unique Names: {', '.join(sorted(metrics['unique_names']))}")
    print(f"Number of Unique Names: {len(metrics['unique_names'])}")
    print(f"Average Chunks per File: {metrics['avg_chunks_per_file']:.2f}")
    print(f"Average Turns per File: {metrics['avg_turns_per_file']:.2f}")
    print(f"Average Words per Utterance: {metrics['avg_words_per_utterance']:.2f}")
    print(f"Common Names: {', '.join(sorted(metrics['common_names']))}")
    print("Average Steps Between Utterances for Common Names:")
    for name, avg_steps in metrics['avg_steps_between_utterances'].items():
        print(f"  {name}: {avg_steps:.2f}")


def process_folder(folder_path):
    """Print overall and per-campaign metrics for all JSON files in a folder.

    The folder is walked recursively. Every ``.json`` file is analysed with
    ``process_json_file`` and its metrics are added both to the overall
    summary and to the campaign named by the first ``C<digits>`` token in the
    file name (``'UNKNOWN'`` when the name has no such token). Directories,
    files and names are visited in sorted order so the report is
    reproducible. An empty folder produces a summary of zeros rather than an
    error.

    Args:
        folder_path: Root directory to search.

    Returns:
        None. The report is written to standard output.
    """
    overall = _new_metrics()
    campaign_metrics = {}

    for root, dirs, files in os.walk(folder_path):
        dirs.sort()  # in-place sort steers os.walk into a deterministic order
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)
            file_metrics = process_json_file(file_path)

            # Extract the campaign number from the file name
            match = re.search(r'C\d+', file)
            campaign_number = match.group(0) if match else 'UNKNOWN'

            _add_file_metrics(overall, *file_metrics)
            if campaign_number not in campaign_metrics:
                campaign_metrics[campaign_number] = _new_metrics()
            _add_file_metrics(campaign_metrics[campaign_number], *file_metrics)

    _finalize_metrics(overall)
    for metrics in campaign_metrics.values():
        _finalize_metrics(metrics)

    _print_metrics("Summary Metrics:", overall)
    for campaign_number, metrics in campaign_metrics.items():
        _print_metrics(f"\nCampaign {campaign_number} Metrics:", metrics)

# Root folder of the JSON data to analyse
folder_path = '/workspace/slice-monorepo/cl_cr3/aligneddata'

# Walk the folder and its subfolders and print the overall and
# per-campaign metrics for every JSON file found
process_folder(folder_path)