In [9]:
import os
import json
import re
import numpy as np
import matplotlib.pyplot as plt

def summarize_folder(input_folder):
    """Gather basic statistics about the ``.txt`` files directly in a folder.

    Only entries whose name ends with ``.txt`` are considered;
    sub-directories are not descended into.

    Args:
        input_folder: Path of the directory to inspect.

    Returns:
        A ``(num_files, total_lines, file_sizes)`` tuple: the number of
        ``.txt`` files, the combined line count of those files, and a list
        with each file's size in bytes (in ``os.listdir`` order).
    """
    txt_paths = [
        os.path.join(input_folder, entry)
        for entry in os.listdir(input_folder)
        if entry.endswith('.txt')
    ]

    line_total = 0
    sizes = []
    for path in txt_paths:
        with open(path, 'r') as handle:
            # Iterating the handle yields one item per line.
            line_total += sum(1 for _ in handle)
            sizes.append(os.path.getsize(path))

    return len(txt_paths), line_total, sizes

def parse_chat_log(file_path, *, verbose=True):
    """Split a chat-log text file into steps delimited by ``ChatGPT`` lines.

    A new step begins at every line that starts with ``ChatGPT``.  Every
    following line, up to the next such marker, is accumulated as the
    step's text.  Within the log, a line consisting solely of ``python`` or
    ``bash`` opens a code block and a line starting with three backticks
    closes it; closed blocks are attached to the step that is current when
    they close.

    Lines that appear before the first ``ChatGPT`` marker belong to no step
    and are discarded.  A code block that is still open at end of file is
    not recorded.

    Args:
        file_path: Path of the chat-log text file to read.
        verbose: When true (the default), print a trace of every processed
            line, the matched markers and the final parsed result.

    Returns:
        A list of dicts, one per step, each with the keys ``'chatgpt'``
        (the stripped step text) and ``'code_blocks'`` (a list of
        ``{'language': ..., 'code': ...}`` dicts).
    """
    with open(file_path, 'r') as file:
        lines = file.readlines()

    chatgpt_pattern = re.compile(r'^ChatGPT')
    code_start_patterns = {
        'python': re.compile(r'^python$'),
        'bash': re.compile(r'^bash$')
    }
    code_end_pattern = re.compile(r'```')

    def log(message):
        if verbose:
            print(message)

    steps = []
    # An explicit flag is used instead of testing the step dict itself: an
    # empty dict is falsy, so a truthiness check would never record a step.
    step_open = False
    text_parts = []
    code_blocks = []

    inside_code_block = False
    current_code_block = []
    current_language = None

    for line in lines:
        stripped = line.strip()
        log(f"Processing line: {stripped}")

        if chatgpt_pattern.search(line):
            log("Matched ChatGPT pattern")
            if step_open:
                steps.append({
                    'chatgpt': "".join(text_parts).strip(),
                    'code_blocks': code_blocks
                })
            # Start a fresh step; anything gathered before the very first
            # marker is dropped here as well.
            step_open = True
            text_parts = []
            code_blocks = []
        else:
            text_parts.append(line)

        language = next(
            (lang for lang, pattern in code_start_patterns.items()
             if pattern.match(stripped)),
            None,
        )
        if language is not None:
            inside_code_block = True
            current_code_block = []
            current_language = language
            log(f"Starting {current_language} code block")
        elif code_end_pattern.match(stripped):
            if inside_code_block:
                code_blocks.append({
                    'language': current_language,
                    # Strip only surrounding newlines so the indentation of
                    # the first code line is preserved.
                    'code': "\n".join(current_code_block).strip("\n")
                })
                inside_code_block = False
                current_language = None
                log("Ending code block")
        elif inside_code_block:
            # Keep leading whitespace: indentation is significant in code.
            current_code_block.append(line.rstrip())

    if step_open:
        steps.append({
            'chatgpt': "".join(text_parts).strip(),
            'code_blocks': code_blocks
        })

    log(f"Parsed steps: {json.dumps(steps, indent=4)}")
    return steps

def process_folder(input_folder):
    """Parse every ``.txt`` chat log in a folder into one flat list of steps.

    Args:
        input_folder: Directory whose ``.txt`` files are parsed with
            ``parse_chat_log``.

    Returns:
        A single list containing the steps of all parsed files, in the
        order the files are returned by ``os.listdir``.
    """
    collected = []
    txt_names = (
        name for name in os.listdir(input_folder) if name.endswith('.txt')
    )
    for name in txt_names:
        collected += parse_chat_log(os.path.join(input_folder, name))
    return collected

def generate_metrics_and_plots(data):
    """Print code-block metrics for parsed steps and plot their distribution.

    Prints the number of steps, the total number of code blocks and the
    average number of code blocks per step.  When there is at least one
    step, a histogram of code blocks per step is shown with matplotlib.

    Args:
        data: Iterable of step dicts, each having a ``'code_blocks'`` list.

    Returns:
        None.
    """
    code_blocks_per_step = [len(step['code_blocks']) for step in data]
    num_steps = len(code_blocks_per_step)
    num_code_blocks = sum(code_blocks_per_step)
    avg_code_blocks_per_step = num_code_blocks / num_steps if num_steps > 0 else 0

    print(f"Total steps: {num_steps}")
    print(f"Total code blocks: {num_code_blocks}")
    print(f"Average code blocks per step: {avg_code_blocks_per_step:.2f}")

    if num_steps == 0:
        print("No steps found in the data.")
        return

    # Plotting the distribution of code blocks per step.
    # Bin edges start at 0 so that steps without any code block are counted
    # too, and so that at least one bin exists when no step contains code.
    bin_edges = range(0, max(code_blocks_per_step) + 2)
    plt.figure(figsize=(10, 6))
    plt.hist(code_blocks_per_step, bins=bin_edges, align='left', rwidth=0.8)
    plt.title('Distribution of Code Blocks per Step')
    plt.xlabel('Number of Code Blocks')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

def main(input_folder='/workspace/slice-monorepo/logger/chat_files/text'):
    """Summarize, parse and plot the chat logs found in a folder.

    Prints file-level statistics for the folder, parses every ``.txt`` chat
    log in it, prints the first three parsed steps as a sanity check and
    finally reports metrics and shows the code-block histogram.

    Args:
        input_folder: Directory containing the ``.txt`` chat logs.  Defaults
            to the path this script was originally written against.
    """
    num_files, total_lines, file_sizes = summarize_folder(input_folder)

    # Fall back to 0 for every size statistic when no file was found.
    average_size = sum(file_sizes) / len(file_sizes) if file_sizes else 0
    largest_size = max(file_sizes) if file_sizes else 0
    smallest_size = min(file_sizes) if file_sizes else 0

    print(f"Number of files: {num_files}")
    print(f"Total lines across all files: {total_lines}")
    print(f"Average file size: {average_size:.2f} bytes")
    print(f"Max file size: {largest_size} bytes")
    print(f"Min file size: {smallest_size} bytes")

    chat_log_data = process_folder(input_folder)

    # Print some example parsed data for verification
    for step in chat_log_data[:3]:  # Print first 3 steps for example
        print(json.dumps(step, indent=4))

    # Generate metrics and plots
    generate_metrics_and_plots(chat_log_data)

if __name__ == "__main__":
    main()


Number of files: 6
Total lines across all files: 23875
Average file size: 138837.00 bytes
Max file size: 388129 bytes
Min file size: 35592 bytes
Processing line: 
Processing line: 
Processing line: 
Processing line: If I wanted an extremely general logging system basically I just want to wrap any and all python bash file starts with something that simply takes every printout and saves it with the time and date.
Processing line: 
Processing line: 
Processing line: So wether I did something like this:
Processing line: python train_dolly.py --input_model databricks/dolly-v2-3b --gpu_family a100
Processing line: or this
Processing line: command = [
Processing line: "deepspeed", "training/trainer.py",
Processing line: "--input-model", input_model,
Processing line: "--deepspeed", deepspeed_config,
Processing line: "--training-dataset", training_dataset,
Processing line: "--epochs", "2",
Processing line: "--local-output-dir", local_output_dir,
Processing line: "--per-device-train-batch-size",