Using OpenAI to clean the text with proper context!

In [None]:
import openai
from dotenv import load_dotenv
import os
import re
import pandas as pd
import time
import json

# --- Configuration ---
# Populate the process environment from a .env file, if one is present.
# The .env file should sit in the same directory and contain the line:
# OPENAI_API_KEY="your_key_here"
load_dotenv()

# Stop right away when no API key is available. The key is never passed
# around explicitly: the openai.OpenAI() client is expected to read it
# from this environment variable on its own.
if os.environ.get("OPENAI_API_KEY") is None:
    raise ValueError("OPENAI_API_KEY not found in .env file or environment variables. Please add it.")

# Shared client instance used for every API request in this script.
client = openai.OpenAI()

def read_instruction_file(file_path):
    """
    Load an instruction (prompt) file as a single stripped string.

    This is a best-effort reader: it never raises. Any problem is reported
    on stdout and an empty string is returned instead.

    Args:
        file_path (str): Location of the instruction file (read as UTF-8).

    Returns:
        str: The file content with leading/trailing whitespace removed, or
             an empty string when the file is missing or cannot be read.
    """
    content = ""
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            raw_text = handle.read()
    except FileNotFoundError:
        print(f"Warning: Instruction file not found at {file_path}. An empty string will be used for the system prompt.")
    except Exception as err:
        # Deliberately broad: any other read problem degrades to "".
        print(f"Error reading file {file_path}: {err}")
    else:
        content = raw_text.strip()
    return content

def clean_text_batch_json_openai(batch_dict, model_name="gpt-4o-latest", system_prompt_content="", temperature=1):
    """
    Cleans a batch of texts using a single OpenAI API call with JSON mode.

    The batch is serialized to JSON, sent together with the system prompt,
    and the model's reply is parsed back into a dictionary.

    Args:
        batch_dict (dict): A dictionary of texts to clean, with the original index as the key.
                           Example: {10: "text to clean", 15: "another text"}
        model_name (str): The name of the OpenAI model to use.
        system_prompt_content (str): The system prompt content to use for cleaning.
                           When empty, 'system_prompt_cleaning_v2.txt' is read instead.
        temperature (float): The sampling temperature for generation (0.0 to 2.0).

    Returns:
        dict: The parsed JSON object returned by the model. JSON object keys are
              always strings, so the keys come back as strings (e.g. "10") even
              though the input keys were integers. Returns an empty dictionary on
              any failure (API error, empty reply, invalid JSON, non-object JSON).
    """
    # Fall back to the default instruction file when no prompt was supplied.
    if not system_prompt_content:
        system_prompt_content = read_instruction_file('system_prompt_cleaning_v2.txt')

    # --- Construct the user prompt ---
    # ensure_ascii=False keeps non-ASCII characters readable in the prompt
    # instead of turning them into \uXXXX escape sequences.
    texts_to_clean_json = json.dumps(batch_dict, indent=2, ensure_ascii=False)
    user_prompt = f"""
Please clean each text in the following JSON object based on the rules provided in the system prompt.
The keys in the object are the original identifiers.
Your response must be a single, valid JSON object that contains the exact same keys as the input,
but with the cleaned text as the corresponding values.
Do not add any commentary, explanations, or markdown formatting around your response.
Your entire output must be only the JSON object.

Input Texts:
{texts_to_clean_json}
"""

    # --- Call the OpenAI API ---
    # Only the request itself lives in this try block, so the handlers below
    # report genuine request failures and nothing else.
    try:
        response = client.chat.completions.create(
            model=model_name,
            # JSON mode asks the model to answer with a JSON string.
            response_format={"type": "json_object"},
            temperature=temperature,
            messages=[
                {"role": "system", "content": system_prompt_content},
                {"role": "user", "content": user_prompt}
            ]
        )
        response_content = response.choices[0].message.content
    except openai.APIError as e:
        print(f"An OpenAI API error occurred: {e}")
        time.sleep(1)  # Brief pause so the next batch is not sent immediately.
        return {}
    except Exception as e:
        print(f"An unexpected error occurred during the API call: {e}")
        time.sleep(1)
        return {}

    # The message content can be missing; json.loads(None) would raise a
    # TypeError, so report this case explicitly.
    if not response_content:
        print("Error: The model returned an empty response.")
        return {}

    # --- Parse the response ---
    try:
        cleaned_results_dict = json.loads(response_content)
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON from model response. Error: {e}")
        # It's helpful to see what the model returned if it wasn't valid JSON.
        print(f"Model response was:\n---\n{response_content}\n---")
        return {}

    # Callers iterate over .items(), so anything but a JSON object is unusable.
    if not isinstance(cleaned_results_dict, dict):
        print(f"Error: Expected a JSON object from the model but got {type(cleaned_results_dict).__name__}.")
        return {}

    return cleaned_results_dict


def process_csv_openai(input_file, output_file, model_name, system_prompt_content, batch_size=50):
    """
    Reads a CSV, processes a column in batches using the OpenAI API, and saves the result.

    The 'Segment Text' column is cleaned batch by batch and the results are
    written to a new 'Cleaned Text' column. The 'Segment Text' column is
    dropped before saving. Rows whose batch failed, or for which the model
    returned no usable value, keep a missing value in 'Cleaned Text'.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file.
        model_name (str): The name of the OpenAI model to use.
        system_prompt_content (str): The system prompt content to use for cleaning.
        batch_size (int): Number of rows to process in each batch (must be >= 1).

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_file}")
        return

    # Ensure the required column exists
    if 'Segment Text' not in df.columns:
        print("Error: 'Segment Text' column not found in the CSV file.")
        return

    # Create a new column for cleaned text, initialized with a placeholder.
    df['Cleaned Text'] = pd.NA

    # Get a series of the text to be processed, dropping any empty values.
    texts_to_process = df['Segment Text'].dropna()

    # Process texts in batches to manage API request size.
    num_batches = (len(texts_to_process) + batch_size - 1) // batch_size

    batch_starts = range(0, len(texts_to_process), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        batch_series = texts_to_process.iloc[start:start + batch_size]

        # The dictionary's keys are the original DataFrame index labels, which is
        # what lets the results be mapped back to the correct rows.
        batch_dict = batch_series.to_dict()

        print(f"Processing batch {batch_number}/{num_batches} (Rows {min(batch_dict)} to {max(batch_dict)})...")

        cleaned_batch_dict = clean_text_batch_json_openai(batch_dict, model_name, system_prompt_content)

        if not cleaned_batch_dict:
            print(f"Warning: Batch {batch_number} returned no data.")
            continue

        # Keys come back from JSON as strings, so match them against the string
        # form of the labels that were actually sent. Writing to a label that
        # is not in the index would make df.loc append a brand-new row.
        pending = {str(label): label for label in batch_dict}
        for returned_key, cleaned_text in cleaned_batch_dict.items():
            key = str(returned_key).strip()
            if key not in pending:
                print(f"Warning: Ignoring unexpected key {returned_key!r} in batch {batch_number}.")
                continue
            if not isinstance(cleaned_text, str):
                print(f"Warning: Ignoring non-text value for row {key} in batch {batch_number}.")
                continue
            df.loc[pending.pop(key), 'Cleaned Text'] = cleaned_text

        # Anything still pending was not (usably) returned by the model.
        if pending:
            print(f"Warning: Batch {batch_number} is missing results for rows: {list(pending.values())}")

    # Drop the original column after processing
    df = df.drop(columns=['Segment Text'])

    # Save the updated DataFrame to a new CSV file.
    # Using 'utf-8-sig' helps Excel open the file correctly with special characters.
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\nProcessing complete. Cleaned data saved to '{output_file}'")

# --- Main Execution ---
if __name__ == "__main__":
    # --- DEFINE YOUR FILE PATHS HERE ---
    # IMPORTANT: Replace this with the actual path to your input CSV file.
    # Using a raw string (r'...') is helpful on Windows to avoid issues with backslashes.
    input_csv_path =  r'D:\SOKM\11 Identity 2 SoKM 2024 - 2025\11 Identity 2 SoKM 2024 - 2025_transcript_english_SE_br_converted.csv'

    # Define the model name you want to use
    model_name = "gpt-4.1"  # Choose the model you want to use

    # --- Read the system prompt from a file ---
    system_prompt_content = read_instruction_file('system_prompt_cleaning_v2.3.txt')

    # Replace every non-alphanumeric character in the model name so it is safe
    # to embed in a file name.
    model_name_cleaned = re.sub(r'[^a-zA-Z0-9]', '_', model_name)

    # Derive the output path by inserting a suffix before the extension.
    # splitext only touches the final extension; a plain str.replace('.csv', ...)
    # would rewrite every '.csv' in the path and, for an extension such as
    # '.CSV', would leave the path unchanged so the input file gets overwritten.
    input_root, input_ext = os.path.splitext(input_csv_path)
    output_csv_path = f"{input_root}_cleaned_{model_name_cleaned}{input_ext or '.csv'}"

    # This batch size determines how many rows are sent in a single API request.
    # A size of 25-100 is a good starting point, but you can adjust it based on
    # the average length of your text to avoid exceeding model token limits.
    process_csv_openai(input_csv_path, output_csv_path, model_name, system_prompt_content, batch_size=50)



In [None]:
# Evaluating the cleaning process
# This section measures how much the cleaning step changed each segment.
# The WER (Word Error Rate) is computed with the original text as the
# reference and the cleaned text as the hypothesis.

import jiwer
# Example of how to use jiwer to calculate WER
def calculate_wer(reference, hypothesis):
    """
    Compute the Word Error Rate (WER) of a hypothesis against a reference.

    This is a thin wrapper that delegates directly to jiwer.wer.

    Args:
        reference (str): The ground truth text.
        hypothesis (str): The text being compared against the reference.

    Returns:
        float: The WER value reported by jiwer.
    """
    wer_value = jiwer.wer(reference, hypothesis)
    return wer_value

# Example usage
# Relies on input_csv_path / output_csv_path defined by the cleaning cell above.
if __name__ == "__main__":
    # Load the cleaned data and original data
    cleaned_df = pd.read_csv(output_csv_path)
    original_df = pd.read_csv(input_csv_path)

    # The cleaning step drops 'Segment Text' before saving, so add the original
    # text back to have both sides of the comparison in one DataFrame.
    if 'Segment Text' not in cleaned_df.columns:
        cleaned_df['Segment Text'] = original_df['Segment Text']

    # Verify both required columns exist
    if 'Cleaned Text' not in cleaned_df.columns:
        print("Error: 'Cleaned Text' column not found in cleaned data.")
    else:
        # Calculate WER for each row by comparing original and cleaned text.
        # Iterating over the column values directly avoids depending on the
        # index labels being exactly 0..n-1.
        wer_scores = []
        for original_text, cleaned_text in zip(cleaned_df['Segment Text'], cleaned_df['Cleaned Text']):
            has_both = pd.notna(original_text) and pd.notna(cleaned_text)
            # WER is undefined when the reference contains no words, so a blank
            # original text is treated like missing data.
            if has_both and str(original_text).strip():
                wer_scores.append(calculate_wer(str(original_text), str(cleaned_text)))
            else:
                wer_scores.append(None)  # Handle missing data

        cleaned_df['WER'] = wer_scores

        # Reorder columns to have Segment Text, Cleaned Text, and WER first,
        # followed by any other columns that exist.
        column_order = ['Segment Text', 'Cleaned Text', 'WER']
        other_columns = [col for col in cleaned_df.columns if col not in column_order]
        cleaned_df = cleaned_df[column_order + other_columns]

        # Save the results with WER to a new CSV file. splitext only touches the
        # final extension, so the new name can never equal the cleaned file's name.
        wer_root, wer_ext = os.path.splitext(output_csv_path)
        wer_output_path = f"{wer_root}_with_wer{wer_ext or '.csv'}"
        cleaned_df.to_csv(wer_output_path, index=False, encoding='utf-8-sig')
        print(f"\nWER results saved to '{wer_output_path}'")

Using Gemini to clean the text with proper context!

In [None]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
import re
import pandas as pd
import time
import json

# --- Configuration ---
# Populate the process environment from a .env file, if one is present.
load_dotenv()

# Look up the Google API key.
# The .env file is expected to contain GOOGLE_API_KEY="your_key_here"
api_key = os.environ.get("GOOGLE_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("GOOGLE_API_KEY not found in .env file. Please add it.")

# Hand the key to the Gemini SDK.
genai.configure(api_key=api_key)


def read_instruction_file(file_path):
    """
    Return the stripped text of an instruction file.

    Best-effort: a missing or unreadable file is reported on stdout and
    results in an empty string rather than an exception.
    """
    text = ""
    try:
        with open(file_path, mode='r', encoding='utf-8') as source:
            file_body = source.read()
    except FileNotFoundError:
        print(f"Warning: Instruction file not found at {file_path}. An empty string will be used.")
    except Exception as problem:
        # Deliberately broad: any other read problem degrades to "".
        print(f"Error reading file {file_path}: {problem}")
    else:
        text = file_body.strip()
    return text

def clean_text_batch_json(batch_dict, system_prompt_content, model_name="gemini-2.5-flash-latest", temperature=0.7):
    """
    Cleans a batch of texts using a single Gemini API call with JSON mode.

    Args:
        batch_dict (dict): A dictionary of texts to clean, with original index as key.
                           Example: {10: "text to clean", 15: "another text"}
        system_prompt_content (str): The system prompt content to use for cleaning.
        model_name (str): The name of the Gemini model to use.
        temperature (float): The temperature for generation.

    Returns:
        dict: The parsed JSON object returned by the model. JSON object keys are
              always strings, so the keys come back as strings (e.g. "10") even
              though the input keys were integers. Returns an empty dictionary on
              any failure (API error, empty reply, invalid JSON, non-object JSON).
    """

    # --- Construct the user prompt ---
    # The user prompt contains the data and the instructions for formatting the output.
    # ensure_ascii=False keeps non-ASCII characters readable in the prompt
    # instead of turning them into \uXXXX escape sequences.
    texts_to_clean_json = json.dumps(batch_dict, indent=2, ensure_ascii=False)
    user_prompt = f"""
Please clean each text in the following JSON object.
The keys are the original identifiers.
Return a single, valid JSON object that contains the exact same keys, but with the cleaned text as the values.
Do not add any commentary, explanations, or markdown formatting around your response. Your entire output must be only the JSON object.

Input Texts:
{texts_to_clean_json}
"""

    # --- Set up the Gemini model for JSON output ---
    model = genai.GenerativeModel(
        model_name=model_name,
        system_instruction=system_prompt_content,
        generation_config={
            "temperature": temperature,
            # Tell the model a JSON response is expected.
            "response_mime_type": "application/json",
        }
    )

    # --- Call the Gemini API ---
    try:
        response = model.generate_content(user_prompt)
        # Read the text once, inside the try.
        # NOTE(review): response.text may itself raise when the response
        # carries no text (e.g. a blocked reply) — confirm against the SDK.
        response_text = response.text
    except Exception as e:
        print(f"An unexpected error occurred during API call: {e}")
        # Small delay to ease potential rate limiting before the next batch.
        time.sleep(1)
        return {}  # Return empty dict on failure

    # json.loads(None) would raise a TypeError, so report an empty reply explicitly.
    if not response_text:
        print("Error: The model returned an empty response.")
        return {}

    # --- Parse the response ---
    try:
        cleaned_results_dict = json.loads(response_text)
    except json.JSONDecodeError as e:
        print(f"Error: Failed to decode JSON from model response. Error: {e}")
        print(f"Model response was:\n---\n{response_text}\n---")
        return {}  # Return empty dict on failure

    # Callers iterate over .items(), so anything but a JSON object is unusable.
    if not isinstance(cleaned_results_dict, dict):
        print(f"Error: Expected a JSON object from the model but got {type(cleaned_results_dict).__name__}.")
        return {}

    return cleaned_results_dict


def process_csv(input_file, output_file, model_name, system_prompt_content, batch_size=50):
    """
    Reads a CSV, processes a specific column in batches using the Gemini API's JSON mode,
    and saves the result.

    The 'Segment Text' column is cleaned batch by batch and the results are
    written to a new 'Cleaned Text' column. The 'Segment Text' column is
    dropped before saving. Rows whose batch failed, or for which the model
    returned no usable value, keep a missing value in 'Cleaned Text'.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output CSV file.
        model_name (str): The name of the Gemini model to use.
        system_prompt_content (str): The system prompt content to use for cleaning.
        batch_size (int): Number of rows to process in each batch (must be >= 1).

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    try:
        df = pd.read_csv(input_file)
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_file}")
        return

    if 'Segment Text' not in df.columns:
        print("Error: The 'Segment Text' column is not present in the CSV file.")
        return

    # Create a new column for the cleaned text, initialized as empty
    df['Cleaned Text'] = pd.NA

    # Get a series of the text to be processed, dropping any empty rows
    texts_to_process = df['Segment Text'].dropna()

    # Process texts in batches to manage API calls and context window limits
    num_batches = (len(texts_to_process) + batch_size - 1) // batch_size

    batch_starts = range(0, len(texts_to_process), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        # .iloc makes the positional slicing explicit; the index labels of
        # texts_to_process have gaps wherever empty rows were dropped.
        batch_series = texts_to_process.iloc[start:start + batch_size]

        # Keys are the original DataFrame index labels, which is what lets the
        # results be mapped back to the correct rows.
        batch_dict = batch_series.to_dict()

        print(f"Processing batch {batch_number}/{num_batches} (Rows {min(batch_dict)}-{max(batch_dict)})...")

        # The model is selected by the caller through model_name, e.g.
        # 'gemini-2.5-flash-lite-preview-06-17', 'gemini-2.5-flash' or 'gemini-2.5-pro'.
        cleaned_batch_dict = clean_text_batch_json(batch_dict, system_prompt_content, model_name)

        if not cleaned_batch_dict:
            print(f"Warning: Batch {batch_number} returned no data.")
            continue

        # Keys come back from JSON as strings, so match them against the string
        # form of the labels that were actually sent. Writing to a label that
        # is not in the index would make df.loc append a brand-new row.
        pending = {str(label): label for label in batch_dict}
        for returned_key, cleaned_text in cleaned_batch_dict.items():
            key = str(returned_key).strip()
            if key not in pending:
                print(f"Warning: Ignoring unexpected key {returned_key!r} in batch {batch_number}.")
                continue
            if not isinstance(cleaned_text, str):
                print(f"Warning: Ignoring non-text value for row {key} in batch {batch_number}.")
                continue
            df.loc[pending.pop(key), 'Cleaned Text'] = cleaned_text

        # Anything still pending was not (usably) returned by the model.
        if pending:
            print(f"Warning: Batch {batch_number} is missing results for rows: {list(pending.values())}")

    # Drop the original column
    df = df.drop(columns=['Segment Text'])

    # Save the updated DataFrame to a new CSV file
    df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\nProcessing completed and saved to '{output_file}'")
    
    
# --- Main Execution ---
if __name__ == "__main__":
    # --- DEFINE YOUR FILE PATHS HERE ---
    # IMPORTANT: Replace this with the actual path to your input CSV file.
    # Using a raw string (r'...') is helpful on Windows to avoid issues with backslashes.
    input_csv_path = r'D:\SOKM\Testing\01 Introduction SoKM 2024 - 2025 4k_audio_english_fixed.csv'

    # Define the model name you want to use
    model_name = "gemini-2.5-pro"  # Choose the model you want to use

    # --- Read the system prompt from a file ---
    system_prompt_file = "system_prompt_cleaning_v1.1.txt"
    system_prompt_content = read_instruction_file(system_prompt_file)

    # system_prompt_content_tag holds only the version number taken from the
    # prompt file name, not the prompt content.
    # Eg: system_prompt_translate_v1.1 leads to system_prompt_content_tag = "v1.1"
    # It is used for generating the output file name. A file name without a
    # "vX.Y" part gets a placeholder tag instead of crashing on a failed match.
    version_match = re.search(r"v\d+\.\d+", system_prompt_file)
    system_prompt_content_tag = version_match.group(0) if version_match else "unversioned"

    # Replace every non-alphanumeric character in the model name so it is safe
    # to embed in a file name.
    model_tag = re.sub(r'[^a-zA-Z0-9]', '_', model_name)

    # Derive the output path by inserting a suffix before the extension.
    # splitext only touches the final extension; a plain str.replace(".csv", ...)
    # would rewrite every ".csv" in the path and, for an extension such as
    # ".CSV", would leave the path unchanged so the input file gets overwritten.
    input_root, input_ext = os.path.splitext(input_csv_path)
    output_csv_path = (
        f"{input_root}_cleaned_{model_tag}_p{system_prompt_content_tag}{input_ext or '.csv'}"
    )

    # --- Run the process ---
    # This batch size determines how many rows are sent in a single API request.
    # You can adjust it based on the average length of your text and model context limits.
    # A size of 25-100 is usually a good starting point.
    process_csv(input_csv_path, output_csv_path, model_name, system_prompt_content, batch_size=50)