## Imports 

In [None]:
# Import necessary modules
from openai import AzureOpenAI
import os
import json
import json_parser
from sklearn.model_selection import train_test_split
import pandas as pd

## Synthetic Data Generation using OpenAI API and QnA dataset


In [None]:

# Import necessary libraries

# Read the CSV file and extract the 'Paragraphs' column
# Replace 'your_dataset.csv' with the actual path to your dataset
df = pd.read_csv('your_dataset.csv')
paragraphs = df['Paragraphs']

# Keep only paragraphs longer than 1000 characters.
# Empty CSV cells are loaded as NaN (a float), which has no len(); the
# isinstance guard skips those and any other non-text cells instead of crashing.
filtered_paragraphs = [
    paragraph
    for paragraph in paragraphs
    if isinstance(paragraph, str) and len(paragraph) > 1000
]

# Print the count of such paragraphs
print(f"The dataset contains {len(filtered_paragraphs)} paragraphs with a length greater than 1000.")

# Function to initialize the Azure OpenAI model
def initilize_model(
    azure_endpoint: str = "https://your-azure-endpoint.openai.azure.com/",
    api_version: str = "2024-02-01",
):
    """
    Create and return an Azure OpenAI client.

    The API key is read from the AZURE_OPENAI_API_KEY environment variable so
    that it never has to be written into the notebook. The endpoint and API
    version are parameters whose defaults match the previously hard-coded
    values, so calling the function with no arguments behaves as before.

    Args:
        azure_endpoint: Base URL of the Azure OpenAI resource.
        api_version: Azure OpenAI REST API version to use.

    Returns:
        The configured AzureOpenAI client.

    Raises:
        KeyError: If AZURE_OPENAI_API_KEY is not set in the environment.
    """
    client = AzureOpenAI(
        azure_endpoint=azure_endpoint,
        api_version=api_version,
        api_key=os.environ["AZURE_OPENAI_API_KEY"],
    )
    print("Model Initialized")
    return client

# Function to perform inference on a given paragraph
def inference(client, paragraph, *, model="test-gpt-4o", temperature=0.5, max_tokens=900):
    """
    Send a paragraph to the chat model and return its raw text reply.

    The prompt asks the model to extract key points, remove some of them from
    the paragraph, and answer in a JSON format with the keys "Answer",
    "key_points" and "PointsMissed". The reply is returned as-is (a string);
    it is not parsed or validated as JSON here.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (e.g. the
            client returned by ``initilize_model``).
        paragraph: Text to append to the prompt.
        model: Deployment/model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The content string of the first choice in the response.

    Raises:
        ValueError: If the response has no choices, or the first choice's
            content is missing (None), empty, or whitespace only.
    """
    # Prompt for the model
    prompt = """You are an advanced language model. Your task is to process a given paragraph and perform the following steps:
    1. Extract 2-5 main key points from the paragraph. The number of key points should depend on the length of the paragraph, and the key points must be derived strictly from the paragraph itself.
    2. Randomly remove some of the extracted key points (up to 1-4 points) from the paragraph to create a modified version of the paragraph with missing information.
    3. Provide the missing key points that were removed.
    4. Clean the paragraph by removing any unnecessary information, ensuring it is concise and focused.

    Your output must be in the following JSON format:
    {
        "Answer": "paragraph with missing key points",
        "key_points": [list of extracted key points],
        "PointsMissed": [list of key points that were removed]
    }

    Before generating the result:
    - Double-check the extracted key points to ensure they are accurate and strictly derived from the paragraph.
    - Verify the modified paragraph to ensure it aligns with the missing key points and is logically consistent.

    Here is the paragraph for you to process:
    """

    # Prepare the messages for the model
    messages = [
        {"role": "system", "content": "You are an advanced language model. Your task is to process a given paragraph and perform the steps as instructed."},
        {"role": "user", "content": prompt + paragraph}
    ]
    print("Processing Paragraph:", paragraph)
    print("Generating JSON Response...")

    # Call the Azure OpenAI model
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens
    )

    # An empty choices list would otherwise surface as an opaque IndexError
    if not response.choices:
        raise ValueError("The response content is empty.")

    # The content may be None rather than a string; check for that before
    # calling .strip() so the caller gets the intended ValueError
    content = response.choices[0].message.content
    if content is None or not content.strip():
        raise ValueError("The response content is empty.")
    return content

# Function to parse and save the JSON response
def parse_json(test_data, json_string, output_file):
    """
    Parse the model's JSON reply, tag it with the source paragraph, and append it to a file.

    A surrounding Markdown code fence (```json ... ``` or ``` ... ```) is
    stripped before parsing. The parsed object is merged into a new dict whose
    first key is "testmessage" (the original paragraph), and that dict is
    appended to the JSON list stored in ``output_file``.

    Args:
        test_data: The original paragraph; stored under the "testmessage" key.
        json_string: Raw text returned by the model.
        output_file: Path of the JSON file holding the list of records. It is
            created if missing. If it exists but holds a single non-list
            value, that value becomes the first list element. If it exists
            but is not valid JSON, its content is replaced.

    Returns:
        The stored record (dict) on success, or None if ``json_string`` is not
        valid JSON or does not decode to a JSON object.
    """
    try:
        # Clean up the JSON string
        json_string = json_string.strip()
        if json_string.startswith("```json") and json_string.endswith("```"):
            json_string = json_string[7:-3].strip()
        elif json_string.startswith("```") and json_string.endswith("```"):
            json_string = json_string[3:-3].strip()

        # Parse the JSON string
        json_data = json.loads(json_string)

        # Only a JSON object can be merged with ** below; reject anything else
        # through the ValueError handler instead of an uncaught TypeError
        if not isinstance(json_data, dict):
            raise ValueError(
                f"Expected a JSON object but got {type(json_data).__name__}."
            )

        # Add the test_data (original paragraph) to the JSON data
        json_data = {"testmessage": test_data, **json_data}

        # Read whatever is already stored (if anything) ...
        existing_data = []
        if os.path.exists(output_file):
            with open(output_file, 'r', encoding='utf-8') as file:
                try:
                    loaded = json.load(file)
                except json.JSONDecodeError:
                    # Unparsable file: start over with a fresh list
                    loaded = []
            existing_data = loaded if isinstance(loaded, list) else [loaded]
        existing_data.append(json_data)

        # ... then rewrite the file from scratch. Opening with 'w' truncates
        # it, so no stale bytes from a longer previous content can remain
        # after the new JSON and corrupt the file.
        with open(output_file, 'w', encoding='utf-8') as file:
            json.dump(existing_data, file, indent=4)

        print(f"JSON data successfully appended to {output_file}")
        return json_data

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        return None
    except ValueError as e:
        print(f"Error: {e}")
        return None

# Main execution block
if __name__ == "__main__":
    # Build the API client once and reuse it for every request
    client = initilize_model()

    # All generated records are accumulated in this single JSON file
    output_file = 'output.json'

    # Run generation for each long paragraph; a failure on one paragraph is
    # reported and skipped so the remaining paragraphs are still processed
    for paragraph in filtered_paragraphs:
        test_data = paragraph
        try:
            result = inference(client, test_data)

            # Guard clause: nothing to store when the model returned nothing
            if not result:
                print("Error: No valid result to parse.")
                continue

            # Parse the reply and append it to the output file
            parse_json(test_data, result, output_file)
        except Exception as exc:
            print(f"Skipping paragraph due to error: {exc}")


### Extracting 300 examples for Testing 

In [None]:

# Read back every record written by the generation step
with open('output.json', 'r') as file:
    data = json.load(file)

# Hold out 300 records for testing; the fixed random_state keeps the split reproducible
train_data, test_data = train_test_split(data, test_size=300, random_state=42)

# Write each partition to its own JSON file
for split_path, split_records in (
    ('train_data.json', train_data),
    ('test_data.json', test_data),
):
    with open(split_path, 'w') as split_file:
        json.dump(split_records, split_file, indent=4)

print("Training and testing datasets created successfully.")

### Convert the existing training dataset to the Alpaca prompt-template format so it can be used for training

In [None]:
import json

# Fixed instruction text attached to every training example
instruction = "Identify key points missing from the given answer."


def format_training_data(data):
    """
    Reshape generated records into instruction / input / output examples.

    Each record must provide the keys "Answer", "key_points" and
    "PointsMissed"; any other keys are ignored. A missing key raises KeyError.

    Args:
        data: Iterable of record dicts.

    Returns:
        A list with one dict per record, holding the shared instruction, an
        "input" dict ("Answer", "Key_Points") and an "output" dict
        ("Points_Missed").
    """
    return [
        {
            "instruction": instruction,
            "input": {
                'Answer': record["Answer"],
                "Key_Points": record["key_points"],
            },
            "output": {
                'Points_Missed': record["PointsMissed"],
            },
        }
        for record in data
    ]

# Load the training data
train_data_path = 'train_data.json'  # Ensure this points to a valid JSON file
# Open inside a context manager so the file handle is closed as soon as the
# data has been read (a bare open() passed to json.load is never closed explicitly)
with open(train_data_path) as train_file:
    train_data = json.load(train_file)

# Format the training data
formatted_data = format_training_data(train_data)

# Save the formatted data to a new JSON file
with open('formatted_train_data.json', 'w') as f:
    json.dump(formatted_data, f, indent=4)

print("Formatted training data saved to formatted_train_data.json")