In [25]:
"""
This script transforms a CSV file by performing specific data manipulations.
It lowercases the 'Question' and 'Answer' column names, renames 'llm_answer' to 'llm_rag_answer',
removes the 'URL' column, keeps only the text that follows the "ANSWER:" marker in each LLM answer,
and collapses the whitespace in that LLM answer column so each answer forms a single paragraph.

Functions:
- load_csv(file_path): Load a CSV file into a DataFrame.
- lowercase_column_names(df, columns): Change specified column names in a DataFrame to lowercase.
- rename_column(df, old_name, new_name): Rename a column in a DataFrame.
- remove_column(df, column): Remove a column from a DataFrame.
- split_answer(answer): Return the text following the "ANSWER:" marker in a single answer string.
- extract_answer_part(df, column): Extract the part of each answer following "ANSWER:".
- clean_answer_column(df, column): Clean up the answer column to form a single paragraph.
- save_csv(df, file_path): Save a DataFrame to a CSV file.
- transform_csv(input_path, output_path): Perform the entire transformation process on the CSV file.
"""

import pandas as pd

def load_csv(file_path):
    """Read the CSV at ``file_path`` and return its contents as a pandas DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

def lowercase_column_names(df, columns):
    """Lowercase the column names of ``df`` that are listed in ``columns``.

    Names not listed in ``columns`` are kept as they are. The header of ``df``
    is replaced in place and the same DataFrame object is returned.
    """
    new_header = []
    for name in df.columns:
        if name in columns:
            new_header.append(name.lower())
        else:
            new_header.append(name)
    df.columns = new_header
    return df

def rename_column(df, old_name, new_name):
    """Rename column ``old_name`` to ``new_name`` in place and return the same DataFrame."""
    mapping = {old_name: new_name}
    df.rename(columns=mapping, inplace=True)
    return df

def remove_column(df, column):
    """Return a new DataFrame without ``column``; ``df`` itself is left unchanged."""
    trimmed = df.drop(columns=column)
    return trimmed

def split_answer(answer, marker="ANSWER:\n        \n        "):
    """Return the text that follows the last occurrence of ``marker`` in ``answer``.

    Parameters
    ----------
    answer : str or any
        Raw text of one answer cell. Non-string values (for example ``None``
        or the NaN that ``pd.read_csv`` yields for an empty cell) are returned
        unchanged instead of raising ``AttributeError``.
    marker : str, optional
        Delimiter that precedes the answer text. The default is the exact,
        whitespace-sensitive marker this function has always split on.

    Returns
    -------
    str or any
        The part of ``answer`` after the last ``marker``; the whole string
        when ``marker`` does not occur; ``answer`` itself when it is not a
        string.
    """
    # Missing cells are not strings and have no .split(); pass them through.
    if not isinstance(answer, str):
        return answer
    # [-1] keeps the segment after the last marker, or the full text if absent.
    return answer.split(marker)[-1]

def extract_answer_part(df, column):
    """Replace every value in ``column`` with the text after its 'ANSWER:' marker.

    Each value is passed through ``split_answer``. The column is overwritten
    in place and the same DataFrame is returned.
    """
    extracted = df[column].apply(split_answer)
    df[column] = extracted
    return df

def clean_answer_column(df, column):
    """Collapse all whitespace in ``column`` so each value is a single paragraph.

    Every run of whitespace (spaces, tabs, newlines) inside a string value is
    replaced by one space, and leading/trailing whitespace is removed.
    Non-string values (for example ``None`` or NaN for empty cells) are left
    unchanged instead of raising ``AttributeError``.

    The column is overwritten in place and the same DataFrame is returned.
    """
    def _collapse(value):
        # Only strings have .split(); missing cells are passed through as-is.
        if not isinstance(value, str):
            return value
        return ' '.join(value.split())

    df[column] = df[column].apply(_collapse)
    return df

def save_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, leaving out the index column."""
    df.to_csv(path_or_buf=file_path, index=False)

def transform_csv(input_path, output_path):
    """Transform the CSV at ``input_path`` and write the result to ``output_path``.

    Steps, in order:
    1. Load the CSV.
    2. Lowercase the 'Question' and 'Answer' column names.
    3. Rename 'llm_answer' to 'llm_rag_answer'.
    4. Drop the 'URL' column.
    5. Keep only the text after the "ANSWER:" marker in 'llm_rag_answer'.
    6. Collapse whitespace in 'llm_rag_answer' to a single paragraph.
    7. Save the result without the index.

    Parameters
    ----------
    input_path : path of the CSV file to read.
    output_path : path of the CSV file to write.
    """
    df = load_csv(input_path)
    df = lowercase_column_names(df, ['Question', 'Answer'])
    df = rename_column(df, 'llm_answer', 'llm_rag_answer')
    df = remove_column(df, 'URL')
    # The column was renamed above, so the remaining steps must address it by
    # its new name; indexing 'llm_answer' here would raise KeyError.
    df = extract_answer_part(df, 'llm_rag_answer')
    df = clean_answer_column(df, 'llm_rag_answer')
    save_csv(df, output_path)

In [4]:
# Define file paths
# input_path: CSV that is loaded and transformed (relative path).
# output_path: destination of the transformed CSV (relative path).
input_path = '../06_Data/Capstone_Data/llm_testing_results/lora_plus_rag_testing_output.csv'
output_path = 'scoring/data/rag_and_llm_output_test.csv'

In [5]:
# Load the input CSV into `df` for interactive inspection.
df = load_csv(input_path)

In [21]:
# Scratch check: split the 'llm_answer' text of row 1 on the "ANSWER:" marker by hand
# and show the part after the last marker.
df['llm_answer'].iloc[1].split("ANSWER:\n        \n        ")[-1]

'To view your DHCP option sets in Amazon VPC, you can use the Amazon VPC console or the command line. The default settings for a DHCP option set include domain name servers and domain name, but there are no settings for NTP servers or domain name resolution. The default DHCP option set is automatically created when you create a VPC, and it is not possible to modify the default settings. If you need to modify the default settings, you must create a new DHCP option set. The default DHCP option set is used by default when you create a VPC, but you can also associate a custom DHCP option set with a VPC. For more information about creating a custom DHCP option set, see the "Create a custom DHCP option set" section in the "Work with DHCP option sets" topic in the Amazon VPC User Guide.\n\n        The AWS Documentation does not provide a specific example of how to view DHCP option sets in Amazon VPC, but it does provide detailed instructions on how to create, modify, and delete DHCP option se

In [33]:
# Scratch check of the helper on the same row. NOTE: extract_answer_part overwrites
# df['llm_answer'] in place, so `df` is modified once this cell has run.
# .iloc[1, 3] reads row 1 at column position 3 (presumably 'llm_answer' — confirm
# against the CSV's column order).
extract_answer_part(df, 'llm_answer').iloc[1,3]

'To view your DHCP option sets in Amazon VPC, you can use the Amazon VPC console or the command line. The default settings for a DHCP option set include domain name servers and domain name, but there are no settings for NTP servers or domain name resolution. The default DHCP option set is automatically created when you create a VPC, and it is not possible to modify the default settings. If you need to modify the default settings, you must create a new DHCP option set. The default DHCP option set is used by default when you create a VPC, but you can also associate a custom DHCP option set with a VPC. For more information about creating a custom DHCP option set, see the "Create a custom DHCP option set" section in the "Work with DHCP option sets" topic in the Amazon VPC User Guide. The AWS Documentation does not provide a specific example of how to view DHCP option sets in Amazon VPC, but it does provide detailed instructions on how to create, modify, and delete DHCP option sets. The doc

In [None]:
# Run the transformation
# Reads the CSV at input_path, applies the steps in transform_csv, and writes the
# result to output_path.
transform_csv(input_path, output_path)