# In [1]:
import csv

def split_qa_csv(input_filepath: str, correct_filepath: str, review_filepath: str) -> None:
    """
    Split a QA CSV file into two new files based on whether the questions
    can be reasonably answered from a photograph.

    A row goes to the review file when its ``query`` column contains any of
    the review phrases listed below (case-insensitive substring match);
    every other row goes to the correct file. Both output files receive the
    same header row as the input. A completely empty input file (no header)
    produces two empty output files.

    Args:
        input_filepath (str): The path to the input CSV file.
        correct_filepath (str): The path to the output CSV file for correct questions.
        review_filepath (str): The path to the output CSV file for questions needing review.

    Raises:
        KeyError: If the input has data rows but no ``query`` column.
    """

    review_phrases = [
        "according to",
        "which room in particular",
        "what was located in",
        "what is a tablinum",
        "what is a peristyle",
        "what is an oecus",
        "what is a lararium",
        "what is a rhyton",
        "what are the dimensions",
        "who is credited",
        "who created",
        "what is the reference",
        "what book mentions",
        "what does the text under the drawing say",
        "who provided the photograph",
        "what is the location of the painting in relation to the atrium",
        "what is the location of the painting in relation to the bedroom",
        "What city is depicted in the image?",
        "What city are the ruins depicted in the image located in?",
        "What is the likely location of the medallion?",
        "What are some of the elements visible in the painting besides the central scene?",
        "What part of the room is shown in the image?",
        "Which wall is shown in the image?",
        "Which wall of the room is shown in the image?",
        "What is the designation of the room shown in the image?",
        "What is the specific location of the painted decoration within the cubiculum?",
        "What type of room is this located in?",
        "What type of room was this painting found in?",
        "What type of room is the Cubiculum?",
        "What type of structure might an apodyterium be?",
        "What part of the building is Room 13 located in (according to the image description)?",
        "What is the location of the original painting within the Pompeii site (as per the description)?",
        "What type of room is depicted in the photograph?",
        "What is the location of the wall painting within the building?",
        "What type of building is this room likely part of?",
        "what was depicted on the zoccolo",
    ]

    # Each query is lower-cased before matching, so the phrases must be
    # lower-cased too; otherwise any phrase containing a capital letter can
    # never match. dict.fromkeys drops phrases that become duplicates once
    # case is ignored while keeping the original order.
    review_keywords = tuple(dict.fromkeys(phrase.lower() for phrase in review_phrases))

    # newline='' on every file lets the csv module handle line endings itself,
    # which is required for quoted fields that contain embedded newlines.
    with open(input_filepath, 'r', newline='', encoding='utf-8') as infile, \
            open(correct_filepath, 'w', newline='', encoding='utf-8') as correctfile, \
            open(review_filepath, 'w', newline='', encoding='utf-8') as reviewfile:

        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames
        if fieldnames is None:
            # Empty input: there is no header to copy, so leave both outputs empty.
            return

        correct_writer = csv.DictWriter(correctfile, fieldnames=fieldnames)
        review_writer = csv.DictWriter(reviewfile, fieldnames=fieldnames)

        correct_writer.writeheader()
        review_writer.writeheader()

        for row in reader:
            # DictReader fills columns missing from a short row with None;
            # treat that as an empty question, which matches no phrase.
            query = (row['query'] or '').lower()
            needs_review = any(keyword in query for keyword in review_keywords)

            writer = review_writer if needs_review else correct_writer
            writer.writerow(row)

# Example usage: one input CSV and the two output CSVs its rows are split into.
input_csv_path = 'artemis_cup_theseus_qa_pairs_filtered.csv'
correct_csv_path = 'artemis_cup_theseus_qa_pairs_correct.csv'
review_csv_path = 'artemis_cup_theseus_qa_pairs_review.csv'

# Runs immediately when this cell/module is executed and overwrites both outputs.
split_qa_csv(
    input_filepath=input_csv_path,
    correct_filepath=correct_csv_path,
    review_filepath=review_csv_path,
)