# In [1]:
import os
import csv

def _write_filtered_csv(csv_path, tmp_path, excluded_ids):
    """Copy csv_path to tmp_path, dropping rows whose 'movie_id' is in excluded_ids.

    Raises:
        FileNotFoundError: If csv_path does not exist.
        UnicodeDecodeError: If csv_path is not valid UTF-8.
        ValueError: If the file is empty, has no 'movie_id' header, a row is
            too short to contain that column, or an id is not an integer.
    """
    # newline='' lets the csv module handle line endings itself, which is
    # required for quoted fields that contain embedded newlines.
    with open(csv_path, 'r', newline='', encoding='utf-8') as src, \
         open(tmp_path, 'w', newline='', encoding='utf-8') as dst:
        reader = csv.reader(src)
        writer = csv.writer(dst)

        header = next(reader, None)
        if header is None:
            raise ValueError(f"{csv_path} is empty (no header row)")

        # list.index raises ValueError when the column is absent.
        movie_id_index = header.index('movie_id')
        writer.writerow(header)

        for row in reader:
            if not row:
                # Blank line: carries no movie_id, nothing to keep.
                continue
            if len(row) <= movie_id_index:
                raise ValueError(
                    f"{csv_path}: line {reader.line_num} has no 'movie_id' column"
                )
            if int(row[movie_id_index]) not in excluded_ids:
                writer.writerow(row)


def check_and_clean_csv(image_folder, csv_file1, csv_file2):
    """
    Checks for missing images, deletes corresponding entries in CSV files,
    and overwrites the existing CSV files with cleaned data.

    An image id counts as "missing" when it lies between 1 and the highest
    numeric ``<id>.jpg`` present in ``image_folder`` but has no file of its
    own. Ids above the highest existing image are not treated as missing.
    ``.jpg`` files whose name is not an integer are ignored.

    Both CSV files must have a header row containing a ``movie_id`` column.
    Cleaned data is first written to scratch files next to each CSV; the
    originals are only replaced once both files were processed successfully,
    and the scratch files are removed if anything goes wrong.

    FileNotFoundError, UnicodeDecodeError and ValueError raised while
    processing the CSV files are reported on stdout instead of propagating.
    The list of missing images is printed in either case.

    Args:
        image_folder: Path to the folder containing images.
        csv_file1: Path to the first CSV file.
        csv_file2: Path to the second CSV file.
    """

    # Collect numeric image ids; skip .jpg files with non-numeric names
    # instead of letting int() abort the whole clean-up.
    image_ids = set()
    for filename in os.listdir(image_folder):
        if not filename.endswith('.jpg'):
            continue
        try:
            image_ids.add(int(os.path.splitext(filename)[0]))
        except ValueError:
            continue

    # Missing ids are the gaps in 1..max(existing ids)
    missing_images = set()
    if image_ids:
        missing_images = set(range(1, max(image_ids) + 1)) - image_ids

    # Scratch files live beside their targets so os.replace never has to
    # cross a filesystem boundary and never touches the working directory.
    targets = [
        (csv_path, f"{os.fspath(csv_path)}.{position}.tmp")
        for position, csv_path in enumerate((csv_file1, csv_file2), start=1)
    ]

    try:
        for csv_path, tmp_path in targets:
            _write_filtered_csv(csv_path, tmp_path, missing_images)

        # Replace original files with cleaned data (only after both succeeded)
        for csv_path, tmp_path in targets:
            os.replace(tmp_path, csv_path)

    except (FileNotFoundError, UnicodeDecodeError, ValueError) as e:
        print(f"Error: {e}")
        print("*Please check file paths and consider using a different encoding (e.g., 'latin-1', 'cp1252') if 'utf-8' doesn't work.*")

    finally:
        # After a successful replace the scratch files no longer exist;
        # after a failure this removes any partial output.
        for _, tmp_path in targets:
            try:
                os.remove(tmp_path)
            except FileNotFoundError:
                pass

    # Print missing images
    if missing_images:
        print("Missing images:")
        for image in sorted(missing_images):
            print(f"{image}.jpg")

# Example usage
# NOTE: running this rewrites both CSV files in place.
image_folder = "data/posters"
csv_file1 = "data/items.csv"
csv_file2 = "data/ratings.csv"

check_and_clean_csv(
    image_folder=image_folder,
    csv_file1=csv_file1,
    csv_file2=csv_file2,
)

# Output:
# Missing images:
# 330.jpg
# 387.jpg
# 563.jpg
# 767.jpg
# 868.jpg
# 907.jpg
# 1026.jpg
# 1331.jpg
# 1340.jpg
# 1346.jpg
# 1359.jpg
# 1420.jpg
# 1431.jpg
# 1498.jpg
# 1516.jpg
# 1536.jpg
# 1568.jpg
# 1586.jpg
# 1667.jpg
