In [None]:
import ast
import os
from collections import Counter

import pandas as pd
from PIL import Image

In [1]:

# Read the per-image metadata table that the rest of the notebook works from.
df = pd.read_csv('full_df.csv')

# Plain-text preview of the first five rows (wide frames wrap across several
# column groups in the printed output).
print(df.head(5))

   ID  Patient Age Patient Sex Left-Fundus Right-Fundus  \
0   0           69      Female  0_left.jpg  0_right.jpg   
1   1           57        Male  1_left.jpg  1_right.jpg   
2   2           42        Male  2_left.jpg  2_right.jpg   
3   4           53        Male  4_left.jpg  4_right.jpg   
4   5           50      Female  5_left.jpg  5_right.jpg   

                            Left-Diagnostic Keywords  \
0                                           cataract   
1                                      normal fundus   
2  laser spot，moderate non proliferative retinopathy   
3                        macular epiretinal membrane   
4             moderate non proliferative retinopathy   

                Right-Diagnostic Keywords  N  D  G  C  A  H  M  O  \
0                           normal fundus  0  0  0  1  0  0  0  0   
1                           normal fundus  1  0  0  0  0  0  0  0   
2  moderate non proliferative retinopathy  0  1  0  0  0  0  0  1   
3       mild nonproliferative re

In [2]:

# Tally how many rows carry each diagnostic label.
#
# Every entry of df['labels'] is the *string* form of a Python list
# (e.g. "['M']"). It is parsed with ast.literal_eval, which accepts only
# literals; unlike eval it cannot execute code that happens to be embedded in
# the CSV.
label_counts = Counter()

for labels in df['labels']:
    # Counter.update adds one per element, keeping first-seen key order.
    label_counts.update(ast.literal_eval(labels))

# Print the number of images in each class
print("\nNumber of images in each class:")
for label, count in label_counts.items():
    print(f"{label}: {count}")


Number of images in each class:
N: 2873
D: 1608
O: 708
M: 232
H: 128
C: 293
A: 266
G: 284


In [3]:
# Horizontal-flip augmentation for classes M, C, A and G.
#
# For every matching row whose image exists in `actual_image_dir`, a mirrored
# copy is written next to the original as "H-Flip_<filename>" and a metadata
# row pointing at that copy is collected. The collected rows then overwrite
# 'new_augmented_data.csv' (this cell starts the augmented file from scratch).

# Load the DataFrame
df = pd.read_csv('full_df.csv')

# Define the file path to the actual images directory
actual_image_dir = 'preprocessed_images'

# Initialize a counter for augmented images by class
class_counts = Counter()

# Filter the DataFrame for the specified classes.
# The label lists are stored as strings; literal_eval parses them without
# executing arbitrary code the way eval would.
specified_classes = {'M', 'C', 'A', 'G'}
filtered_df = df[df['labels'].apply(
    lambda x: any(label in specified_classes for label in ast.literal_eval(x))
)]

# Perform horizontal flip and save the images
augmented_data = []
missing_images = []  # filenames listed in the CSV but absent on disk
for index, row in filtered_df.iterrows():
    filename = row['filename']

    image_path = os.path.join(actual_image_dir, filename)
    if not os.path.exists(image_path):
        # Previously skipped silently; remember it so the gap is visible.
        missing_images.append(filename)
        continue

    # Save the flipped image with the new naming convention in the same directory
    flipped_image_filename = f"H-Flip_{filename}"
    flipped_image_path = os.path.join(actual_image_dir, flipped_image_filename)

    # The context manager releases the source file handle as soon as the
    # flipped copy has been written, instead of leaking one handle per image.
    with Image.open(image_path) as image:
        flipped_image = image.transpose(Image.FLIP_LEFT_RIGHT)
        flipped_image.save(flipped_image_path)

    # Append the original row data with updated filename to the new DataFrame
    augmented_row = row.copy()
    augmented_row['filename'] = flipped_image_filename
    augmented_data.append(augmented_row)

    # Count the classes in the labels
    class_counts.update(
        label for label in ast.literal_eval(row['labels'])
        if label in specified_classes
    )

# Create a new DataFrame from the augmented data.
# Passing the column list keeps the CSV header intact even when no image was
# augmented; a header-less file could not be read back by the following cells.
augmented_df = pd.DataFrame(augmented_data, columns=df.columns)

# Save the augmented DataFrame to a new CSV file
augmented_df.to_csv('new_augmented_data.csv', index=False)

# Print class counts after augmentation
print("\nClass counts after augmentation:")
for label, count in class_counts.items():
    print(f"{label}: {count}")

if missing_images:
    print(f"\nSkipped {len(missing_images)} rows whose image was not found in '{actual_image_dir}'.")

print("\nData augmentation complete and saved to 'new_augmented_data.csv'.")



Class counts after augmentation:
M: 232
C: 293
A: 266
G: 284

Data augmentation complete and saved to 'new_augmented_data.csv'.


In [4]:


# Vertical-flip augmentation for classes M, C, A and G.
#
# For every matching row whose image exists in `actual_image_dir`, an
# upside-down copy is written as "V-Flip_<filename>" and a metadata row for it
# is appended to the rows already stored in 'new_augmented_data.csv'.
# Re-running the cell replaces the rows it appended earlier rather than
# duplicating them.

# Load the original DataFrame
df = pd.read_csv('full_df.csv')

# Load the existing augmented DataFrame
augmented_df = pd.read_csv('new_augmented_data.csv')

# Define the file path to the actual images directory
actual_image_dir = 'preprocessed_images'

# Initialize a counter for augmented images by class
class_counts_before = Counter()
class_counts_after = Counter()

# Filter the DataFrame for the specified classes.
# Label lists are stored as strings; literal_eval parses them without the
# code-execution risk of eval.
specified_classes = {'M', 'C', 'A', 'G'}
filtered_df = df[df['labels'].apply(
    lambda x: any(label in specified_classes for label in ast.literal_eval(x))
)]

# Count classes before augmentation
for labels in filtered_df['labels']:
    class_counts_before.update(
        label for label in ast.literal_eval(labels)
        if label in specified_classes
    )

# Print class counts before augmentation
print("Class counts before augmentation:")
for label, count in class_counts_before.items():
    print(f"{label}: {count}")

# List to store new rows for the augmented DataFrame
new_rows = []
missing_images = []  # filenames listed in the CSV but absent on disk

# Perform vertical flip and save the images
for index, row in filtered_df.iterrows():
    filename = row['filename']

    image_path = os.path.join(actual_image_dir, filename)
    if not os.path.exists(image_path):
        # Previously skipped silently; remember it so the gap is visible.
        missing_images.append(filename)
        continue

    # Save the flipped image with the new naming convention in the same directory
    flipped_image_filename = f"V-Flip_{filename}"
    flipped_image_path = os.path.join(actual_image_dir, flipped_image_filename)

    # Close the source file handle as soon as the flipped copy is on disk.
    with Image.open(image_path) as image:
        flipped_image = image.transpose(Image.FLIP_TOP_BOTTOM)
        flipped_image.save(flipped_image_path)

    # Prepare the augmented row
    augmented_row = row.copy()
    augmented_row['filename'] = flipped_image_filename
    new_rows.append(augmented_row)

    # Count the classes in the labels
    class_counts_after.update(
        label for label in ast.literal_eval(row['labels'])
        if label in specified_classes
    )

# Convert new rows to a DataFrame
new_rows_df = pd.DataFrame(new_rows)

# Drop rows a previous run of this cell already appended (same output
# filenames), so executing the cell twice does not double the V-Flip entries.
# On a first run nothing matches and the existing rows are kept unchanged.
new_filenames = {new_row['filename'] for new_row in new_rows}
augmented_df = augmented_df[~augmented_df['filename'].isin(new_filenames)]

# Concatenate the new rows to the existing DataFrame
augmented_df = pd.concat([augmented_df, new_rows_df], ignore_index=True)

# Save the updated DataFrame back to the same CSV file
augmented_df.to_csv('new_augmented_data.csv', index=False)

# Print class counts after augmentation
print("\nClass counts after vertical flip augmentation:")
for label, count in class_counts_after.items():
    print(f"{label}: {count}")

# Calculate total class counts in the updated augmented DataFrame
total_class_counts = Counter()
for labels in augmented_df['labels']:
    total_class_counts.update(ast.literal_eval(labels))

# Print total class counts in the updated augmented DataFrame
print("\nTotal number of images in each class in the updated augmented file:")
for label, count in total_class_counts.items():
    print(f"{label}: {count}")

if missing_images:
    print(f"\nSkipped {len(missing_images)} rows whose image was not found in '{actual_image_dir}'.")

print(f"\nTotal number of images in the updated augmented file: {len(augmented_df)}")
print("\nVertical flip data augmentation complete and appended to 'new_augmented_data.csv'.")


Class counts before augmentation:
M: 232
C: 293
A: 266
G: 284

Class counts after vertical flip augmentation:
M: 232
C: 293
A: 266
G: 284

Total number of images in each class in the updated augmented file:
M: 464
C: 586
A: 532
G: 568

Total number of images in the updated augmented file: 2150

Vertical flip data augmentation complete and appended to 'new_augmented_data.csv'.


In [5]:
# Rotation augmentation for classes O, M, C, A and G.
#
# For every matching row whose image exists in `actual_image_dir`, a copy
# rotated by `rotation_angle` degrees is written as
# "Rot<angle>_<filename>" and a metadata row for it is appended to the rows
# already stored in 'new_augmented_data.csv'. Re-running the cell replaces the
# rows it appended earlier rather than duplicating them.

# Load the original DataFrame
df = pd.read_csv('full_df.csv')

# Load the existing augmented DataFrame
augmented_df = pd.read_csv('new_augmented_data.csv')

# Define the file path to the actual images directory
actual_image_dir = 'preprocessed_images'

# Initialize a counter for augmented images by class
class_counts_before = Counter()
class_counts_after = Counter()

# Filter the DataFrame for the specified classes.
# Label lists are stored as strings; literal_eval parses them without the
# code-execution risk of eval.
specified_classes = {'O', 'M', 'C', 'A', 'G'}
filtered_df = df[df['labels'].apply(
    lambda x: any(label in specified_classes for label in ast.literal_eval(x))
)]

# Count classes before augmentation
for labels in filtered_df['labels']:
    class_counts_before.update(
        label for label in ast.literal_eval(labels)
        if label in specified_classes
    )

# Print class counts before augmentation
print("Class counts before augmentation:")
for label, count in class_counts_before.items():
    print(f"{label}: {count}")

# List to store new rows for the augmented DataFrame
new_rows = []
missing_images = []  # filenames listed in the CSV but absent on disk

# Perform slight rotation and save the images
rotation_angle = 10  # Rotation by 10 degrees
for index, row in filtered_df.iterrows():
    filename = row['filename']

    image_path = os.path.join(actual_image_dir, filename)
    if not os.path.exists(image_path):
        # Previously skipped silently; remember it so the gap is visible.
        missing_images.append(filename)
        continue

    # Save the rotated image with the new naming convention in the same directory
    rotated_image_filename = f"Rot{rotation_angle}_{filename}"
    rotated_image_path = os.path.join(actual_image_dir, rotated_image_filename)

    # Close the source file handle as soon as the rotated copy is on disk.
    with Image.open(image_path) as image:
        rotated_image = image.rotate(rotation_angle)
        rotated_image.save(rotated_image_path)

    # Prepare the augmented row
    augmented_row = row.copy()
    augmented_row['filename'] = rotated_image_filename
    new_rows.append(augmented_row)

    # Count the classes in the labels
    class_counts_after.update(
        label for label in ast.literal_eval(row['labels'])
        if label in specified_classes
    )

# Convert new rows to a DataFrame
new_rows_df = pd.DataFrame(new_rows)

# Drop rows a previous run of this cell already appended (same output
# filenames), so executing the cell twice does not double the rotated entries.
# On a first run nothing matches and the existing rows are kept unchanged.
new_filenames = {new_row['filename'] for new_row in new_rows}
augmented_df = augmented_df[~augmented_df['filename'].isin(new_filenames)]

# Concatenate the new rows to the existing DataFrame
augmented_df = pd.concat([augmented_df, new_rows_df], ignore_index=True)

# Save the updated DataFrame back to the same CSV file
augmented_df.to_csv('new_augmented_data.csv', index=False)

# Print class counts after augmentation
print("\nClass counts after rotation augmentation:")
for label, count in class_counts_after.items():
    print(f"{label}: {count}")

# Calculate total class counts in the updated augmented DataFrame
total_class_counts = Counter()
for labels in augmented_df['labels']:
    total_class_counts.update(ast.literal_eval(labels))

# Print total class counts in the updated augmented DataFrame
print("\nTotal number of images in each class in the updated augmented file:")
for label, count in total_class_counts.items():
    print(f"{label}: {count}")

if missing_images:
    print(f"\nSkipped {len(missing_images)} rows whose image was not found in '{actual_image_dir}'.")

print(f"\nTotal number of images in the updated augmented file: {len(augmented_df)}")
print("\nRotation data augmentation complete and appended to 'new_augmented_data.csv'.")


Class counts before augmentation:
O: 708
M: 232
C: 293
A: 266
G: 284

Class counts after rotation augmentation:
O: 708
M: 232
C: 293
A: 266
G: 284

Total number of images in each class in the updated augmented file:
M: 696
C: 879
A: 798
G: 852
O: 708

Total number of images in the updated augmented file: 3933

Rotation data augmentation complete and appended to 'new_augmented_data.csv'.


In [10]:
# Assemble the final manifest: every augmented row followed by a per-class
# selection of original rows, written to 'new_filtered_images.csv'.

# Load the original dataset
original_df = pd.read_csv('full_df.csv')

# Load the augmented dataset
augmented_df = pd.read_csv('new_augmented_data.csv')

# Parse each stringified label list exactly once. literal_eval only accepts
# literals, so malformed or malicious CSV content cannot execute code the way
# it could under eval.
parsed_labels = original_df['labels'].apply(ast.literal_eval)


def _rows_with_label(class_label):
    """Return the rows of `original_df` whose label list contains `class_label`."""
    return original_df[parsed_labels.apply(lambda labels: class_label in labels)]


# Define the criteria for selecting original images.
# Key order matters: it fixes the order in which the classes are written out.
class_filters = {
    class_label: _rows_with_label(class_label)
    for class_label in ('C', 'M', 'A', 'G', 'N', 'D', 'O')
}
# Two thirds of N, drawn with a fixed seed so the subset is reproducible;
# every other class is kept in full.
class_filters['N'] = class_filters['N'].sample(frac=2/3, random_state=1)

# Stack the augmented rows and then each class selection in a single concat
# instead of rebuilding the frame row by row.
filtered_df = pd.concat(
    [augmented_df, *class_filters.values()],
    ignore_index=True,
)

# Save the filtered DataFrame to a new CSV file
filtered_df.to_csv('new_filtered_images.csv', index=False)

print("Filtered images saved to 'new_filtered_images.csv'.")


Filtered images saved to 'new_filtered_images.csv'.


In [7]:
# Display the contents of a saved manifest CSV.
# NOTE(review): this reads "filtered_images.csv", but the only manifest this
# notebook writes is 'new_filtered_images.csv'. The cell's execution count (7)
# also precedes that of the cell that writes the manifest (10), so the table
# shown here presumably comes from an older file — confirm which file is
# intended before relying on it.
pd.read_csv("filtered_images.csv")

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,13,60,Female,13_left.jpg,13_right.jpg,pathological myopia,pathological myopia,0,0,0,0,0,0,1,0,../input/ocular-disease-recognition-odir5k/ODI...,['M'],"[0, 0, 0, 0, 0, 0, 1, 0]",H-Flip_13_right.jpg
1,16,54,Female,16_left.jpg,16_right.jpg,normal fundus,pathological myopia,0,0,0,0,0,0,1,0,../input/ocular-disease-recognition-odir5k/ODI...,['M'],"[0, 0, 0, 0, 0, 0, 1, 0]",H-Flip_16_right.jpg
2,18,58,Male,18_left.jpg,18_right.jpg,pathological myopia,pathological myopia,0,0,0,0,0,0,1,0,../input/ocular-disease-recognition-odir5k/ODI...,['M'],"[0, 0, 0, 0, 0, 0, 1, 0]",H-Flip_18_right.jpg
3,24,75,Female,24_left.jpg,24_right.jpg,normal fundus,cataract,0,0,0,1,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['C'],"[0, 0, 0, 1, 0, 0, 0, 0]",H-Flip_24_right.jpg
4,43,35,Male,43_left.jpg,43_right.jpg,wet age-related macular degeneration,dry age-related macular degeneration，glaucoma,0,0,1,0,1,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['A'],"[0, 0, 0, 0, 1, 0, 0, 0]",H-Flip_43_right.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4982,4115,60,Male,4115_left.jpg,4115_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4115_left.jpg
4983,4361,62,Female,4361_left.jpg,4361_right.jpg,normal fundus,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4361_right.jpg
4984,4532,60,Male,4532_left.jpg,4532_right.jpg,mild nonproliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4532_left.jpg
4985,850,68,Male,850_left.jpg,850_right.jpg,moderate non proliferative retinopathy，macular...,macular epiretinal membrane，moderate non proli...,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",850_left.jpg


In [11]:
# Count the occurrences of each class in the filtered DataFrame.
# `filtered_df` is the in-memory frame built by the manifest cell; its label
# lists are parsed with ast.literal_eval rather than eval so that nothing in
# the data can be executed as code.
class_counts = Counter()
for labels in filtered_df['labels']:
    class_counts.update(ast.literal_eval(labels))

# Print the counts of each class
print("\nNumber of images in each class in the filtered file:")
for label, count in class_counts.items():
    print(f"{label}: {count}")

print(f"\nTotal number of images in the filtered file: {len(filtered_df)}")
# Informational only: nothing is written by this cell; the message repeats
# where the counted frame was saved.
print("Filtered images saved to 'new_filtered_images.csv'.")


Number of images in each class in the filtered file:
M: 928
C: 1172
A: 1064
G: 1136
O: 1416
N: 1915
D: 1608

Total number of images in the filtered file: 9239
Filtered images saved to 'new_filtered_images.csv'.
