SCRIPT TO SPLIT IMAGES INTO ORGAN FOLDERS

In [3]:
import os
import pandas as pd
import shutil

# === CONFIG ===
image_folder = '/home/yash/Desktop/project/train2'         # Flat folder with all images
csv_path = '/home/yash/Desktop/project/cdv/trrain2.csv'    # Original labels CSV
output_root = '/home/yash/Desktop/project/temp/t1'  # Output folder for organs only
new_csv_path = '/home/yash/Desktop/project/temp/new_train2.csv'  # Path for new filtered CSV

# Define a list of tool labels to exclude
tools_labels = [
    "TheOther_Instruments", "HarmonicAce_Head", "HarmonicAce_Body", "MarylandBipolarForceps_Head",
    "MarylandBipolarForceps_Wrist", "CadiereForceps_Head", "CadiereForceps_Wrist", "CadiereForceps_Body",
    "Specimenbag", "Stapler_Head", "Stapler_Body", "DrainTube", "Needle", "SmallClipApplier_Head",
    "SmallClipApplier_Wrist", "SmallClipApplier_Body", "MediumLargeClipApplier_Head",
    "MediumLargeClipApplier_Wrist", "MediumLargeClipApplier_Body", "SuctionIrrigation",
    "CurvedAtraumaticGrasper_Head", "CurvedAtraumaticGrasper_Body", "MarylandBipolarForceps_Body",
    "Endotip", "Gauze"
]

# Imported here so the cell runs on its own; used to quote multi-label rows correctly.
import csv

# Membership is tested once per label, so use a set for the lookups.
excluded_labels = frozenset(tools_labels)


def _organ_labels(raw_labels):
    """Split a comma-separated label string and keep only non-empty, non-tool labels."""
    parts = (part.strip() for part in str(raw_labels).split(','))
    # An empty label would make os.path.join(output_root, label) point at output_root itself.
    return [part for part in parts if part and part not in excluded_labels]


# Load CSV
df = pd.read_csv(csv_path)

# Rows without a filename or a label string cannot be sorted into any folder
df = df.dropna(subset=['filename', 'labels'])

# Clean filename (keep only the final path component)
df['filename'] = df['filename'].apply(lambda x: os.path.basename(str(x)))

# Convert label strings into lists with tool labels removed
df['labels'] = df['labels'].apply(_organ_labels)

# Filter out rows with no valid labels
df = df[df['labels'].apply(len) > 0]

# Merge rows that refer to the same image, then deduplicate labels per image.
# dict.fromkeys keeps first-seen order, so the output is identical on every run
# (set() ordering of strings changes between interpreter sessions).
df = df.groupby('filename')['labels'].sum().reset_index()
df['labels'] = df['labels'].apply(lambda x: list(dict.fromkeys(x)))

# Save new master label CSV
df['labels'] = df['labels'].apply(','.join)  # Convert list back to comma-separated string
os.makedirs(os.path.dirname(new_csv_path) or '.', exist_ok=True)
df.to_csv(new_csv_path, index=False)
print(f"📄 New label CSV saved to: {new_csv_path}")

# Split images into label folders
for _, row in df.iterrows():
    filename = row['filename']
    labels = row['labels'].split(',')  # Now stored as string in CSV
    src_path = os.path.join(image_folder, filename)

    if not os.path.exists(src_path):
        print(f"⚠️  Warning: {filename} not found in image folder, skipping.")
        continue

    for label in labels:
        label_folder = os.path.join(output_root, label)
        os.makedirs(label_folder, exist_ok=True)

        dst_path = os.path.join(label_folder, filename)
        if not os.path.exists(dst_path):
            shutil.copy(src_path, dst_path)

        # Update class-specific CSV too. csv.writer quotes the label field when it
        # contains commas, so every row keeps exactly two columns.
        # NOTE: the file is opened in append mode, so running this cell again adds
        # the same rows a second time.
        label_csv_path = os.path.join(label_folder, 'labels.csv')
        with open(label_csv_path, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f, lineterminator='\n').writerow([filename, ','.join(labels)])

print("✅ Dataset reorganized. Tool labels excluded. New master CSV created.")


📄 New label CSV saved to: /home/yash/Desktop/project/temp/new_train2.csv
✅ Dataset reorganized. Tool labels excluded. New master CSV created.


AUGMENTATION (UPSAMPLE) SCRIPT

In [17]:
import os
import random
import pandas as pd
from PIL import Image
from torchvision import transforms
import warnings

# === CONFIG ===
# Class folder the source images are read from (currently the TheOther_Tissues folder).
image_folder = os.path.join('/home/yash/Desktop/project/new dataset/train3', 'TheOther_Tissues')
# Labels CSV stored inside that same class folder
csv_path = os.path.join(image_folder, 'new_train3.csv')
# Number of augmented images created per run
num_to_generate = 1

# Augmented images are written next to the originals
output_folder = image_folder

# --- Robust CSV Loading ---
def load_csv(csv_path):
    """Load a labels CSV without dropping rows whose labels contain unquoted commas.

    A strict ``pd.read_csv`` is attempted first. When it raises ``ParserError``
    and the header has exactly two columns, the file is re-read row by row: the
    first field becomes the first column and all remaining fields are joined
    back with commas into the second column. Rows with fewer than two fields
    are left out. For any other header layout the fields cannot be regrouped
    safely, so the file is read with ``on_bad_lines='skip'`` instead.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame with the columns named in the file's header row.
    """
    import csv  # local import keeps this function self-contained

    try:
        return pd.read_csv(csv_path)
    except pd.errors.ParserError as e:
        warnings.warn(
            f"ParserError: {e}\nRe-reading row by row and merging extra fields into the last column."
        )

    with open(csv_path, 'r', encoding='utf-8', newline='') as f:
        rows = [row for row in csv.reader(f) if any(field.strip() for field in row)]

    header = [col.strip() for col in rows[0]] if rows else []
    if len(header) != 2:
        # Unknown layout: skipping the offending lines is the only safe option left.
        return pd.read_csv(csv_path, on_bad_lines='skip')

    records = [
        (row[0].strip(), ','.join(field.strip() for field in row[1:]))
        for row in rows[1:]
        if len(row) >= 2
    ]
    return pd.DataFrame(records, columns=header)

df = load_csv(csv_path)

# --- Check and Fix Columns ---
expected_columns = ['filename', 'labels']
if not all(col in df.columns for col in expected_columns):
    print(f"❌ CSV columns found: {df.columns.tolist()}")
    raise ValueError(f"CSV must contain columns: {expected_columns}")

# Normalize filename and labels columns
df['filename'] = df['filename'].apply(lambda x: os.path.basename(str(x)))
df['labels'] = df['labels'].apply(lambda x: [label.strip() for label in str(x).split(',')])

# Remove entries with missing image files
initial_count = len(df)
df = df[df['filename'].apply(lambda f: os.path.exists(os.path.join(image_folder, f)))].reset_index(drop=True)
removed = initial_count - len(df)
print(f"🧹 Removed {removed} entries pointing to missing image files.")

# Only images carrying this label are used as augmentation sources
target_label = 'Gallbladder'
gallbladder_df = df[df['labels'].apply(lambda labels: target_label in labels)].reset_index(drop=True)

# --- Check for images ---
image_files = gallbladder_df['filename'].tolist()
if not image_files:
    raise ValueError(f"No {target_label} images found in the dataset.")

# --- Augmentation Pipeline ---
augment = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
])

# --- Store new label entries ---
labels = []

# Add original entries (with full label set).
# Every row that still has an image on disk is kept, not only the rows that match
# target_label: the CSV below is written back over csv_path, so any row left out
# here would be lost from the label file.
for _, row in df.iterrows():
    labels.append({'filename': row['filename'], 'labels': ','.join(row['labels'])})

# --- Generate augmented images ---
for i in range(num_to_generate):
    row = gallbladder_df.sample(n=1).iloc[0]
    original_file = row['filename']
    original_labels = row['labels']
    original_path = os.path.join(image_folder, original_file)

    # Open and augment image; the context manager releases the file handle.
    # convert() returns an independent in-memory copy, so it is safe to use it
    # inside the block and keep the augmented result afterwards.
    with Image.open(original_path) as img:
        aug_img = augment(img.convert('RGB'))

    # Generate a filename that is not taken yet. The loop index restarts at 0 on
    # every run, so an earlier run may already have produced aug_{i}_{file}.
    aug_index = i
    aug_filename = f"aug_{aug_index}_{original_file}"
    while os.path.exists(os.path.join(output_folder, aug_filename)):
        aug_index += 1
        aug_filename = f"aug_{aug_index}_{original_file}"
    aug_path = os.path.join(output_folder, aug_filename)

    # Save image
    aug_img.save(aug_path)

    # Save label record (augmented image inherits the source image's labels)
    labels.append({'filename': aug_filename, 'labels': ','.join(original_labels)})

# --- Write updated labels to CSV ---
# to_csv quotes fields that contain commas, so multi-label rows stay two columns wide.
df_new = pd.DataFrame(labels)
df_new.to_csv(csv_path, index=False)

print(f"✅ Augmentation complete: {num_to_generate} new images.")
print(f"📄 CSV updated at: {csv_path}")


🧹 Removed 1 entries pointing to missing image files.
✅ Augmentation complete: 1 new images.
📄 CSV updated at: /home/yash/Desktop/project/new dataset/train3/TheOther_Tissues/new_train3.csv


In [18]:
import os
import shutil
import pandas as pd
from collections import defaultdict

# === MANUAL CONFIGURATION ===
# Per-class image folders and their label CSVs; the two lists are paired by position.
organ_image_folders = [
    "/home/yash/Desktop/project/new dataset/train3/Gallbladder",
    "/home/yash/Desktop/project/new dataset/train3/Liver",
    "/home/yash/Desktop/project/new dataset/train3/Pancreas",
    "/home/yash/Desktop/project/new dataset/train3/Stomach",
    "/home/yash/Desktop/project/new dataset/train3/Spleen",
    "/home/yash/Desktop/project/new dataset/train3/TheOther_Tissues",
]

organ_label_csvs = [
    "/home/yash/Desktop/project/new dataset/train3/Gallbladder/new_train3.csv",
    "/home/yash/Desktop/project/new dataset/train3/Liver/new_train3.csv",
    "/home/yash/Desktop/project/new dataset/train3/Pancreas/new_train3.csv",
    "/home/yash/Desktop/project/new dataset/train3/Stomach/new_train3.csv",
    "/home/yash/Desktop/project/new dataset/train3/Spleen/new_train3.csv",
    "/home/yash/Desktop/project/new dataset/train3/TheOther_Tissues/new_train3.csv",
]

# Output paths
OUTPUT_IMG_DIR = "/home/yash/Desktop/project/new dataset/ft3"
OUTPUT_CSV_PATH = "/home/yash/Desktop/project/new dataset/ft3/main_labels.csv"
LOG_PATH = "/home/yash/Desktop/project/new dataset/ft3/merge_log.txt"

# Create output folder if not exists
os.makedirs(OUTPUT_IMG_DIR, exist_ok=True)

# Tracker for duplicate filenames
filename_counter = defaultdict(int)   # next numeric suffix to try, per original filename
used_filenames = set()                # every output name handed out during this run
final_rows = []
log_lines = []

# === MERGING ===
for images_dir, labels_csv_path in zip(organ_image_folders, organ_label_csvs):
    if not os.path.exists(labels_csv_path):
        log_lines.append(f"❌ Missing CSV: {labels_csv_path}")
        continue
    if not os.path.exists(images_dir):
        log_lines.append(f"❌ Missing image folder: {images_dir}")
        continue

    df = pd.read_csv(labels_csv_path)

    for _, row in df.iterrows():
        # An empty cell is read as NaN (a float), which has no .strip()
        if pd.isna(row['filename']) or pd.isna(row['labels']):
            log_lines.append(f"⚠️ Skipped row with empty filename/labels in: {labels_csv_path}")
            continue

        original_filename = str(row['filename']).strip()
        labels = str(row['labels']).strip()

        src_img_path = os.path.join(images_dir, original_filename)
        if not os.path.exists(src_img_path):
            # Log the missing image warning; no output name is reserved for it
            warning_msg = f"⚠️ Missing image: {src_img_path}"
            print(warning_msg)
            log_lines.append(warning_msg)
            continue

        # Ensure filenames are unique by appending a count if needed. Checking
        # against every name already handed out also covers the case where a
        # suffixed name such as "x_1.jpg" is the real name of another image,
        # which would otherwise overwrite the earlier copy.
        filename_base, ext = os.path.splitext(original_filename)
        new_filename = original_filename
        while new_filename in used_filenames:
            filename_counter[original_filename] += 1
            new_filename = f"{filename_base}_{filename_counter[original_filename]}{ext}"
        used_filenames.add(new_filename)

        # Copy the image to the destination folder
        dst_img_path = os.path.join(OUTPUT_IMG_DIR, new_filename)
        shutil.copy(src_img_path, dst_img_path)
        final_rows.append((new_filename, labels))

        # Log if a duplicate filename is used
        if new_filename != original_filename:
            log_lines.append(f"Duplicated: {original_filename} → {new_filename} | Labels: {labels}")

# Write the merged CSV with the final image labels.
# The label string is written as-is after the first comma, so a multi-label row
# has more than two comma-separated fields.
with open(OUTPUT_CSV_PATH, 'w', encoding='utf-8') as f:
    f.write("filename,labels\n")
    for filename, labels in final_rows:
        f.write(f"{filename},{labels}\n")

# Write the log file for duplicate handling and missing images
with open(LOG_PATH, 'w', encoding='utf-8') as log_file:
    log_file.write("=== Duplicate Image Merge Log ===\n")
    for line in log_lines:
        log_file.write(line + "\n")

# Summary output
print(f"\n✅ All images (including duplicates) saved to: {OUTPUT_IMG_DIR}")
print(f"✅ Merged CSV written to: {OUTPUT_CSV_PATH}")
print(f"📝 Log file saved to: {LOG_PATH}")



✅ All images (including duplicates) saved to: /home/yash/Desktop/project/new dataset/ft3
✅ Merged CSV written to: /home/yash/Desktop/project/new dataset/ft3/main_labels.csv
📝 Log file saved to: /home/yash/Desktop/project/new dataset/ft3/merge_log.txt


SCRIPT TO MERGE FOLDERS

In [5]:
import os
import shutil
import pandas as pd
from collections import defaultdict

# === MANUAL CONFIGURATION ===
# Per-class image folders and their label CSVs; the two lists are paired by position.
organ_image_folders = [
    "/home/yash/Desktop/project/new dataset/train1/Gallbladder",
    "/home/yash/Desktop/project/new dataset/train1/Liver",
    "/home/yash/Desktop/project/new dataset/train1/Pancreas",
    "/home/yash/Desktop/project/new dataset/train1/Stomach",
    "/home/yash/Desktop/project/new dataset/train1/TheOther_Tissues",
    "/home/yash/Desktop/project/new dataset/train1/Spleen"
]

organ_label_csvs = [
    "/home/yash/Desktop/project/new dataset/train1/Gallbladder/new_train1.csv",
    "/home/yash/Desktop/project/new dataset/train1/Liver/new_train1.csv",
    "/home/yash/Desktop/project/new dataset/train1/Pancreas/new_train1.csv",
    "/home/yash/Desktop/project/new dataset/train1/Stomach/new_train1.csv",
    "/home/yash/Desktop/project/new dataset/train1/TheOther_Tissues/new_train1.csv",
    "/home/yash/Desktop/project/new dataset/train1/Spleen/new_train1.csv"
]

# Output paths
OUTPUT_IMG_DIR = "/home/yash/Desktop/project/new dataset/ft1"
OUTPUT_CSV_PATH = "/home/yash/Desktop/project/new dataset/ft1/main_labels.csv"
LOG_PATH = "/home/yash/Desktop/project/new dataset/ft1/merge_log.txt"

# Create output folder if not exists
os.makedirs(OUTPUT_IMG_DIR, exist_ok=True)

# Tracker for duplicate filenames
filename_counter = defaultdict(int)   # next numeric suffix to try, per original filename
used_filenames = set()                # every output name handed out during this run
final_rows = []
log_lines = []

# === MERGING ===
for images_dir, labels_csv_path in zip(organ_image_folders, organ_label_csvs):
    if not os.path.exists(labels_csv_path):
        log_lines.append(f"❌ Missing CSV: {labels_csv_path}")
        continue
    if not os.path.exists(images_dir):
        log_lines.append(f"❌ Missing image folder: {images_dir}")
        continue

    # Read the CSV for labels
    df = pd.read_csv(labels_csv_path)

    for _, row in df.iterrows():
        # An empty cell is read as NaN (a float), which has no .strip()
        if pd.isna(row['filename']) or pd.isna(row['labels']):
            log_lines.append(f"⚠️ Skipped row with empty filename/labels in: {labels_csv_path}")
            continue

        original_filename = str(row['filename']).strip()
        labels = str(row['labels']).strip()

        # Source path of the image inside its organ folder
        src_img_path = os.path.join(images_dir, original_filename)

        # One existence check drives both the console report and the copy
        if not os.path.exists(src_img_path):
            print(f"⚠️ Image not found: {original_filename} in {images_dir}")
            warning_msg = f"⚠️ Missing image: {src_img_path}"
            print(warning_msg)
            log_lines.append(warning_msg)
            continue

        print(f"Found image: {original_filename} in {images_dir}")

        # Handle filename uniqueness. Checking against every name already handed
        # out also covers the case where a suffixed name such as "x_1.jpg" is the
        # real name of another image, which would otherwise overwrite the earlier copy.
        filename_base, ext = os.path.splitext(original_filename)
        new_filename = original_filename
        while new_filename in used_filenames:
            filename_counter[original_filename] += 1
            new_filename = f"{filename_base}_{filename_counter[original_filename]}{ext}"
        used_filenames.add(new_filename)

        # Copy the image under its final name
        dst_img_path = os.path.join(OUTPUT_IMG_DIR, new_filename)
        shutil.copy(src_img_path, dst_img_path)
        final_rows.append((new_filename, labels))

        # Log duplicate filenames
        if new_filename != original_filename:
            log_lines.append(f"Duplicated: {original_filename} → {new_filename} | Labels: {labels}")

# Write the merged CSV with the final image labels.
# The label string is written as-is after the first comma, so a multi-label row
# has more than two comma-separated fields.
with open(OUTPUT_CSV_PATH, 'w', encoding='utf-8') as f:
    f.write("filename,labels\n")
    for filename, labels in final_rows:
        f.write(f"{filename},{labels}\n")

# Write the log file for duplicate handling and missing images
with open(LOG_PATH, 'w', encoding='utf-8') as log_file:
    log_file.write("=== Duplicate Image Merge Log ===\n")
    for line in log_lines:
        log_file.write(line + "\n")

# Summary output
print(f"\n✅ All images (including duplicates) saved to: {OUTPUT_IMG_DIR}")
print(f"✅ Merged CSV written to: {OUTPUT_CSV_PATH}")
print(f"📝 Log file saved to: {LOG_PATH}")


Found image: R001_ch1_video_01_00-15-18-17.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-15-45-29.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-19-55-03.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-21-47-07.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-22-06-28.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-26-29-12.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-30-42-07.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-53-21-03.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_01_00-56-00-11.jpg in /home/yash/Desktop/project/new dataset/train1/Gallbladder
Found image: R001_ch1_video_

script to downsample images


In [6]:
import os
import random
import pandas as pd

# Paths
IMAGE_FOLDER = "/home/yash/Desktop/project/new dataset/train1/Pancreas"  # Update with the path to your image folder
CSV_PATH = "/home/yash/Desktop/project/new dataset/train1/Pancreas/new_train1.csv"  # Update with the path to your CSV file
OUTPUT_CSV_PATH = "/home/yash/Desktop/project/new dataset/train1/Pancreas/new_train2.csv"  # Path to save the updated CSV
NUM_IMAGES_TO_DELETE = 1000  # Number of images to delete

# Read the CSV file
df = pd.read_csv(CSV_PATH)

# List all image files in the folder
all_images = [f for f in os.listdir(IMAGE_FOLDER) if os.path.isfile(os.path.join(IMAGE_FOLDER, f))]

# Filter only augmented images. The augmentation script names its output
# "aug_<n>_<original name>", so match that prefix rather than the substring
# "aug" anywhere in the name: this step deletes files, so a loose match could
# remove original images.
augmented_images = [img for img in all_images if img.startswith('aug_')]

# Never ask for more images than exist; the configured constant is left untouched
num_to_delete = min(NUM_IMAGES_TO_DELETE, len(augmented_images))
if num_to_delete < NUM_IMAGES_TO_DELETE:
    print(f"⚠️ There are only {len(augmented_images)} augmented images. Deleting all of them.")

# Randomly select images to delete
images_to_delete = random.sample(augmented_images, num_to_delete)

# Remove selected images from the folder, remembering which ones really went away
deleted_images = []
for image in images_to_delete:
    image_path = os.path.join(IMAGE_FOLDER, image)
    try:
        os.remove(image_path)
    except OSError as e:
        # Keep going so the CSV still gets updated for the files that were removed
        print(f"⚠️ Could not delete {image}: {e}")
        continue
    deleted_images.append(image)
    print(f"✅ Deleted: {image}")

# Update the CSV file by removing the rows for the images that were actually deleted
df = df[~df['filename'].isin(deleted_images)]

# Save the updated CSV
df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"✅ Updated CSV saved to: {OUTPUT_CSV_PATH}")

print(f"✅ {len(deleted_images)} augmented images deleted and CSV updated.")


✅ Deleted: aug_2252_R002_ch1_video_03_00-40-41-21.jpg
✅ Deleted: aug_2392_R001_ch1_video_03_00-52-00-17.jpg
✅ Deleted: aug_1170_R209_ch1_video_01_00-57-20-00.jpg
✅ Deleted: aug_1666_R206_ch1_video_03_00-03-09-28.jpg
✅ Deleted: aug_1176_R084_ch1_video_03_00-05-26-12.jpg
✅ Deleted: aug_1515_R005_ch1_video_01_00-49-32-21.jpg
✅ Deleted: aug_2153_R100_ch1_video_01_00-49-23-05.jpg
✅ Deleted: aug_2073_R301_ch1_video_04_00-15-57-12.jpg
✅ Deleted: aug_1759_R014_ch1_video_03_00-23-24-04.jpg
✅ Deleted: aug_1245_R010_ch1_video_01_00-40-39-20.jpg
✅ Deleted: aug_2023_R002_ch1_video_01_00-41-27-13.jpg
✅ Deleted: aug_727_R048_ch1_video_02_00-11-41-16.jpg
✅ Deleted: aug_729_R002_ch1_video_03_00-40-52-30.jpg
✅ Deleted: aug_1917_R014_ch1_video_03_01-03-19-26.jpg
✅ Deleted: aug_1010_R305_ch1_video_04_00-01-14-12.jpg
✅ Deleted: aug_1643_R301_ch1_video_04_00-36-59-13.jpg
✅ Deleted: aug_1002_R205_ch1_video_05_00-00-02-20.jpg
✅ Deleted: aug_972_R117_ch1_video_03_00-35-47-01.jpg
✅ Deleted: aug_843_R015_ch1_vid

script to verify dataset

In [19]:
import os
import pandas as pd

# === CONFIGURATION ===
CSV_PATH = "/home/yash/Desktop/project/new dataset/ft3/main_labels.csv"
IMAGE_DIR = "/home/yash/Desktop/project/new dataset/ft3/"
OUTPUT_CSV = "/home/yash/Desktop/project/new dataset/ft3/cleaned_and_verified.csv"
LOG_PATH = "/home/yash/Desktop/project/new dataset/ft3/missing_images_log.txt"

# Imported here so the cell runs on its own
import csv

# === LOAD CSV SAFELY: Read first column as filename, rest as label string
# csv.reader handles both layouts this file can have: labels written unquoted
# (several fields after the filename) and labels wrapped in double quotes (one
# field). A plain split(",") would leave the quote characters inside the labels.
rows = []
with open(CSV_PATH, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip header
    for parts in reader:
        if len(parts) >= 2:
            filename = parts[0].strip()
            labels = ",".join(parts[1:]).strip()
            rows.append((filename, labels))

# === VERIFY IMAGES EXIST ===
all_images = set(os.listdir(IMAGE_DIR))
missing = []
verified_rows = []

for filename, labels in rows:
    if filename in all_images:
        verified_rows.append((filename, labels))
    else:
        missing.append(filename)

# === SAVE CLEANED CSV ===
# to_csv quotes label strings that contain commas, so the output is a regular two-column CSV
df_clean = pd.DataFrame(verified_rows, columns=["filename", "labels"])
df_clean.to_csv(OUTPUT_CSV, index=False)

# === SAVE MISSING LOG ===
with open(LOG_PATH, 'w', encoding='utf-8') as log:
    log.write("=== MISSING IMAGES ===\n")
    for fname in missing:
        log.write(f"{fname}\n")

print(f"✅ Cleaned CSV saved to: {OUTPUT_CSV}")
print(f"📝 Missing images logged to: {LOG_PATH}")
print(f"✅ Found {len(verified_rows)} images")
print(f"❌ Missing {len(missing)} images")


✅ Cleaned CSV saved to: /home/yash/Desktop/project/new dataset/ft3/cleaned_and_verified.csv
📝 Missing images logged to: /home/yash/Desktop/project/new dataset/ft3/missing_images_log.txt
✅ Found 11010 images
❌ Missing 0 images


In [27]:
import os
import shutil
import pandas as pd

# Define your input folders and CSV files (paired by position)
folders = [
    '/home/yash/Desktop/project/new dataset/ft1',
    '/home/yash/Desktop/project/new dataset/ft2',
    '/home/yash/Desktop/project/new dataset/ft3'
]
csv_files = [
    '/home/yash/Desktop/project/new dataset/ft1/main_labels.csv',
    '/home/yash/Desktop/project/new dataset/ft2/main_labels.csv',
    '/home/yash/Desktop/project/new dataset/ft3/main_labels.csv'
]

# Define output folder and output CSV
output_folder = '/home/yash/Desktop/project/new dataset/final'
output_csv = '/home/yash/Desktop/project/new dataset/final/main_labels.csv'

# Imported here so the cell runs on its own
import csv


def _read_labels_csv(path):
    """Read a ``filename,labels`` CSV, keeping rows whose labels contain unquoted commas.

    For the exact header ``filename,labels`` the first field of each row is the
    filename and all remaining fields are joined back into one label string, so
    quoted and unquoted multi-label rows give the same result. Rows with fewer
    than two fields are left out. Any other header is handed to pandas with
    ``on_bad_lines='skip'``.

    Raises:
        ValueError: if the file contains no rows at all.
    """
    with open(path, 'r', encoding='utf-8', newline='') as fh:
        rows = [r for r in csv.reader(fh) if any(cell.strip() for cell in r)]
    if not rows:
        raise ValueError(f"{path} is empty")

    header = [cell.strip() for cell in rows[0]]
    if header != ['filename', 'labels']:
        return pd.read_csv(path, on_bad_lines='skip')

    records = [
        (r[0].strip(), ','.join(cell.strip() for cell in r[1:]))
        for r in rows[1:]
        if len(r) >= 2
    ]
    return pd.DataFrame(records, columns=header)


# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# List to store all label dataframes
all_labels = []

for folder, csv_file in zip(folders, csv_files):
    # Load the CSV; an unreadable file is reported and skipped
    try:
        df = _read_labels_csv(csv_file)
    except (OSError, ValueError) as e:
        print(f"⚠️ Error reading {csv_file}: {e}")
        continue

    updated_filenames = []

    # Copy images and update filenames if needed
    for filename in df['filename']:
        src_path = os.path.join(folder, filename)

        # Check if the file actually exists
        if not os.path.isfile(src_path):
            print(f"⚠️ Skipping missing or invalid file: {src_path}")
            updated_filenames.append(None)  # Mark this as missing
            continue

        dst_path = os.path.join(output_folder, filename)
        new_filename = filename

        # If filename already exists, rename it to avoid conflict.
        # NOTE: any existing destination counts as a conflict, so running this
        # cell again on a populated output folder copies every image a second
        # time under a suffixed name.
        if os.path.exists(dst_path):
            base, ext = os.path.splitext(filename)
            counter = 1
            while os.path.exists(dst_path):
                new_filename = f"{base}_{counter}{ext}"
                dst_path = os.path.join(output_folder, new_filename)
                counter += 1

        # Copy the file (copy2 also preserves file metadata)
        shutil.copy2(src_path, dst_path)
        updated_filenames.append(new_filename)

    # Update the DataFrame with possibly new filenames
    df['filename'] = updated_filenames

    # Drop rows where filename was missing
    df = df.dropna(subset=['filename'])

    # Append the cleaned DataFrame
    all_labels.append(df)

# pd.concat fails with an unhelpful message on an empty list
if not all_labels:
    raise RuntimeError("None of the label CSVs could be read; nothing to merge.")

# Merge all the cleaned dataframes
merged_df = pd.concat(all_labels, ignore_index=True)

# Save the final merged CSV
merged_df.to_csv(output_csv, index=False)

print("\n✅ Merging complete!")
print(f"📂 Images are saved in: {output_folder}")
print(f"📝 Combined labels CSV saved as: {output_csv}")



✅ Merging complete!
📂 Images are saved in: /home/yash/Desktop/project/new dataset/final
📝 Combined labels CSV saved as: /home/yash/Desktop/project/new dataset/final/main_labels.csv


In [26]:
import os

# List of CSV file paths you want to fix
csv_files = [
    '/home/yash/Desktop/project/new dataset/ft1/main_labels.csv',
    '/home/yash/Desktop/project/new dataset/ft2/main_labels.csv',
    '/home/yash/Desktop/project/new dataset/ft3/main_labels.csv'
]

# Set to True if you want to overwrite the original files
overwrite_original = True

# Imported here so the cell runs on its own
import csv

for csv_file in csv_files:
    if not os.path.isfile(csv_file):
        print(f"⚠️ File not found: {csv_file}")
        continue

    print(f"Processing: {csv_file}")

    # csv.reader strips quoting that is already present, so running this cell on
    # a file it has fixed before reproduces the same file instead of wrapping the
    # labels in a second pair of quotes.
    with open(csv_file, 'r', encoding='utf-8', newline='') as file:
        rows = list(csv.reader(file))

    # Process header (an empty file has no header row at all)
    header = [cell.strip() for cell in rows[0]] if rows else []
    if len(header) < 2 or header[0].lower() != 'filename':
        print(f"⚠️ Unexpected CSV format in: {csv_file}")
        continue

    fixed_lines = ['filename,labels\n']

    # Process each row
    for parts in rows[1:]:
        # Ensure there is at least a filename and one label
        if len(parts) < 2:
            print(f"⚠️ Skipping malformed line: {','.join(parts)}")
            continue

        filename = parts[0].strip()
        labels = ','.join(parts[1:]).strip()  # Combine all remaining parts into one label string
        labels = labels.replace('"', '""')    # CSV escaping for any quote inside the labels
        fixed_lines.append(f'{filename},"{labels}"\n')  # Wrap labels in double quotes

    # Determine output file path
    if overwrite_original:
        output_path = csv_file  # Overwrite original file
    else:
        folder, name = os.path.split(csv_file)
        output_path = os.path.join(folder, f"fixed_{name}")

    # Save the fixed CSV
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(fixed_lines)

    print(f"✅ Fixed CSV saved at: {output_path}")

print("\n🎉 All done!")


Processing: /home/yash/Desktop/project/new dataset/ft1/main_labels.csv
✅ Fixed CSV saved at: /home/yash/Desktop/project/new dataset/ft1/main_labels.csv
Processing: /home/yash/Desktop/project/new dataset/ft2/main_labels.csv
✅ Fixed CSV saved at: /home/yash/Desktop/project/new dataset/ft2/main_labels.csv
Processing: /home/yash/Desktop/project/new dataset/ft3/main_labels.csv
✅ Fixed CSV saved at: /home/yash/Desktop/project/new dataset/ft3/main_labels.csv

🎉 All done!


In [28]:
import os
import pandas as pd

# Paths
final_folder = '/home/yash/Desktop/project/new dataset/final'
final_csv = '/home/yash/Desktop/project/new dataset/final/main_labels.csv'

# Filenames the merged CSV refers to
df = pd.read_csv(final_csv)
csv_filenames = set(df['filename'])

# Image files that are physically in the folder (matched by extension, case-insensitive)
image_suffixes = ('.jpg', '.jpeg', '.png')
folder_filenames = {
    entry for entry in os.listdir(final_folder)
    if entry.lower().endswith(image_suffixes)
}

# Compare both directions
missing_in_folder = csv_filenames.difference(folder_filenames)
extra_in_folder = folder_filenames.difference(csv_filenames)

# Results
print("\n🔎 Verification Results:")

# Listed in the CSV but absent on disk
if missing_in_folder:
    print(f"❌ {len(missing_in_folder)} images listed in CSV are missing in the folder:")
    print("\n".join(f"   - {name}" for name in missing_in_folder))
else:
    print("✅ All images listed in CSV exist in the folder!")

# Present on disk but not referenced by the CSV
if extra_in_folder:
    print(f"⚠️ {len(extra_in_folder)} extra images found in the folder that are not in the CSV:")
    print("\n".join(f"   - {name}" for name in extra_in_folder))
else:
    print("✅ No extra images found in the folder!")

print("\n🎯 Verification complete!")



🔎 Verification Results:
✅ All images listed in CSV exist in the folder!
✅ No extra images found in the folder!

🎯 Verification complete!


In [1]:
import os
import zipfile

# Folder whose contents are archived
final_folder = '/home/yash/Desktop/project/new dataset/final'

# Destination archive (located outside final_folder)
zip_file_path = '/home/yash/Desktop/project/new dataset/final_dataset.zip'

# Add every file below final_folder, stored under its path relative to that folder
with zipfile.ZipFile(zip_file_path, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for current_dir, _subdirs, names in os.walk(final_folder):
        for name in names:
            absolute_path = os.path.join(current_dir, name)
            zipf.write(absolute_path, arcname=os.path.relpath(absolute_path, start=final_folder))

print(f"✅ The folder and its contents have been successfully zipped as: {zip_file_path}")


✅ The folder and its contents have been successfully zipped as: /home/yash/Desktop/project/new dataset/final_dataset.zip
