In [3]:
import os
import pandas as pd

# Set the root directory and output CSV path
root_dir = 'garbage_data/CVPR_2024_dataset_Train'
output_csv = 'KDD/dataset/train_image_data.csv'

# Collect one (text, img_path, label) row per PNG image found under root_dir
data = []

# os.listdir returns entries in arbitrary order, so both directory levels are
# sorted to make the CSV row order reproducible across runs and machines.
for category in sorted(os.listdir(root_dir)):
    category_path = os.path.join(root_dir, category)

    # Only sub-directories count as categories; skip stray files in the root
    if not os.path.isdir(category_path):
        continue

    for filename in sorted(os.listdir(category_path)):
        if not filename.endswith('.png'):
            continue

        # img_path stores the bare filename only (no category folder prefix)
        image_path = filename
        # The description text is the filename without its extension
        description_text = os.path.splitext(filename)[0]

        data.append((description_text, image_path, category))

# Create a DataFrame
df = pd.DataFrame(data, columns=['text', 'img_path', 'label'])

# Make sure the destination folder exists so to_csv does not fail on a
# fresh checkout where it has not been created yet.
output_dir = os.path.dirname(output_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the DataFrame to a CSV file
df.to_csv(output_csv, index=False)

print("CSV file created successfully.")


CSV file created successfully.


In [8]:
import os
import shutil

# Set the root directory and target folder
root_dir = 'garbage_data/CVPR_2024_dataset_Val'
target_folder = 'KDD/dataset/images'

# Create the target folder if it doesn't exist
os.makedirs(target_folder, exist_ok=True)

# Every category is flattened into the same target folder, so two categories
# that contain the same filename overwrite each other. Remember which category
# each copied filename came from so those collisions can be reported instead
# of happening silently.
copied_from = {}
collisions = []

# Sorted traversal keeps the copy order (and therefore which file wins on a
# name collision) deterministic; os.listdir alone returns arbitrary order.
for category in sorted(os.listdir(root_dir)):
    category_path = os.path.join(root_dir, category)

    # Only sub-directories count as categories
    if not os.path.isdir(category_path):
        continue

    for filename in sorted(os.listdir(category_path)):
        if not filename.endswith('.png'):
            continue

        # Create the source and destination paths
        source_path = os.path.join(category_path, filename)
        dest_path = os.path.join(target_folder, filename)

        if filename in copied_from:
            collisions.append((filename, copied_from[filename], category))
        copied_from[filename] = category

        # Copy the file (an existing file with the same name is overwritten)
        shutil.copy(source_path, dest_path)

if collisions:
    print(f"Warning: {len(collisions)} filename(s) exist in more than one "
          "category; the later copy overwrote the earlier one:")
    # Show only the first few to avoid flooding the cell output
    for filename, first_category, later_category in collisions[:10]:
        print(f"  {filename}: {first_category} -> {later_category}")

print("All images have been copied successfully.")


All images have been copied successfully.


In [19]:

import pandas as pd
import re

# Read KDD/dataset/test_image_data.csv into `df`; a missing file is reported
# to the user and the FileNotFoundError is then re-raised.
try:
    df = pd.read_csv("KDD/dataset/test_image_data.csv")
except FileNotFoundError:
    print("The file was not found. Please check the file path.")
    raise
else:
    print("CSV file loaded successfully.")

# Check if the 'text' column exists
if 'text' not in df.columns:
    print("The 'text' column is missing from the CSV file.")
else:
    # Define a preprocessing function for the text column
    def preprocess_text(text):
        """Normalize a filename-derived description.

        Underscores and hyphens become spaces, digit runs are removed,
        whitespace runs are collapsed to a single space, surrounding
        whitespace is stripped and the result is lower-cased.

        Args:
            text: The value to clean. Non-string values (e.g. NaN from
                pandas) are returned unchanged.

        Returns:
            The cleaned lower-case string, or `text` itself if it is not
            a string.
        """
        # Leave missing / non-string cells untouched
        if not isinstance(text, str):
            return text
        # Treat underscores and hyphens as word separators
        text = text.replace("_", " ").replace("-", " ")
        # Remove numeric identifiers
        text = re.sub(r'\d+', '', text)
        # Removing a number from the middle of the text leaves two adjacent
        # spaces; split/join collapses every whitespace run and also strips
        # the leading/trailing whitespace.
        text = " ".join(text.split())
        # Lower case
        return text.lower()

    # Clean every entry of the 'text' column with preprocess_text
    cleaned_text = df['text'].apply(preprocess_text)
    df['text'] = cleaned_text

    # Write the result to KDD/dataset/test_image_data.csv. This is the same
    # path the frame was loaded from, so the input file is overwritten.
    df.to_csv("KDD/dataset/test_image_data.csv", index=False)
    print("Text preprocessing complete and saved.")


CSV file loaded successfully.
Text preprocessing complete and saved.


In [11]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("KDD/dataset/train_image_data.csv")

if 'text' in df.columns:
    # Character count of every entry; values are converted to str first, so
    # non-string cells are measured by their string representation.
    df['text_length'] = df['text'].astype(str).map(len)

    # Row label of the first entry with the maximum length
    longest_row = df['text_length'].idxmax()
    longest_text = df.loc[longest_row, 'text']
    longest_length = df['text_length'].max()

    print("The longest text entry is:")
    print(longest_text)
    print("\nLength of the longest text:", longest_length)
else:
    print("The 'text' column is missing from the CSV file.")

The longest text entry is:
Clean Closed Polypropylene Take Out Food Container Front Side Up

Length of the longest text: 64
