---

# **Data Collection**

## Objectives

* Fetch data from Kaggle and prepare it for further processing.

## Inputs

*   Kaggle JSON file - the authentication token. 

## Outputs

* Generate Dataset: inputs/car_dataset


---

# Import packages

In [21]:
import os

## Change the working directory

In [None]:
# Record the directory the notebook kernel is currently running in.
current_dir = os.getcwd()
# Bare expression: shown as the cell output.
current_dir

In [None]:
# Switch the working directory so the relative paths used in later cells
# ("kaggle.json", "inputs/...") resolve against it.
# NOTE(review): this absolute Windows path is machine-specific - presumably the
# project root on the author's machine; adjust it to your local checkout.
os.chdir('d:\\vscode-projects\\vehicle-type-recognition-v1')
print("You set a new current directory")

In [None]:

# Re-read the working directory to confirm the change made by os.chdir().
current_dir = os.getcwd()
# Bare expression: shown as the cell output.
current_dir

## Verify if `kaggle.json` Exists

In [None]:
# Report whether the Kaggle API token is present in the working directory.
token_present = os.path.exists("kaggle.json")
print(
    "kaggle.json found."
    if token_present
    else "kaggle.json NOT found. Please upload it to the project root."
)

# Install Kaggle

In [None]:
# Install the Kaggle API client, pinned to version 1.5.12.
# %pip is the IPython magic that installs into the running kernel's environment.
%pip install kaggle==1.5.12

---

Run the cell below **to set the Kaggle configuration directory to the current working directory**, so that the Kaggle client reads the authentication token from the `kaggle.json` file located there.

In [27]:
# Point the Kaggle client at the current working directory for its
# configuration, i.e. the place where kaggle.json is expected to be.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

Get the dataset path from the [Kaggle URL](https://www.kaggle.com/datasets/jutrera/stanford-car-dataset-by-classes-folder). When viewing the dataset on Kaggle, take the part of the URL that follows https://www.kaggle.com/ (or https://www.kaggle.com/datasets/ when the URL contains that segment) and assign it to `KaggleDatasetPath`.

![Data Set](../images/dataset.png)

Set the Kaggle Dataset and Download it.

In [None]:
# "<owner>/<dataset-slug>" part of the dataset URL on Kaggle.
KaggleDatasetPath = "jutrera/stanford-car-dataset-by-classes-folder"
# Local folder the archive is downloaded into.
DestinationFolder = "inputs/car_dataset"   
# Shell escape: IPython substitutes {KaggleDatasetPath} and {DestinationFolder}
# with the Python variables above before running the Kaggle CLI.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Unzip the downloaded file.

In [None]:
import zipfile

# Archive downloaded by the Kaggle CLI and the folder it is unpacked into.
zip_path = "inputs/car_dataset/stanford-car-dataset-by-classes-folder.zip"
extract_to = "inputs/car_dataset"

# Unpack every member of the archive; the context manager closes the file.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(path=extract_to)

print("Extraction completed.")

Delete the zip file.

In [None]:
# Remove the downloaded archive if it is still on disk.
zip_path = "inputs/car_dataset/stanford-car-dataset-by-classes-folder.zip"

if not os.path.exists(zip_path):
    print("ZIP file not found. It may have already been deleted.")
else:
    os.remove(zip_path)
    print("ZIP file removed successfully.")

Merge all class folders from the train and test sets into a new folder `inputs/car_dataset/car_images`.

In [None]:
import shutil

# Paths: the extracted train/test trees (their sub-folders are treated as
# class folders below) and the single folder they are merged into.
source_train = "inputs/car_dataset/car_data/car_data/train"
source_test = "inputs/car_dataset/car_data/car_data/test"
destination = "inputs/car_dataset/car_images"

# Create destination if it doesn't exist
os.makedirs(destination, exist_ok=True)

# Function to copy all images from a class folder
def copy_images(source_folder, target_folder):
    """Copy every regular file in ``source_folder`` into ``target_folder``.

    Sub-directories of ``source_folder`` are ignored. Because this function is
    used to merge several source folders into one target, a file name may
    already be taken in ``target_folder``:

    * if the existing file has identical content, the copy is skipped, so
      re-running the merge does not create duplicates;
    * if the content differs, the new file is stored as ``<stem>_<n><ext>``
      (the first free ``n`` starting at 1) instead of overwriting the
      existing file.

    Args:
        source_folder: Folder whose files are copied.
        target_folder: Existing folder that receives the copies.
    """
    import filecmp  # local import: only needed for collision handling

    for img in os.listdir(source_folder):
        src = os.path.join(source_folder, img)
        if not os.path.isfile(src):
            continue

        stem, ext = os.path.splitext(img)
        candidate = os.path.join(target_folder, img)
        counter = 0
        already_present = False

        # Walk through "<stem>", "<stem>_1", "<stem>_2", ... until a free name
        # or an identical copy of the source file is found.
        while os.path.exists(candidate):
            if filecmp.cmp(src, candidate, shallow=False):
                already_present = True
                break
            counter += 1
            candidate = os.path.join(target_folder, f"{stem}_{counter}{ext}")

        if not already_present:
            shutil.copy(src, candidate)

# Function to merge the class folders of one tree into another
def move_or_merge_classes(src_dir, dst_dir):
    """Copy the files of every class folder in ``src_dir`` into ``dst_dir``.

    Each sub-directory of ``src_dir`` is treated as a class folder: a folder
    with the same name is created under ``dst_dir`` if needed and the class
    folder's files are copied into it via ``copy_images``. Entries of
    ``src_dir`` that are not directories are ignored, and the source tree is
    left in place.
    """
    for entry in os.listdir(src_dir):
        origin = os.path.join(src_dir, entry)
        if os.path.isdir(origin):
            target = os.path.join(dst_dir, entry)
            os.makedirs(target, exist_ok=True)
            copy_images(origin, target)

# Merge the class folders of the train tree, then the test tree, into destination
for split_root in (source_train, source_test):
    move_or_merge_classes(split_root, destination)

print("All class folders merged into:", destination)

Move the .csv files to a new folder `inputs/car_dataset/csv`.

In [None]:
# Gather the dataset's CSV files into their own sub-folder.
base_dir = "inputs/car_dataset"
csv_dir = os.path.join(base_dir, "csv")
os.makedirs(csv_dir, exist_ok=True)

csv_files = ["anno_train.csv", "anno_test.csv", "names.csv"]

# Move whichever of the expected files are present; missing ones are skipped.
for file_name in csv_files:
    src = os.path.join(base_dir, file_name)
    if not os.path.exists(src):
        continue
    shutil.move(src, os.path.join(csv_dir, file_name))


Remove the folder `inputs/car_dataset/car_data`.

In [None]:
# Folder holding the originally extracted train/test trees.
car_data_path = "inputs/car_dataset/car_data"

# isdir() is already False for a missing path, so no separate existence
# check is needed before deleting the whole tree.
if os.path.isdir(car_data_path):
    shutil.rmtree(car_data_path)
    print("'car_data' folder removed successfully.")
else:
    print("'car_data' folder not found or already deleted.")

---

# Data Preparation

---

## Data cleaning

### Check and remove non-image files

In [35]:
def remove_non_image_files(my_data_dir):
    """Delete files without an image extension from every class folder.

    Each sub-directory of ``my_data_dir`` is scanned (one level deep). A
    regular file whose name does not end in ``.png``, ``.jpg`` or ``.jpeg``
    (case-insensitive) is deleted. Entries of ``my_data_dir`` that are not
    directories, and nested sub-directories inside a class folder, are left
    untouched and not counted.

    Prints a one-line summary with the number of image files kept and the
    number of non-image files removed.

    Args:
        my_data_dir: Folder containing one sub-folder per class.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')

    image_count = 0
    non_image_count = 0

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        if not os.path.isdir(folder_path):
            continue  # Skip non-folder entries

        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if not os.path.isfile(file_path):
                # os.remove() raises on directories, so nested folders are skipped.
                continue
            if file.lower().endswith(image_extensions):
                image_count += 1
            else:
                os.remove(file_path)
                non_image_count += 1

    print(f"{image_count} image files, {non_image_count} non-image files removed.")


In [None]:
# Clean the merged image folder: drop anything that is not a .png/.jpg/.jpeg file.
remove_non_image_files(my_data_dir="inputs/car_dataset/car_images")

## Split train validation test set

In [37]:
import os
import shutil
import random

def split_train_validation_test_images_inplace(source_dir, train_ratio, val_ratio, test_ratio, seed=None):
    """Split the class folders of ``source_dir`` into train/validation/test.

    Every sub-directory of ``source_dir`` other than ``train``, ``validation``
    and ``test`` is treated as a class. Its entries are shuffled and moved to
    ``source_dir/<subset>/<class>/``: the first ``int(n * train_ratio)`` go to
    ``train``, the next ``int(n * val_ratio)`` to ``validation`` and all
    remaining ones to ``test``. A class folder that ends up empty is removed.

    If the ratios do not sum to 1.0 (rounded to two decimals), a message is
    printed and nothing is changed.

    Args:
        source_dir: Folder containing one sub-folder per class.
        train_ratio: Fraction of each class assigned to the training set.
        val_ratio: Fraction of each class assigned to the validation set.
        test_ratio: Fraction of each class assigned to the test set.
        seed: Optional seed for the shuffle. With a seed, the same folder
            contents always produce the same split; with ``None`` (default)
            the module-level ``random`` state is used, as before.
    """
    # Check if ratios sum to 1.0
    if round(train_ratio + val_ratio + test_ratio, 2) != 1.0:
        print("Ratios must sum to 1.0")
        return

    subsets = ('train', 'validation', 'test')

    # A dedicated generator keeps a seeded split independent of (and without
    # side effects on) the global random state.
    rng = random.Random(seed) if seed is not None else random

    # Get class labels, excluding split folders
    class_labels = [folder for folder in os.listdir(source_dir)
                    if os.path.isdir(os.path.join(source_dir, folder))
                    and folder not in subsets]

    # Create split folders
    for subset in subsets:
        for label in class_labels:
            os.makedirs(os.path.join(source_dir, subset, label), exist_ok=True)

    # Move images and remove empty folders
    for label in class_labels:
        class_path = os.path.join(source_dir, label)
        # os.listdir() returns entries in arbitrary order; sorting first makes
        # the shuffle outcome depend only on the random state.
        images = sorted(os.listdir(class_path))
        rng.shuffle(images)

        n_total = len(images)
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)

        for idx, image in enumerate(images):
            if idx < n_train:
                subset = 'train'
            elif idx < n_train + n_val:
                subset = 'validation'
            else:
                # Everything left over by the truncating int() above lands here.
                subset = 'test'

            shutil.move(os.path.join(class_path, image),
                        os.path.join(source_dir, subset, label, image))

        # Remove the now-empty original class folder
        if os.path.exists(class_path) and not os.listdir(class_path):
            os.rmdir(class_path)

    print("All classes processed and original folders removed.")


A conventional split is used:
* The training set receives 0.70 of the data.
* The validation set receives 0.10 of the data.
* The test set receives 0.20 of the data.

In [None]:
# Split every class folder in place into train/validation/test sub-folders
# using a 70% / 10% / 20% ratio.
split_train_validation_test_images_inplace(
    source_dir="inputs/car_dataset/car_images",
    train_ratio=0.7,
    val_ratio=0.1,
    test_ratio=0.2
)

---