# Capstone Project 

### Install dependencies with pip:

pip install torch torchvision matplotlib pycocotools

### Import necessary libraries

In [3]:
import os
import json
import random
import shutil
from pycocotools.coco import COCO
import zipfile


### Initialize COCO API for instance annotations

In [14]:
# Root of the local COCO download -- adjust this to your data directory.
dataDir = r'C:\Users\muhri\Untitled Folder 1\ML'
# Name of the COCO split; used for both the image folder and the annotation file.
dataType = 'val2017'
# The annotation files live in the 'annotations' folder under the data directory.
annDir = os.path.join(dataDir, 'annotations')
# Build the path with os.path.join instead of a hand-written '\' separator:
# '\i' inside a normal string literal is an invalid escape sequence.
annFile = os.path.join(annDir, 'instances_{}.json'.format(dataType))

# Load the instance annotations and build the COCO lookup index.
coco = COCO(annFile)


loading annotations into memory...
Done (t=0.79s)
creating index...
index created!


In [16]:

# Look up every category once and map its id to its name.
cats = coco.loadCats(coco.getCatIds())
categories = {entry['id']: entry['name'] for entry in cats}

# Per-category tally of images whose annotations all share that one category.
counter = dict.fromkeys(categories.values(), 0)

for img_id in coco.getImgIds():
    anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
    distinct_cat_ids = {ann['category_id'] for ann in anns}
    # Skip images with no annotations or with more than one category.
    if len(distinct_cat_ids) != 1:
        continue
    (only_cat_id,) = distinct_cat_ids
    counter[categories[only_cat_id]] += 1

# Print the categories that have more than 30 single-category images.
for category, count in counter.items():
    if count > 30:
        print(f'{category}: {count}')

airplane: 49
train: 49
bird: 51
sheep: 31
cow: 47
elephant: 48
bear: 41
zebra: 70
giraffe: 72
toilet: 47
clock: 68


### Get all image ids and filter them



In [12]:
# `random` is imported in the import cell at the top of the notebook.

object_categories = ['bicycle', 'car', 'motorcycle', 'stop sign', 'cat', 'dog', 'backpack', 'umbrella', 
                    'handbag', 'bottle', 'cup', 'tv', 'laptop', 'oven', 'scissors']

object_cat_ids = coco.getCatIds(catNms=object_categories)

# Collect the images whose annotations all belong to exactly one of the
# selected categories. A set is used so an id can never be recorded twice.
selected_img_ids = set()
for cat_id in object_cat_ids:
    for img_id in coco.getImgIds(catIds=cat_id):
        anns = coco.loadAnns(coco.getAnnIds(imgIds=img_id))
        # Keep the image only if this category is the sole one annotated in it.
        if {ann['category_id'] for ann in anns} == {cat_id}:
            selected_img_ids.add(img_id)

# Sort before shuffling so the starting order is deterministic; calling
# random.seed(...) beforehand then yields a reproducible split.
object_img_ids = sorted(selected_img_ids)
random.shuffle(object_img_ids)

# Use at most 700 images; fewer are accepted when the dataset has fewer matches.
num_images_required = min(len(object_img_ids), 700)
# The previous assert compared the count against min(count, 700) and so could
# never fail. Check for the one case that makes the split meaningless instead.
if num_images_required == 0:
    raise ValueError("Not enough images with the given categories. Only found 0 images.")

# Split the images into train and test datasets: the first 70% go to training,
# the remainder (up to num_images_required) go to testing.
split_index = int(num_images_required * 0.7)
train_ids = object_img_ids[:split_index]
test_ids = object_img_ids[split_index:num_images_required]



### Copy selected images to a new directory

In [13]:
import zipfile

train_img_dir = 'train_images'
test_img_dir = 'test_images'


def _copy_split_images(img_ids, dest_dir):
    """Copy the source images for ``img_ids`` into ``dest_dir``.

    The directory is created if needed. Returns the list of destination paths
    that were written, in the order of ``img_ids``.
    """
    os.makedirs(dest_dir, exist_ok=True)
    copied_paths = []
    for img_info in coco.loadImgs(img_ids):
        src_path = os.path.join(dataDir, dataType, img_info['file_name'])
        dst_path = os.path.join(dest_dir, img_info['file_name'])
        shutil.copy(src_path, dst_path)
        copied_paths.append(dst_path)
    return copied_paths


def _zip_files(paths, base_dir, zip_path):
    """Write ``paths`` into a new deflated archive at ``zip_path``.

    Entry names are taken relative to the parent of ``base_dir``, so every
    entry keeps ``base_dir`` as its leading folder inside the archive.
    """
    parent_dir = os.path.join(base_dir, '..')
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for path in paths:
            zipf.write(path, os.path.relpath(path, parent_dir))


train_files = _copy_split_images(train_ids, train_img_dir)
test_files = _copy_split_images(test_ids, test_img_dir)

# zip automatically
# Only the files copied above are archived. Walking the whole directory would
# also pick up images left behind by an earlier run with a different random
# split, mixing stale (and possibly test-set) images into the archive.
_zip_files(train_files, train_img_dir, 'train_images.zip')
_zip_files(test_files, test_img_dir, 'test_images.zip')


### Extract corresponding annotations and save to new json files

In [None]:
def filter_annotations(coco, img_ids, save_path, cat_ids=None):
    """Write the metadata of ``img_ids``, tagged with category names, to JSON.

    Each image's metadata record is copied and given an extra ``'categories'``
    key holding the category names of its matching annotations. There is one
    entry per annotation, so a name repeats when an image contains several
    instances of the same category. Images without a matching annotation get
    an empty list. The list of records is dumped to ``save_path``.

    Args:
        coco: COCO API object providing ``getAnnIds``, ``loadAnns``,
            ``loadCats`` and ``loadImgs``.
        img_ids: Ids of the images to export.
        save_path: Path of the JSON file to write.
        cat_ids: Category ids whose annotations are considered. Defaults to
            the module-level ``object_cat_ids``.
    """
    if cat_ids is None:
        cat_ids = object_cat_ids

    anns = coco.loadAnns(coco.getAnnIds(imgIds=img_ids, catIds=cat_ids))
    cat_names = {cat['id']: cat['name'] for cat in coco.loadCats(cat_ids)}

    # Group the category names of the annotations by the image they belong to.
    names_by_image = {}
    for ann in anns:
        names_by_image.setdefault(ann['image_id'], []).append(cat_names[ann['category_id']])

    # Build new dicts rather than adding a key to the records returned by
    # loadImgs: those may be the very objects stored in the COCO index, and
    # mutating them would leak 'categories' into every later lookup.
    img_metas = [
        {**img, 'categories': names_by_image.get(img['id'], [])}
        for img in coco.loadImgs(img_ids)
    ]

    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(img_metas, f)

# Export one annotation file per split, training split first.
for split_ids, split_path in ((train_ids, 'train_annotations.json'),
                              (test_ids, 'test_annotations.json')):
    filter_annotations(coco, split_ids, split_path)


### Unzip the files after downloading them from GitHub

In [None]:
def unzip_file(zip_path, output_path):
    """Extract every member of the archive at ``zip_path`` into ``output_path``."""
    archive = zipfile.ZipFile(zip_path, 'r')
    with archive:
        archive.extractall(output_path)

# Extract each downloaded archive into the folder named after it.
# NOTE(review): if these are the archives produced by the zip step earlier in
# this notebook, their entries already start with 'train_images/' or
# 'test_images/', so the files end up in 'train_images/train_images/...' and
# 'test_images/test_images/...'. Confirm that downstream code expects this.
for archive_path, target_dir in (('train_images.zip', 'train_images'),
                                 ('test_images.zip', 'test_images')):
    unzip_file(archive_path, target_dir)
