# Convert the annotations CSV file (train_labels.csv) to the annotations.json format required by TensorFlow

### Import pandas library

In [17]:
import pandas as pd

### Input file : 

In [18]:
pwd

'/root/data transformation'

In [19]:
# Location of the CSV file that holds the training annotations (read into a DataFrame below)
train_labels_path = "/root/PlantDoc-Object-Detection-Dataset/train_labels.csv"

In [20]:
pwd

'/root/data transformation'

In [21]:
# Read the annotation table from the CSV file into a DataFrame
train_labels_df = pd.read_csv(filepath_or_buffer=train_labels_path)

In [22]:
train_labels_df.shape

(8461, 8)

In [23]:
train_labels_df.head()

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,198,77,299,252
1,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,3,114,148,235
2,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,30,184,189,297
3,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,226,4,346,83
4,peach-and-leaf-stock-image-2809275.jpg,1300,1099,Peach leaf,237,479,527,810


In [24]:
train_labels_df.dtypes

filename    object
width        int64
height       int64
class       object
xmin         int64
ymin         int64
xmax         int64
ymax         int64
dtype: object

In [25]:
# Keep only the rows whose recorded image size is non-zero in BOTH dimensions.
# Testing the height alone would let a row with width == 0 slip through.
has_nonzero_size = (train_labels_df["width"] != 0) & (train_labels_df["height"] != 0)
train_labels_df = train_labels_df[has_nonzero_size]

In [26]:
train_labels_df.shape

(8457, 8)

### Output format : 
``` 
The annotations.json file should contain information for bounding boxes and their class labels in the form of a dictionary with "images" and "annotations" keys. The value for the "images" key should be a list of dictionaries. There should be one dictionary for each image with the following information: {"file_name": image_name, "height": height, "width": width, "id": image_id}. The value for the "annotations" key should also be a list of dictionaries. There should be one dictionary for each bounding box with the following information: {"image_id": image_id, "bbox": [xmin, ymin, xmax, ymax], "category_id": bbox_label}.
```

### Adding "id" column : 

In [27]:
train_labels_df.head()

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,198,77,299,252
1,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,3,114,148,235
2,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,30,184,189,297
3,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,226,4,346,83
4,peach-and-leaf-stock-image-2809275.jpg,1300,1099,Peach leaf,237,479,527,810


In [28]:
train_labels_df["filename"].drop_duplicates().reset_index(drop=True)

0            cherry-tree-leaves-and-fruits.jpg
1       peach-and-leaf-stock-image-2809275.jpg
2                         foodjuly2011+026.jpg
3                                     NCLB.jpg
4                        applerust-500x383.jpg
                         ...                  
2330                               020.JPG.jpg
2331         6134794031202304-600x272.jpeg.jpg
2332       pddl-highlights-fig-1-bact-spot.jpg
2333           bacterialLeafSpot07-2jqdlmz.jpg
2334             2159_0.jpeg?itok=eBFRbolm.jpg
Name: filename, Length: 2335, dtype: object

In [29]:
# Lookup table with one row per distinct file name.
# After reset_index(drop=True) the row labels run 0..n-1, so the index
# itself is reused as the numeric "image_id" of each file.
new_df = (
    train_labels_df[["filename"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(image_id=lambda unique_files: unique_files.index)
)

In [30]:
new_df.head()

Unnamed: 0,filename,image_id
0,cherry-tree-leaves-and-fruits.jpg,0
1,peach-and-leaf-stock-image-2809275.jpg,1
2,foodjuly2011+026.jpg,2
3,NCLB.jpg,3
4,applerust-500x383.jpg,4


In [31]:
# Attach the per-file "image_id" to every annotation row by joining on the file name
train_labels_df = train_labels_df.merge(right=new_df, how="inner", on="filename")

In [32]:
# "image_id" is the key used by the dictionaries in the "annotations" list of
# the final JSON file. The dictionaries in the "images" list carry the same
# value under the key "id", so the column is duplicated under that name.
train_labels_df = train_labels_df.assign(id=train_labels_df["image_id"])

In [33]:
train_labels_df.head()

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax,image_id,id
0,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,198,77,299,252,0,0
1,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,3,114,148,235,0,0
2,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,30,184,189,297,0,0
3,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,226,4,346,83,0,0
4,peach-and-leaf-stock-image-2809275.jpg,1300,1099,Peach leaf,237,479,527,810,1,1


### Renaming columns to fit TensorFlow requirements:

In [34]:
# Switch to the key names used in the output JSON
column_renames = {"filename": "file_name", "class": "category_id"}
train_labels_df = train_labels_df.rename(column_renames, axis="columns")
train_labels_df.head()

Unnamed: 0,file_name,width,height,category_id,xmin,ymin,xmax,ymax,image_id,id
0,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,198,77,299,252,0,0
1,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,3,114,148,235,0,0
2,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,30,184,189,297,0,0
3,cherry-tree-leaves-and-fruits.jpg,350,300,Cherry leaf,226,4,346,83,0,0
4,peach-and-leaf-stock-image-2809275.jpg,1300,1099,Peach leaf,237,479,527,810,1,1


### Adding bounding box column : 

In [35]:
# Pack the four box coordinates of each row into a single list-valued column:
# "bbox": [xmin, ymin, xmax, ymax]
corner_columns = ["xmin", "ymin", "xmax", "ymax"]
train_labels_df["bbox"] = train_labels_df[corner_columns].values.tolist()

### Preparing the json output : 

In [36]:
# Container for the output; it will hold two lists: "images" and "annotations"
json_format = dict()

In [37]:
# Build the "images" list:
#   1. keep only the image-level columns
#   2. drop duplicate rows
#   3. convert to a list of dictionaries (orient "records": one dictionary per row)
image_columns = ["file_name", "width", "height", "id"]
image_records = train_labels_df.loc[:, image_columns].drop_duplicates()
json_format["images"] = image_records.to_dict(orient="records")

In [38]:
# Build the "annotations" list: one dictionary per row of the frame (no de-duplication)
annotation_columns = ["image_id", "bbox", "category_id"]
json_format["annotations"] = train_labels_df.loc[:, annotation_columns].to_dict(orient="records")

In [39]:
json_format.keys()

dict_keys(['images', 'annotations'])

In [40]:
json_format["images"][0].keys()

dict_keys(['file_name', 'width', 'height', 'id'])

In [41]:
json_format["annotations"][0].keys()

dict_keys(['image_id', 'bbox', 'category_id'])

### Creating the annotations.json file : 

In [42]:
import json

In [43]:
# Serialise the assembled dictionary to annotations.json in the current working directory
with open("annotations.json", mode="w") as json_data_file:
    json.dump(obj=json_format, fp=json_data_file)

In [44]:
 pwd

'/root/data transformation'

In [45]:
import json 

# Sanity check on the written file: print every image record whose width or
# height is zero.
# The two comparisons are joined with `or`. Writing `w == 0 & h == 0` does not
# work: `&` binds tighter than `==`, so it is parsed as the chained comparison
# `w == (0 & h) == 0`, which only ever tests the width.
with open("annotations.json") as f:
    data = json.load(f)

for item in data['images']:
    if item['width'] == 0 or item['height'] == 0:
        print(item)

In [57]:
!aws s3 cp "/root/data transformation/annotations.json" s3://plant-disease-detection-datasets/input_directory/annotations.json

upload: ./annotations.json to s3://plant-disease-detection-datasets/input_directory/annotations.json


In [56]:
import os
import json
import glob 

# Directory that is scanned for image files
source_path = "/root/PlantDoc-Object-Detection-Dataset/TRAIN"
# Paths of all entries directly inside source_path that match "*.jpg"
image_paths = list(glob.iglob(os.path.join(source_path, "*.jpg")))

def find_missing_images(image_path, annotations_file, extensions=('.jpg', '.png')):
    """Return the image files of a folder that have no entry in the annotations file.

    Args:
        image_path: Directory whose direct entries are scanned (via
            ``os.listdir``; sub-directories are not descended into).
        annotations_file: Path to a JSON file containing an ``"images"`` list
            whose entries each have a ``"file_name"`` key.
        extensions: File-name suffix, or iterable of suffixes, that identify
            an image. Matching is case-sensitive. Defaults to
            ``('.jpg', '.png')``.

    Returns:
        set of str: the names found in ``image_path`` that end with one of
        ``extensions`` and do not appear as a ``"file_name"`` in the
        annotations file.
    """
    # Load the annotations file
    with open(annotations_file, 'r') as file:
        annotations_data = json.load(file)

    # File names that do have an annotation entry (a set gives O(1) lookups)
    annotated_names = {entry['file_name'] for entry in annotations_data['images']}

    # str.endswith needs a str or a tuple of str; a bare string is kept as a
    # single suffix rather than being split into characters.
    if isinstance(extensions, str):
        suffixes = (extensions,)
    else:
        suffixes = tuple(extensions)

    # Image files present in the folder but absent from the annotations
    return {
        name
        for name in os.listdir(image_path)
        if name.endswith(suffixes) and name not in annotated_names
    }


# Path to the annotations file
annotations_file_path = "/root/data transformation/annotations.json"

# Image files found in source_path that have no entry in the annotations file
missing_images = find_missing_images(
    image_path=source_path,
    annotations_file=annotations_file_path,
)


In [55]:
missing_images

{'270412tglr-wild-strawberry-flowers-and-leaf-patch.jpg',
 '2f73110f80014a25a53f9551c94bf164.png.jpg',
 'Black+Raspberry+Leaves+3.jpg',
 'Downy%20mildew.JPG.jpg',
 'Early-blight-example.ashx?mw=250.jpg',
 'Hydrangea+%2527Claudie%2527%252C+Powdery+Mildew.JPG.jpg',
 'SouthernRustLeaf.png.jpg',
 'Tomato%20physiologic%20leaf%20roll1F.JPG.jpg',
 'ac-0018.pdf-2_2.jpg',
 'aquilegia-powdery-mildew-erysiphe-aquilegiae-on-columbine-leaves-bga5xr.jpg',
 'early-blight-1-la_potatoleaf.jpg',
 'pm1_600px.jpg',
 'powdery-mildew-erysiphe-plantani-on-young-sycamore-leaf-b372wn.jpg',
 'ppth-friskop-1-corn-rust.png.jpg',
 'raspberries.jpg',
 'raspberry-db.jpg',
 'southernrust1.png.jpg',
 'tdisease_1.jpg',
 'tomato_D4a-TobRingspotVirus-1000077_zoom.jpg'}