<a href="https://colab.research.google.com/github/ssegovba/identifying-deforestation/blob/main/segmentation_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For segmentation we take all classes in our data set, assign them a label, and then each label will coresponds to a color and the images will be reduced to a color map with each color representing a class.

Tha goal is to identify Deforesstation through the presence of agriculture (since trees are cut down to clear land for farms). Initially the plan was to keep the same classes; forrest coverage is a class, argiculture coverage is a class, and then the additional classes for water, roads, etc(anything that isnt forrest). However, this data is not tagged for segmentation so it needs to be done manually.

Since we are only interested in how much forrest coverage there is, separating classes of man-made object i.e. mines, agriculture, etc. isnt necessary since they all indicate deforrestation. Therefore, we decided to have two classes and have a binary mask apllied to the segmentation training images.  

For optimized trainings, we want to include training images that contain as many classes as possible while also keeping a balance of images that are tagged clear, partly-cloudy, and haze so that the model can learn to identify deforestation regardless of weather conditions. In this script we select 200 training images based on this criteria.


In [3]:
import os
import glob
import shutil
#import keras
import sklearn
#import tensorflow
import numpy as np
import pandas as pd
from osgeo import gdal
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from matplotlib import pyplot as plt
%matplotlib inline
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
import json
from google.colab import drive

In [4]:
# Mount GDrive
drive.mount("/content/drive", force_remount=True)


Mounted at /content/drive


In [6]:
# Load data
data_path = "/content/drive/Shareddrives/computer-vision-project/Data/Unzipped/planet/planet/"
train_path = "/content/drive/Shareddrives/computer-vision-project/Data/Unzipped/planet/planet/train-jpg"
test_path = "/content/drive/Shareddrives/computer-vision-project/Data/Unzipped/planet/planet/test-jpg"

labels_df = pd.read_csv(data_path + "train_classes.csv")
print(labels_df)


        image_name                                           tags
0          train_0                                   haze primary
1          train_1                agriculture clear primary water
2          train_2                                  clear primary
3          train_3                                  clear primary
4          train_4      agriculture clear habitation primary road
...            ...                                            ...
40474  train_40474                                  clear primary
40475  train_40475                                         cloudy
40476  train_40476                      agriculture clear primary
40477  train_40477                 agriculture clear primary road
40478  train_40478  agriculture cultivation partly_cloudy primary

[40479 rows x 2 columns]


In [7]:
# Add extension so image names match file names
labels_df['image_name'] = labels_df['image_name'].apply(lambda x: x + '.jpg')

# Convert the space-separated tags into a list of tags
labels_df['tags'] = labels_df['tags'].apply(lambda x: x.split())

# Use a MultiLabelBinarizer for the tags for model training
mlb = MultiLabelBinarizer()
labels_df['encoded_tags'] = list(mlb.fit_transform(labels_df['tags']))

print(labels_df)

            image_name                                               tags  \
0          train_0.jpg                                    [haze, primary]   
1          train_1.jpg               [agriculture, clear, primary, water]   
2          train_2.jpg                                   [clear, primary]   
3          train_3.jpg                                   [clear, primary]   
4          train_4.jpg    [agriculture, clear, habitation, primary, road]   
...                ...                                                ...   
40474  train_40474.jpg                                   [clear, primary]   
40475  train_40475.jpg                                           [cloudy]   
40476  train_40476.jpg                      [agriculture, clear, primary]   
40477  train_40477.jpg                [agriculture, clear, primary, road]   
40478  train_40478.jpg  [agriculture, cultivation, partly_cloudy, prim...   

                                            encoded_tags  
0      [0, 0, 0,

In [8]:
# function to print all classes in an image to assist with manual tagging
def get_image_classes(image_name):
    # Filter the DataFrame for the row where the image_name matches
    image_classes = labels_df[labels_df['image_name'] == image_name]

    # Check if the image_name exists in the DataFrame
    if not image_classes.empty:
        # Return the tags associated with the image
        return image_classes['tags'].values[0]
    else:
        return "Image name not found"

# image_name = "train_16352.jpg" is crazy
image_name = "train_38754.jpg"
print(get_image_classes(image_name))

['agriculture', 'bare_ground', 'clear', 'cultivation', 'habitation', 'primary', 'road', 'water']


In [9]:
# list of all possible tags
unique_tags = set(tag for sublist in labels_df['tags'] for tag in sublist)
unique_tags = sorted(list(unique_tags))
print(unique_tags)

['agriculture', 'artisinal_mine', 'bare_ground', 'blooming', 'blow_down', 'clear', 'cloudy', 'conventional_mine', 'cultivation', 'habitation', 'haze', 'partly_cloudy', 'primary', 'road', 'selective_logging', 'slash_burn', 'water']


In [10]:
labels_df['tag_count'] = labels_df['tags'].apply(len)
max_tags = labels_df['tag_count'].max()
print("The maximum number of tags in a single image is:", max_tags)

The maximum number of tags in a single image is: 9


In [11]:
# Flatten the list of tags from these filtered images and count each tag
tag_counts = {}
for tags_list in labels_df['tags']:
    for tag in tags_list:
        if tag in tag_counts:
            tag_counts[tag] += 1
        else:
            tag_counts[tag] = 1

# Convert the dictionary to a list of tuples sorted by the number of occurrences, descending
sorted_tag_counts = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)

# Print the sorted list of tags and their counts
for tag, count in sorted_tag_counts:
    print(f"Tag: {tag}, Count: {count}")

Tag: primary, Count: 37513
Tag: clear, Count: 28431
Tag: agriculture, Count: 12315
Tag: road, Count: 8071
Tag: water, Count: 7411
Tag: partly_cloudy, Count: 7261
Tag: cultivation, Count: 4547
Tag: habitation, Count: 3660
Tag: haze, Count: 2697
Tag: cloudy, Count: 2089
Tag: bare_ground, Count: 862
Tag: selective_logging, Count: 340
Tag: artisinal_mine, Count: 339
Tag: blooming, Count: 332
Tag: slash_burn, Count: 209
Tag: blow_down, Count: 101
Tag: conventional_mine, Count: 100


In [12]:
tag_counts = {}

# Loop through the numbers 1 to 9 and count the occurrences of each tag count
for i in range(1, 10):
    tag_counts[i] = (labels_df['tag_count'] == i).sum()

for tag_count, num_images in tag_counts.items():
    print(f"Number of images with exactly {tag_count} tags: {num_images}")

Number of images with exactly 1 tags: 2091
Number of images with exactly 2 tags: 19176
Number of images with exactly 3 tags: 7192
Number of images with exactly 4 tags: 7171
Number of images with exactly 5 tags: 3700
Number of images with exactly 6 tags: 996
Number of images with exactly 7 tags: 127
Number of images with exactly 8 tags: 24
Number of images with exactly 9 tags: 2


The number of images with 7-9 tags(the highest number of tage images can have) is 153 so we will pull a list of those images into a seperate data frame. The number of imgaes with the next highest tag count(6) is 996. This is too many. So we will first check how representative the classes/tags are amoung images with the most tags, and randomly select images that have the unrepresented tags from the pool that has 6 tags.

In [13]:
filtered_df = labels_df[labels_df['tag_count'].isin([7, 8, 9])]

# Flatten the list of tags from these filtered images and count each tag
tag_counts = {}
for tags_list in filtered_df['tags']:
    for tag in tags_list:
        if tag in tag_counts:
            tag_counts[tag] += 1
        else:
            tag_counts[tag] = 1

# Convert the dictionary to a list of tuples sorted by the number of occurrences, descending
sorted_tag_counts = sorted(tag_counts.items(), key=lambda item: item[1], reverse=True)

# Print the sorted list of tags and their counts
for tag, count in sorted_tag_counts:
    print(f"Tag: {tag}, Count: {count}")


Tag: cultivation, Count: 162
Tag: primary, Count: 152
Tag: road, Count: 147
Tag: agriculture, Count: 144
Tag: habitation, Count: 144
Tag: clear, Count: 139
Tag: water, Count: 122
Tag: bare_ground, Count: 30
Tag: artisinal_mine, Count: 16
Tag: partly_cloudy, Count: 12
Tag: slash_burn, Count: 10
Tag: selective_logging, Count: 9
Tag: conventional_mine, Count: 9
Tag: haze, Count: 2
Tag: blooming, Count: 1


The goal is aroughly 66/66/66 split between the three mutuallly exclusive classes. Clear is overrepresented, and partly_cloudy and haze are very underrepresented. So we will randomly drop some of the images tagged clear from the list, and select images from the pool of images that have 6 tags that are tagged partly_cloudy or haze. If this pool does not have enough we will move to randomly select from the pool of images with 5 tags total.

In [14]:
# Filter the DataFrame to find rows where 'tag_count' is 6 and 'tags' contains 'haze'
tag6_haze_df = labels_df[(labels_df['tag_count'] == 6) & (labels_df['tags'].apply(lambda x: 'haze' in x))]

print(len(tag6_haze_df))


34


In [15]:
# Flatten the list of tags from these filtered images and count each tag
tag_counts_tag6 = {}
for tags_list in tag6_haze_df['tags']:
    for tag in tags_list:
        if tag in tag_counts_tag6:
            tag_counts_tag6[tag] += 1
        else:
            tag_counts_tag6[tag] = 1

# Convert the dictionary to a list of tuples sorted by the number of occurrences, descending
sorted_tag_counts6 = sorted(tag_counts_tag6.items(), key=lambda item: item[1], reverse=True)

# Print the sorted list of tags and their counts
for tag, count in sorted_tag_counts6:
    print(f"Tag: {tag}, Count: {count}")

Tag: haze, Count: 34
Tag: primary, Count: 34
Tag: agriculture, Count: 33
Tag: cultivation, Count: 31
Tag: road, Count: 30
Tag: habitation, Count: 22
Tag: water, Count: 17
Tag: bare_ground, Count: 3


In [16]:
# The number of images from the 6 tags pool with the haze tag is 34,
# which gets us to 36/66 so we will merge all of those and then randomly select
# from the pool with 5 tags.
merged_df = pd.concat([filtered_df, tag6_haze_df], ignore_index=True)
print(len(merged_df))
print(merged_df)


187
          image_name                                               tags  \
0      train_185.jpg  [agriculture, artisinal_mine, clear, cultivati...   
1      train_222.jpg  [agriculture, clear, cultivation, habitation, ...   
2      train_247.jpg  [agriculture, clear, cultivation, habitation, ...   
3      train_680.jpg  [agriculture, clear, cultivation, habitation, ...   
4      train_728.jpg  [agriculture, clear, cultivation, habitation, ...   
..               ...                                                ...   
182  train_33452.jpg  [cultivation, habitation, haze, primary, road,...   
183  train_33860.jpg  [agriculture, habitation, haze, primary, road,...   
184  train_35249.jpg  [agriculture, cultivation, haze, primary, road...   
185  train_37481.jpg  [agriculture, cultivation, habitation, haze, p...   
186  train_38283.jpg  [agriculture, cultivation, haze, primary, road...   

                                          encoded_tags  tag_count  
0    [1, 1, 0, 0, 0, 1, 0, 

In [17]:
# Flatten the list of tags from these filtered images and count each tag
tag_countsm = {}
for tags_list in merged_df['tags']:
    for tag in tags_list:
        if tag in tag_countsm:
            tag_countsm[tag] += 1
        else:
            tag_countsm[tag] = 1

# Convert the dictionary to a list of tuples sorted by the number of occurrences, descending
sorted_tag_countsm = sorted(tag_countsm.items(), key=lambda item: item[1], reverse=True)

# Print the sorted list of tags and their counts
for tag, count in sorted_tag_countsm:
    print(f"Tag: {tag}, Count: {count}")

Tag: cultivation, Count: 193
Tag: primary, Count: 186
Tag: agriculture, Count: 177
Tag: road, Count: 177
Tag: habitation, Count: 166
Tag: clear, Count: 139
Tag: water, Count: 139
Tag: haze, Count: 36
Tag: bare_ground, Count: 33
Tag: artisinal_mine, Count: 16
Tag: partly_cloudy, Count: 12
Tag: slash_burn, Count: 10
Tag: selective_logging, Count: 9
Tag: conventional_mine, Count: 9
Tag: blooming, Count: 1


In [18]:
# Randomly drop excess images tagged clear but exclude images if they contain a
# under represented tag.
excluded_tags = {'bare_ground', 'artisinal_mine', 'slash_burn', 'selective_logging', 'conventional_mine', 'blooming'}

# Filter rows where 'clear' is in the tags and none of the excluded tags are present
rows_to_drop = merged_df[merged_df['tags'].apply(lambda x: 'clear' in x and not any(tag in x for tag in excluded_tags))]

# Sample 70 rows from the filtered DataFrame
if len(rows_to_drop) > 70:
    rows_to_drop = rows_to_drop.sample(n=70, random_state=1)  # Using a fixed seed for reproducibility

# Drop these rows from the original DataFrame
clean_merged_df = merged_df.drop(rows_to_drop.index)

print(clean_merged_df)

          image_name                                               tags  \
0      train_185.jpg  [agriculture, artisinal_mine, clear, cultivati...   
2      train_247.jpg  [agriculture, clear, cultivation, habitation, ...   
3      train_680.jpg  [agriculture, clear, cultivation, habitation, ...   
4      train_728.jpg  [agriculture, clear, cultivation, habitation, ...   
7     train_1340.jpg  [agriculture, artisinal_mine, clear, cultivati...   
..               ...                                                ...   
182  train_33452.jpg  [cultivation, habitation, haze, primary, road,...   
183  train_33860.jpg  [agriculture, habitation, haze, primary, road,...   
184  train_35249.jpg  [agriculture, cultivation, haze, primary, road...   
185  train_37481.jpg  [agriculture, cultivation, habitation, haze, p...   
186  train_38283.jpg  [agriculture, cultivation, haze, primary, road...   

                                          encoded_tags  tag_count  
0    [1, 1, 0, 0, 0, 1, 0, 0, 1

In [19]:
# Filter the DataFrame to find rows where 'tag_count' is 6 and 'tags' contains 'haze'
tag6_part_cloud_df = labels_df[(labels_df['tag_count'] == 6) & (labels_df['tags'].apply(lambda x: 'partly_cloudy' in x))]

print(len(tag6_part_cloud_df))


136


In [20]:
# The pool of images with 6 total tags contains 136 images tagged partly_cloudy
# so we will now randomly select 57 to add to the main list
# Filter rows where 'partly_cloudy' is in the tags and none of the excluded tags are present
rows_to_drop2 = tag6_part_cloud_df[tag6_part_cloud_df['tags'].apply(lambda x: 'partly_cloudy' in x and not any(tag in x for tag in excluded_tags))]

# Sample 79 rows from the filtered DataFrame
if len(rows_to_drop2) > 79:
    rows_to_drop2 = rows_to_drop2.sample(n=79, random_state=1)  # Using a fixed seed for reproducibility

# Drop these rows from the original DataFrame
clean_merged_df2 = tag6_part_cloud_df.drop(rows_to_drop2.index)

# Display the modified DataFrame
print(len(clean_merged_df2))


57


In [21]:
# Now we need to randomly select the rest of the images with a haze tag from the
# pool of images that have 5 tags total
# Filter the DataFrame to find rows where 'tag_count' is 5 and 'tags' contains 'haze'
tag5_haze_df = labels_df[(labels_df['tag_count'] == 5) & (labels_df['tags'].apply(lambda x: 'haze' in x))]

print(len(tag5_haze_df))


140


In [24]:
# Filter rows where 'haze' is in the tags and none of the excluded tags are present
rows_to_drop3 = tag5_haze_df[tag5_haze_df['tags'].apply(lambda x: 'haze' in x and not any(tag in x for tag in excluded_tags))]

# Sample 30 rows from the filtered DataFrame
if len(rows_to_drop3) > 108:
    rows_to_drop3 = rows_to_drop3.sample(n=108, random_state=1)  # Using a fixed seed for reproducibility

# Drop these rows from the original DataFrame
clean_merged_df3 = tag5_haze_df.drop(rows_to_drop3.index)

# Display the modified DataFrame
print(len(clean_merged_df3))

32


In [25]:
# Merge the final dataframes of all the selected photos. Result is 206 with
# a 68/68/69 split between clear, partly_cloudy, and haze while keeeping
# as many images with the highest number of tags as possible
combined_df = pd.concat([clean_merged_df3, clean_merged_df2, clean_merged_df], ignore_index=True)

print(len(combined_df))
print(combined_df)


206
          image_name                                               tags  \
0      train_518.jpg          [agriculture, haze, primary, road, water]   
1     train_2818.jpg          [agriculture, haze, primary, road, water]   
2     train_3715.jpg    [agriculture, cultivation, haze, primary, road]   
3     train_3931.jpg          [agriculture, haze, primary, road, water]   
4     train_3969.jpg   [agriculture, cultivation, haze, primary, water]   
..               ...                                                ...   
201  train_33452.jpg  [cultivation, habitation, haze, primary, road,...   
202  train_33860.jpg  [agriculture, habitation, haze, primary, road,...   
203  train_35249.jpg  [agriculture, cultivation, haze, primary, road...   
204  train_37481.jpg  [agriculture, cultivation, habitation, haze, p...   
205  train_38283.jpg  [agriculture, cultivation, haze, primary, road...   

                                          encoded_tags  tag_count  
0    [1, 0, 0, 0, 0, 0, 0, 

In [26]:
# Flatten the list of tags from these filtered images and count each tag
tag_counts_cdf = {}
for tags_list in combined_df['tags']:
    for tag in tags_list:
        if tag in tag_counts_cdf:
            tag_counts_cdf[tag] += 1
        else:
            tag_counts_cdf[tag] = 1

# Convert the dictionary to a list of tuples sorted by the number of occurrences, descending
sorted_tag_cdf = sorted(tag_counts_cdf.items(), key=lambda item: item[1], reverse=True)

# Print the sorted list of tags and their counts
for tag, count in sorted_tag_cdf:
    print(f"Tag: {tag}, Count: {count}")

Tag: primary, Count: 205
Tag: agriculture, Count: 185
Tag: road, Count: 184
Tag: habitation, Count: 158
Tag: cultivation, Count: 145
Tag: water, Count: 123
Tag: partly_cloudy, Count: 69
Tag: clear, Count: 69
Tag: haze, Count: 68
Tag: bare_ground, Count: 43
Tag: artisinal_mine, Count: 18
Tag: selective_logging, Count: 14
Tag: conventional_mine, Count: 12
Tag: slash_burn, Count: 12
Tag: blooming, Count: 3


In [None]:
# Now use the combined_dataframe to make copies of the images we selected
# and place them in a new folder in the google drive
source_dir = "/content/drive/Shareddrives/computer-vision-project/Data/Unzipped/planet/planet/train-jpg/"
destination_dir = "/content/drive/Shareddrives/computer-vision-project/Data/Unzipped/planet/planet/train_seg/"

if not os.path.exists(destination_dir):
    os.makedirs(destination_dir)


In [None]:
# Initialize a list to hold any images that couldnt be found
not_found_files = []

# Copy the files
for image_name in combined_df['image_name']:
    src_path = os.path.join(source_dir, image_name)
    dst_path = os.path.join(destination_dir, image_name)
    try:
        # Try to copy the file to the new directory
        shutil.copy(src_path, dst_path)
    except FileNotFoundError:
        # If the file is not found, append it to the list and continue
        not_found_files.append(image_name)

# Print the files that couldn't be found
if not_found_files:
    print("These files were not found and could not be copied:")
    for file in not_found_files:
        print(file)

These files were not found and could not be copied:
train_7818.jpg
train_8854.jpg
train_9236.jpg
train_8494.jpg
train_8623.jpg
train_9027.jpg
train_7961.jpg
train_8330.jpg
train_8944.jpg
train_9110.jpg
train_9528.jpg
train_9673.jpg
train_7848.jpg


In [27]:
# once all of the training segmentation images have been tagged in labelme
# or somewhere else(only need to tag primary) the code below will create a binary
# mask. The .yaml file will have 2 classes 1) primary 2)nonprimary

source_dir = "/content/drive/Shareddrives/computer-vision-project/Data/Unzipped/planet/planet/train_seg_tagged/"
output_mask_dir = os.path.join(source_dir, "masks")

# Ensure the output directory for masks exists
if not os.path.exists(output_mask_dir):
    os.makedirs(output_mask_dir)

# Assuming JSON and image files are in the same directory
json_dir = source_dir
image_dir = source_dir

# Process each JSON file
for json_filename in os.listdir(json_dir):
    if json_filename.endswith('.json'):
        json_path = os.path.join(json_dir, json_filename)
        image_filename = json_filename.replace('.json', '.jpg')  # Change extension according to your image files
        image_path = os.path.join(image_dir, image_filename)

        # Load JSON data
        with open(json_path, 'r') as file:
            data = json.load(file)

        # Create a blank mask
        mask = np.zeros((data['imageHeight'], data['imageWidth']), dtype=np.uint8)

        # Draw each polygon from the JSON onto the mask
        for shape in data['shapes']:
            if shape['shape_type'] == 'polygon':
                pts = np.array(shape['points'], np.int32)
                pts = pts.reshape((-1, 1, 2))
                cv2.fillPoly(mask, [pts], 255)  # Fill polygon with white on the mask

        # Save the mask to a file
        mask_path = os.path.join(output_mask_dir, image_filename.replace('.jpg', '_mask.png'))  # Save mask with a new name
        cv2.imwrite(mask_path, mask)


In [28]:
print(f"Processing {json_filename}...")
if os.path.exists(mask_path):
    print(f"Mask saved successfully: {mask_path}")
else:
    print(f"Failed to save mask for {image_filename}")


Processing masks...
Mask saved successfully: /content/drive/Shareddrives/computer-vision-project/Data/Unzipped/planet/planet/train_seg_tagged/masks/train_518_mask.png
