In [1]:
import numpy as np
import json 
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

Now we have 2 datasets: 

1. **all_revised**: all the samples found using the clustering
2. **overlapsOnly**: a subset of the first, containing only the samples whose bounding box overlaps that of another sunspot group.

We still have to filter out some edge cases, which is done in this notebook.

Edge cases:
The sample is located close to the limb and another sunspot group appears in the crop, even though their respective bounding boxes do not overlap.


The following cells must be run once for each of the 3 splits: train, val and test.

In [2]:
# ---- Settings ---------------------------------------------------------------
# Threshold compared against `angular_excentricity * 90` in the filtering cell;
# samples above it count as "close to the limb"
# (presumably expressed in degrees -- TODO confirm).
limb_limit = 60

# Split processed by this run: set to 'train', 'val' or 'test' and re-run the
# notebook once per split.
split = 'val'

# ---- Input locations --------------------------------------------------------
# Directory holding every sample (revised dataset).
all_dir = '../../datasets/classification/2002-2019_2/rebuttal_all_revised'
all_json = f'{all_dir}/dataset_revised.json'
all_npy = f'{all_dir}/all_samples_{split}.npy'

# Directory holding only the samples that overlap another group.
overlaps_dir = '../../datasets/classification/2002-2019_2/rebuttal_overlap_only'
overlap_json = f'{overlaps_dir}/dataset_overlapsOnly.json'
overlap_npy = f'{overlaps_dir}/all_samples_{split}.npy'

# Plan for the cells below: walk through every sample listed in all_json,
# note whether it also appears in overlap_json, test whether it lies beyond
# the limb limit and whether its mask contains another group. Samples that are
# both close to the limb and contain another group are excluded from the
# filtered output.

# ---- Load the JSON metadata (full dataset first, then the overlap subset) ----
loaded_json = []
for json_path in (all_json, overlap_json):
    with open(json_path, 'r') as f:
        loaded_json.append(json.load(f))
all_dict, overlap_dict = loaded_json

# ---- Load the per-sample arrays ----------------------------------------------
# Each .npy file stores one pickled dict, hence `.item()` to unwrap it.
# NOTE: allow_pickle=True can execute arbitrary code while unpickling, so only
# load files that come from a trusted source.
all_samples, overlap_samples = (
    np.load(npy_path, allow_pickle=True).item()
    for npy_path in (all_npy, overlap_npy)
)





In [3]:
# Flag every sample of the current split that is close to the limb AND whose
# crop contains pixels of another sunspot group.
#
# Reads (defined in the previous cells): all_dict, overlap_dict, all_samples,
#   split, limb_limit.
# Produces:
#   keys_to_remove -- list of the flagged sample keys (consumed by the next cell)
#   to_count       -- number of samples displayed (stays 0 unless `show` is True)

# Keys of the overlap-only dataset; a set gives O(1) membership tests
# instead of scanning a list for every sample.
overlap_keys = set(overlap_dict[split])

keys_to_remove = []
to_count = 0

# Set to True to plot every flagged sample for visual inspection.
show = False

for cur_key in tqdm(all_dict[split]):

    # Some samples are listed in the JSON but have no entry in the npy file.
    if cur_key not in all_samples:
        print(f'Skipping {cur_key}')
        continue

    cur_sample_npy = all_samples[cur_key]
    cur_sample_conf = cur_sample_npy['confidence_map']
    cur_sample_grp_conf = cur_sample_npy['group_confidence_map']
    cur_sample_ang_dist = cur_sample_npy['angular_excentricity']

    # The stored value is scaled by 90 before being compared to the threshold
    # (presumably a normalised angle converted to degrees -- TODO confirm).
    is_close_to_limb = bool(cur_sample_ang_dist * 90 > limb_limit)

    # Any pixel where the two maps differ means the crop contains something
    # that does not belong to the sample's own group. A direct comparison is
    # used instead of a subtraction so that boolean and unsigned-integer maps
    # are handled correctly as well.
    contains_other_group = bool(np.any(cur_sample_conf != cur_sample_grp_conf))

    # Only samples that are close to the limb and contain another group are removed.
    if not (is_close_to_limb and contains_other_group):
        continue

    keys_to_remove.append(cur_key)

    if show:
        to_count += 1

        # When there was a name conflict between overlapping and
        # non-overlapping samples, the key was saved with a trailing
        # underscore; strip it before looking the key up in the overlap set.
        lookup_key = cur_key[:-1] if cur_key.endswith('_') else cur_key
        is_in_overlap = lookup_key in overlap_keys

        p1 = "Limb" if is_close_to_limb else "NoLimb"
        p2 = "Other" if contains_other_group else "NoOther"
        p3 = "Overlap" if is_in_overlap else "NoOverlap"

        # Show both confidence maps next to the image.
        fig, ax = plt.subplots(1, 3, figsize=(15, 5))
        ax[0].imshow(cur_sample_conf)
        ax[1].imshow(cur_sample_grp_conf)
        ax[2].imshow(cur_sample_npy['image'], cmap='gray', interpolation='None')
        fig.suptitle(f'{cur_key} : {p1} / {p2} / {p3} ')
        fig.tight_layout()
        # plt.show() renders with the inline backend; fig.show() only emits a
        # "non-GUI backend" warning there and leaves the figures open.
        plt.show()

print(len(keys_to_remove))
print(to_count)


100%|██████████| 774/774 [00:00<00:00, 3348.79it/s]

Skipping UPH20050122094506_0
Skipping UPH20070505080705_0
Skipping UPH20030319094901_0
Skipping UPH20030317135451_0
Skipping UPH20051201123624_0
34
0





In [4]:
# Write the filtered dataset for the current split: every sample that has an
# entry in all_samples and was not flagged in keys_to_remove.
#
# Reads (defined in the previous cells): all_dict, all_samples, split,
#   keys_to_remove.
# Writes: <out_dir>/dataset_filtered_<split>.json and
#   <out_dir>/all_samples_<split>.npy

out_dir = '../../datasets/classification/2002-2019_2/sunscc_all_revised_filtered'

# Create the output directory; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(out_dir, exist_ok=True)

# Set copy of the flagged keys for O(1) lookups inside the loop.
removed_keys = set(keys_to_remove)

# Metadata and arrays of the samples that are kept.
new_dict = {}
new_npy = {}

for cur_key in tqdm(all_dict[split]):
    # Samples without an npy entry cannot be exported.
    if cur_key not in all_samples:
        print(f'Skipping {cur_key}')
        continue

    if cur_key in removed_keys:
        continue

    new_dict[cur_key] = all_dict[split][cur_key]
    new_npy[cur_key] = all_samples[cur_key]

# Dump the kept metadata; note the file holds the samples of this split only
# (a flat key -> sample mapping, not nested under the split name).
out_json = f'{out_dir}/dataset_filtered_{split}.json'
with open(out_json, 'w') as f:
    json.dump(new_dict, f)

print( "ok" )

# np.save appends the '.npy' extension and pickles the dict, so it has to be
# read back with np.load(..., allow_pickle=True).item().
np.save(f'{out_dir}/all_samples_{split}', new_npy)
    



100%|██████████| 774/774 [00:00<00:00, 618831.74it/s]

Skipping UPH20050122094506_0
Skipping UPH20070505080705_0
Skipping UPH20030319094901_0
Skipping UPH20030317135451_0
Skipping UPH20051201123624_0





ok
