In [1]:
import os
import numpy as np
import glob
import json

from copy import deepcopy

import sunscc.utils.clustering.clustering_utilities as c_utils

import importlib
importlib.reload(c_utils)

from tqdm.notebook import tqdm


%matplotlib ipympl

%load_ext autoreload
%autoreload 2



In [2]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to native Python types.

    Use it as ``json.dump(obj, f, cls=NpEncoder)``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        NumPy integers become ``int``, NumPy floats become ``float``, arrays
        become (nested) lists and ``np.bool_`` becomes ``bool``.

        Raises:
            TypeError: raised by the base class for any other unsupported type.
        """
        # np.integer is the base class of every NumPy integer width, np.int64
        # included, so no separate np.int64 branch is needed.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.bool_):
            return bool(obj)
        # Anything else is delegated to the base implementation.
        return super().default(obj)

In [3]:
# Dataset root used to build the input paths below.
root_dir = '../../datasets/classification/2002-2019_2'

# Folders holding the outputs of the two parameter-optimisation phases.
param_optim_p1_folder = root_dir + '/param_optimization'
param_optim_p2_folder = root_dir + '/param_optimP2'

db_dict_filepath = root_dir+'/wl_list2dbGroups_Classification.json'

# Start from an empty dict, then replace it with the JSON content of the file.
db_dict = {}
with open(db_dict_filepath, 'r') as db_file:
    db_dict = json.loads(db_file.read())

# Number of overlaps

In [4]:
# Parameter set whose results are analysed in this notebook.
# Alternative run: 'cur_dict_2002-19_dist0.1_Lon0.1_lat0.1_iter20.json'
fn = 'cur_dict_2002-19_dist0.1_Lon0.35_lat0.08_iter20.json'
print(fn)

# The file is read from the phase-1 parameter-optimisation folder.
cur_huge_dict_filename = os.path.join(param_optim_p1_folder, fn)
with open(cur_huge_dict_filename, 'r') as huge_dict_file:
    cur_huge_dict = json.load(huge_dict_file)

cur_dict_2002-19_dist0.1_Lon0.35_lat0.08_iter20.json


In [5]:
# Count, over all images, how many database bounding boxes are flagged as
# intersecting another box of the same image.
total_bbox = 0
total_overlap = 0

for basename in tqdm(list(cur_huge_dict.keys())):
    # Only the database groups of the image are needed for this statistic.
    group_list = cur_huge_dict[basename]['db']
    db_bboxes = [np.array(item['bbox_wl']) for item in group_list]

    # A box counts as isolated when the helper reports 0 for it
    # (presumably the number of boxes it intersects - confirm in c_utils).
    isolated_bboxes_bool = np.array(c_utils.get_intersecting_db_bboxes(db_bboxes)) == 0
    num_isolated = int(np.count_nonzero(isolated_bboxes_bool))

    total_bbox += len(db_bboxes)
    total_overlap += len(db_bboxes) - num_isolated

print(total_bbox, total_overlap)
print(total_bbox - total_overlap)
# Guard the ratio so that an empty input does not end the cell with a ZeroDivisionError.
print(total_overlap / total_bbox if total_bbox else float('nan'))

  0%|          | 0/2969 [00:00<?, ?it/s]

6528 732
5796
0.11213235294117647


# PHASE 1: Build the dataset

In [6]:
# Per-image output file, read from the phase-2 parameter-optimisation folder.
fn = 'image_outdict_dist0.1_Lon0.35_lat0.08_iter20.json'

cur_image_out_dict_filename = os.path.join(param_optim_p2_folder, fn)
with open(cur_image_out_dict_filename, 'r') as image_out_file:
    image_out_dict = json.load(image_out_file)

In [7]:
# 1) analyse the distributions

classes = ['A','B','C','D','E','F','G','H','I','J','X']

# Per-class counter, flat {group id: infos} index and per-class {group id: infos} index.
distribs = dict.fromkeys(classes, 0)
group_types = {}
group_types2 = {zurich_class: {} for zurich_class in classes}

# Fields copied from the image-level dict, then from each group dict, in output order.
image_level_fields = ('angle', 'deltashapeX', 'deltashapeY')
group_level_fields = (
    'centroid_px', 'centroid_Lat', 'centroid_Lon',
    'members', 'members_mean_px',
    'angular_excentricity_rad', 'angular_excentricity_deg',
    'Zurich', 'McIntosh',
)

for image_name, image_infos in tqdm(image_out_dict.items()):
    # Images without a 'groups' entry contribute nothing.
    for group_idx, group in enumerate(image_infos.get('groups', ())):
        zurich_class = group["Zurich"]
        distribs[zurich_class] += 1

        group_infos = {field: image_infos[field] for field in image_level_fields}
        group_infos.update((field, group[field]) for field in group_level_fields)

        # Group id = image name + position of the group inside the image.
        group_id = f"{image_name}_{group_idx}"
        # The same dict object is registered in both indexes.
        group_types[group_id] = group_infos
        group_types2[zurich_class][group_id] = group_infos

print(distribs)
print()
# print(group_types2)

  0%|          | 0/2969 [00:00<?, ?it/s]

{'A': 605, 'B': 714, 'C': 1039, 'D': 1012, 'E': 240, 'F': 41, 'G': 148, 'H': 383, 'I': 0, 'J': 1040, 'X': 21}



In [8]:
# Deep copies, so that the removals performed on these working dicts further down
# do not modify group_types / group_types2.
group_based_dataset = deepcopy(group_types)
group_based_dataset2 = deepcopy(group_types2)

In [9]:
# Lookup tables from each of the three McIntosh letters to its coarser super-class.
# First letter: D, E and F are merged into a single "SuperGroup".
First2superFirst = {
    "A": "A",
    "B": "B",
    "C": "C",
    "D": "SuperGroup",
    "E": "SuperGroup",
    "F": "SuperGroup",
    "H": "H",
    "X": "X",
}
# Second letter: s/h are merged into "sym", a/k into "asym".
Second2superSecond = {
    "x": "x",
    "r": "r",
    "s": "sym",
    "h": "sym",
    "a": "asym",
    "k": "asym",
}
# Third letter: i/c are merged into "frag".
Third2superThird = {
    "x": "x",
    "o": "o",
    "i": "frag",
    "c": "frag",
}


def add_superclasses(group_dict):
    """Return a deep copy of ``group_dict`` with an added "SuperClass" entry.

    The entry maps "1", "2" and "3" to the super-class of the first, second
    and third character of ``group_dict["McIntosh"]``. The input dict is left
    untouched.

    Raises:
        KeyError: if a character has no entry in the corresponding table.
        IndexError: if the McIntosh string has fewer than three characters.
    """
    cpy = deepcopy(group_dict)
    mcintosh = group_dict["McIntosh"]

    # Letters are resolved one at a time, first to third.
    super_class = {}
    lookup_tables = (First2superFirst, Second2superSecond, Third2superThird)
    for position, table in enumerate(lookup_tables):
        super_class[str(position + 1)] = table[mcintosh[position]]

    cpy["SuperClass"] = super_class
    return cpy
    
    

# Annotate every group with its super-classes; groups whose McIntosh code cannot
# be mapped are reported, and those with a blank code are queued for removal.
grp_to_remove = []
group_based_dataset_superclasses = {}
for group_id, group_infos in tqdm(group_based_dataset.items()):
    try:
        annotated_group = add_superclasses(group_infos)
    except KeyError:
        print(group_id)
        print(group_infos)
        if group_infos["McIntosh"] == '   ':
            print( "error")
            grp_to_remove.append((group_id, group_infos['Zurich']))
    else:
        group_based_dataset_superclasses[group_id] = annotated_group

# Drop the queued groups from both working copies (flat index and per-Zurich-class index).
for group_id, zurich_class in grp_to_remove:
    del group_based_dataset[group_id]
    del group_based_dataset2[zurich_class][group_id]

# group_based_dataset_superclasses

    

  0%|          | 0/5243 [00:00<?, ?it/s]

UPH20040722080136_1
{'angle': 6.950210373029591, 'deltashapeX': 226, 'deltashapeY': 226, 'centroid_px': [1763.984688376337, 789.7939222567339], 'centroid_Lat': -0.19138280535545324, 'centroid_Lon': 0.8858795170464564, 'members': [[789.1715265866209, 1766.2315608919382], [805.4, 1705.56], [804.0, 1700.5]], 'members_mean_px': [799.5238421955404, 1724.0971869639795], 'angular_excentricity_rad': 0.8332981619101842, 'angular_excentricity_deg': 47.74446775346269, 'Zurich': 'G', 'McIntosh': '   '}
error


UPH20111104083629_2
{'angle': 23.9656539855509, 'deltashapeX': 698, 'deltashapeY': 698, 'centroid_px': [278.653666143603, 1271.799855625567], 'centroid_Lat': 0.3251877192454918, 'centroid_Lon': 1.8424379034316671, 'members': [[1259.2236979166667, 292.2916666666667], [1242.1, 242.68333333333334], [1244.75, 213.5], [1278.3066933066932, 201.21578421578423], [1305.9180487804879, 240.3180487804878], [1297.0, 280.1666666666667], [1304.952380952381, 278.3333333333333], [1309.0969387755101, 294.5408163265306]], 'members_mean_px': [1280.1684699664675, 255.38120616535033], 'angular_excentricity_rad': 1.1306380698466285, 'angular_excentricity_deg': 64.7807895590294, 'Zurich': 'E', 'McIntosh': '   '}
error
UPH20111104083629_3
{'angle': 23.9656539855509, 'deltashapeX': 698, 'deltashapeY': 698, 'centroid_px': [452.9394902751303, 708.7341490132685], 'centroid_Lat': -0.24894683817332025, 'centroid_Lon': 2.1239463299682932, 'members': [[682.0751028806584, 487.61831275720164], [691.4444444444445, 392.6

### Split per type

In [10]:
# Rebuild the per-Zurich-class index from the super-class-annotated groups.
distribs2 = dict.fromkeys(classes, 0)
group_types2 = {zurich_class: {} for zurich_class in classes}

for grp_id in tqdm(group_based_dataset_superclasses):
    grp_dict = group_based_dataset_superclasses[grp_id]
    group_types2[grp_dict["Zurich"]][grp_id] = grp_dict
        
# group_types2

  0%|          | 0/5240 [00:00<?, ?it/s]

### Step 2:  Split groups among train, val, test sets

In [11]:
import random


def splitPerc(l, perc):
    """Split ``l`` into consecutive chunks sized according to ``perc``.

    Args:
        l: sequence or array to split (passed to ``np.split``).
        perc: percentages of each chunk, in order; they must sum to 100.

    Returns:
        A list of ``len(perc)`` arrays. The last chunk takes whatever is left
        after the previous ones.

    Raises:
        ValueError: if ``perc`` is empty or does not add up to 100.
    """
    # Turn percentages into cumulative fractions between 0 and 1.
    cumulative = np.cumsum(perc) / 100.

    # Compare with a tolerance: float percentages can miss 1.0 by one rounding
    # error even though they nominally add up to 100.
    if cumulative.size == 0 or not np.isclose(cumulative[-1], 1.0, rtol=0.0, atol=1e-9):
        raise ValueError("percents don't add up to 100")

    # The last boundary is not needed, then fractions are scaled to positions in l.
    boundaries = cumulative[:-1] * len(l)

    # CAUTION: numpy rounds to the closest EVEN number when a value is halfway
    # between two integers, so 0.5 becomes 0 and 1.5 becomes 2.
    boundaries = boundaries.round().astype(int)

    return np.split(l, boundaries)

splits = ['train', 'val', 'test']

splits_percentages = [70, 15, 15]

assert np.array(splits_percentages).sum() == 100

# Fixed seed and a dedicated generator, so that Restart & Run All regenerates the
# same train/val/test assignment instead of a new random one on every run.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

group_based_dataset_superclasses_splits = {sp: {} for sp in splits}

# Each Zurich class is shuffled and split on its own, so the percentages are
# applied per class rather than on the pooled groups.
for type_dict in group_types2.values():
    list_type_groups = list(type_dict.keys())
    split_rng.shuffle(list_type_groups)

    # splitPerc works on positions; they are mapped back to group ids below.
    indices = np.arange(len(list_type_groups))
    index_chunks = splitPerc(indices, splits_percentages)

    for sp, split_indices in zip(splits, index_chunks):
        for j in split_indices:
            g = list_type_groups[j]
            group_based_dataset_superclasses_splits[sp][g] = type_dict[g]

# Sanity check: every group ended up in exactly one split.
assert (
    sum(len(split_groups) for split_groups in group_based_dataset_superclasses_splits.values())
    == sum(len(type_dict) for type_dict in group_types2.values())
)
            

In [12]:
out_subdir = 'sunscc'
final_json = "dataset_nosplits"
final_json_split = "dataset_final"

# exist_ok=True replaces the check-then-create sequence and keeps the cell re-runnable.
os.makedirs(f'{root_dir}/{out_subdir}', exist_ok=True)

# All groups in one flat dict (no split information).
with open(f'{root_dir}/{out_subdir}/{final_json}.json', 'w') as f:
    json.dump(group_based_dataset_superclasses, f, cls=NpEncoder)

# Same groups, organised as {'train': ..., 'val': ..., 'test': ...}.
with open(f'{root_dir}/{out_subdir}/{final_json_split}.json', 'w') as f:
    json.dump(group_based_dataset_superclasses_splits, f, cls=NpEncoder)

In [13]:
# One JSON file per split, next to the combined files.
for split in splits:
    split_groups = group_based_dataset_superclasses_splits[split]
    split_path = f'{root_dir}/{out_subdir}/{split}.json'
    with open(split_path, 'w') as split_file:
        json.dump(split_groups, split_file, cls=NpEncoder)

# PHASE 2: Splitting Overlaps from Isolated

In [14]:
# NOTE(review): Phase 1 above writes dataset_nosplits.json under .../sunscc/, while
# the "new" dataset is read here from .../rebuttal/ - confirm this is the intended file.
old_path = '../../datasets/classification/2002-2019_2/test/dataset_nosplits.json'
new_path = '../../datasets/classification/2002-2019_2/rebuttal/dataset_nosplits.json'

# Load the two datasets that are compared in the next cell.
with open(old_path, 'r') as old_file:
    old_dict = json.load(old_file)
with open(new_path, 'r') as new_file:
    new_dict = json.load(new_file)
    

In [15]:
# Find the entries that are in the new dict but not in the old one.
# Comparing the keys is not sufficient; two entries are considered the same when:
# - the part of the key before the first underscore is identical, and
# - the "centroid_Lon" and "centroid_Lat" fields of the values are equal.

# Group the old keys by prefix once, instead of rescanning every old key for
# each new entry.
old_keys_by_prefix = {}
for old_k in old_dict:
    old_keys_by_prefix.setdefault(old_k.split('_')[0], []).append(old_k)

new_entries = {}

for cur_k, cur_v in new_dict.items():
    first_part = cur_k.split('_')[0]

    centroid_lon = cur_v['centroid_Lon']
    centroid_lat = cur_v['centroid_Lat']

    # Only old entries sharing the prefix can match. Centroids are compared for
    # exact equality, as before.
    already_in_old = any(
        old_dict[old_k]['centroid_Lon'] == centroid_lon
        and old_dict[old_k]['centroid_Lat'] == centroid_lat
        for old_k in old_keys_by_prefix.get(first_part, ())
    )
    if not already_in_old:
        new_entries[cur_k] = cur_v

num_new_entries = len(new_entries)
print("num_new_entries: ",num_new_entries)




num_new_entries:  1371


In [16]:
# Dump the entries that are only present in the new dict.
new_overlap_dir = '../../datasets/classification/2002-2019_2/sunscc_overlap_only'
out_file = new_overlap_dir + '/dataset_nosplits.json'

# exist_ok=True replaces the check-then-create sequence and keeps the cell re-runnable.
os.makedirs(new_overlap_dir, exist_ok=True)

with open(out_file, 'w') as f:
    json.dump(new_entries, f)

#### Separate train val and test splits

In [17]:
# Reference dataset (flat dict) and the split dataset written in Phase 1.
old_path = '../../datasets/classification/2002-2019_2/test/dataset_nosplits.json'
new_path = '../../datasets/classification/2002-2019_2/sunscc/dataset_final.json'

split = 'test'

# Load both files.
with open(old_path, 'r') as old_file:
    old_dict = json.load(old_file)
with open(new_path, 'r') as new_file:
    new_dict = json.load(new_file)

In [18]:
# Report the size of each split and the overall number of groups.
total = 0
for split in ('train', 'val', 'test'):
    split_size = len(new_dict[split])
    print("split: ", split, " len: ", split_size)
    total = total + split_size
print("total: ", total)

split:  train  len:  3669
split:  val  len:  785
split:  test  len:  786
total:  5240


In [19]:
output = {}

# Output folder of this cell; created here so the cell does not depend on an
# earlier cell having made it.
overlap_dir = '../../datasets/classification/2002-2019_2/sunscc_overlap_only'
os.makedirs(overlap_dir, exist_ok=True)

# Group the old keys by prefix once, instead of rescanning every old key for
# each new entry of each split.
old_keys_by_prefix = {}
for old_k in old_dict:
    old_keys_by_prefix.setdefault(old_k.split('_')[0], []).append(old_k)

for split in ['train','val','test']:
    new_dict_split = new_dict[split]

    # Find the entries that are in the new dict but not in the old one.
    # Comparing the keys is not sufficient; two entries are considered the same when:
    # - the part of the key before the first underscore is identical, and
    # - the "centroid_Lon" and "centroid_Lat" fields of the values are equal.
    new_entries = {}

    for cur_k, cur_v in new_dict_split.items():
        first_part = cur_k.split('_')[0]

        centroid_lon = cur_v['centroid_Lon']
        centroid_lat = cur_v['centroid_Lat']

        # Only old entries sharing the prefix can match. Centroids are compared
        # for exact equality, as before.
        already_in_old = any(
            old_dict[old_k]['centroid_Lon'] == centroid_lon
            and old_dict[old_k]['centroid_Lat'] == centroid_lat
            for old_k in old_keys_by_prefix.get(first_part, ())
        )
        if not already_in_old:
            new_entries[cur_k] = cur_v

    num_new_entries = len(new_entries)
    print(f"{split} -> num_new_entries: ",num_new_entries)

    # Dump the new entries of this split.
    out_file = f'{overlap_dir}/overlaps_{split}.json'
    with open(out_file, 'w') as f:
        json.dump(new_entries, f)

    output[split] = new_entries

# Dump all splits together.
out_file = f'{overlap_dir}/dataset_overlapsOnly.json'
with open(out_file, 'w') as f:
    json.dump(output, f)


train -> num_new_entries:  967
val -> num_new_entries:  192
test -> num_new_entries:  212


# Find Difference between overlapOnly and Total

In [20]:
# Reference dataset (flat dict) and the split dataset written in Phase 1.
old_path = '../../datasets/classification/2002-2019_2/test/dataset_nosplits.json'
new_path = '../../datasets/classification/2002-2019_2/sunscc/dataset_final.json'
with open(old_path, 'r') as old_file:
    old_dict = json.load(old_file)
with open(new_path, 'r') as new_file:
    new_dict = json.load(new_file)

# The two split datasets that are fused in the next cell.
nooverlap_file = '../../datasets/classification/2002-2019_2/test/dataset_final.json'
overlapsonly_file = '../../datasets/classification/2002-2019_2/sunscc_overlap_only/dataset_overlapsOnly.json'

with open(nooverlap_file, 'r') as nooverlap_fh:
    nooverlap_dict = json.load(nooverlap_fh)

with open(overlapsonly_file, 'r') as overlapsonly_fh:
    overlapsonly_dict = json.load(overlapsonly_fh)


In [21]:
# Each split maps group ids to group dicts, so the placeholders are dicts too.
final_dict = {'train': {}, 'val': {}, 'test': {}}

for split in ['train', 'val', 'test']:
    num_NoOverlap = len(nooverlap_dict[split])
    num_OverlapsOnly = len(overlapsonly_dict[split])

    print('Number of samples in NoOverlap %s: %d' % (split, num_NoOverlap))
    print('Number of samples in OverlapsOnly %s: %d' % (split, num_OverlapsOnly))
    print('Number of samples in Total %s: %d' % (split, num_NoOverlap + num_OverlapsOnly))
    print("---------")

    # Shallow copy: the fused dict starts from the no-overlap entries.
    fusion = nooverlap_dict[split].copy()

    for key, value in overlapsonly_dict[split].items():
        # The same key can exist in both datasets; a plain update() would then
        # drop the no-overlap entry. Append '_' until the key is free, so that
        # no entry is ever overwritten.
        fused_key = key
        while fused_key in fusion:
            fused_key += '_'
        fusion[fused_key] = value

    print('Number of samples in Fusion %s: %d' % (split, len(fusion)))
    print("---------")
    print("---------")

    final_dict[split] = fusion




Number of samples in NoOverlap train: 2722
Number of samples in OverlapsOnly train: 967
Number of samples in Total train: 3689
---------
Number of samples in Fusion train: 3689
---------
---------
Number of samples in NoOverlap val: 583
Number of samples in OverlapsOnly val: 192
Number of samples in Total val: 775
---------
Number of samples in Fusion val: 775
---------
---------
Number of samples in NoOverlap test: 583
Number of samples in OverlapsOnly test: 212
Number of samples in Total test: 795
---------
Number of samples in Fusion test: 795
---------
---------


In [24]:
import os
out_dir = '../../datasets/classification/2002-2019_2/sunscc_all_revised'

# exist_ok=True replaces the check-then-create sequence and keeps the cell re-runnable.
os.makedirs(out_dir, exist_ok=True)

# Dump the fused dataset with all splits in one file.
with open(f'{out_dir}/dataset_revised.json', 'w') as f:
    json.dump(final_dict, f)

# Dump one file per split as well.
for s in final_dict:
    with open(f'{out_dir}/dataset_revised_{s}.json', 'w') as f:
        json.dump(final_dict[s], f)

In [23]:
# rebuttal_out_dir = f'../../datasets/classification/2002-2019_2/rebuttal_all_revised'
# # dump final_dict
# with open(f'{rebuttal_out_dir}/dataset_revised.json', 'r') as f:
#     tmp_load = json.load(f)

#     for s in tmp_load:
#         # dump final_dict
#         with open(f'{rebuttal_out_dir}/dataset_revised_{s}.json', 'w') as f:
#             json.dump(tmp_load[s], f)

This is the end of Notebook Step 0; now go to Notebook Step 1.