In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/COMS\ E6998\ PDL Sys Perf/Project/

/content/drive/MyDrive/COMS E6998 PDL Sys Perf/Project


# Notebook to classify post-disaster images using damage levels
- Save all feature labels per image (df)
- Create dummy columns to get indicators of each damage level (df2)
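- Sum the indicator columns per image and label each image with the damage level that has the highest count; tied levels are kept together as a comma-separated label (df3)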

# Final code to generate labels

In [None]:
import json
import os
import pandas as pd

def get_max_label(row):
    """Return the damage level(s) holding the largest count in `row`.

    :param row: Mapping-like object (e.g. a DataFrame row) indexable by each
    of the six damage-level names.
    :return: The name of the level with the highest count. When several
    levels tie for the highest count, all of them are returned, joined by
    ", " in the fixed order used below.
    """
    damage_levels = ('no-damage', 'minor-damage', 'major-damage', 'destroyed', 'un-classified', 'none')
    counts = {level: row[level] for level in damage_levels}

    # Keep every level that reaches the top count, not just the first one
    highest = max(counts.values())
    winners = [level for level, count in counts.items() if count == highest]
    return ", ".join(winners)

def _feature_records_for_image(file_name, data):
    """Build one record (dict) per labeled feature of a single label JSON.

    :param file_name: Name of the label JSON file the data was read from.
    :param data: Parsed label JSON; reads `data['metadata']` and
    `data['features']['lng_lat']`.
    :return: List of dicts, one per feature. An image without features yields
    a single placeholder record whose `feature_type` and `damage_level` are
    both 'none', so the image still appears in the per-image summary.
    """
    shared = {
        'file_name': file_name,
        'disaster': data['metadata']['disaster'],
        'disaster_type': data['metadata']['disaster_type'],
    }
    features = data['features']['lng_lat']

    # If image has no features (buildings, etc.)
    if not features:
        return [dict(shared, feature_index=0, feature_type='none', damage_level='none')]

    return [
        dict(
            shared,
            feature_index=idx,
            feature_type=feature['properties']['feature_type'],
            damage_level=feature['properties']['subtype'],
        )
        for idx, feature in enumerate(features)
    ]


def save_image_damage_levels_and_labels(img_dir, verbose_idx=-1, stop_idx=-1):
    """Reads image labels in and saves two CSVs built from three dfs:
    df: Save all feature labels per image (written to
    `<img_dir>/<img_dir>_post_disaster_images_damage_levels.csv.gz`).
    df2: Create dummy columns to get indicators of each damage level.
    df3: Sum up all the indicator columns per image and take the maximum
    (written to `<img_dir>/<img_dir>_post_disaster_images_max_labels.csv.gz`).

    Only label files whose name contains 'post_disaster' are read. Files are
    visited in sorted name order so that the indices used by `verbose_idx`
    and `stop_idx` are reproducible between runs.

    :param img_dir: Can be any of the following: ["train", "test", "hold"].
    :param verbose_idx: Will print index of image after every `verbose_idx` images.
    :param stop_idx: Index of image at which to stop running the script (just
    used for testing, otherwise left alone). The file at `stop_idx` is still
    processed; nothing after it is.
    :return: df3, one row per image with a count column per damage level and
    a `final_label` column.
    """
    damage_levels = ['destroyed', 'major-damage', 'minor-damage', 'no-damage', 'un-classified', 'none']
    record_columns = ['file_name', 'feature_index', 'disaster', 'disaster_type', 'feature_type', 'damage_level']

    labels_dir = img_dir + '/' + 'labels/'
    records = []
    for img_json_idx, img_label_json_file_name in enumerate(sorted(os.listdir(labels_dir))):

        # Checked before the post-disaster filter so the stop still triggers
        # when the file at `stop_idx` itself is skipped.
        if stop_idx > 0 and img_json_idx > stop_idx:
            break

        # Print image index after verbose_idx number of images
        if verbose_idx > 0 and img_json_idx % verbose_idx == 0:
            print(img_json_idx)

        # Make sure it's a post-disaster image
        if 'post_disaster' not in img_label_json_file_name:
            continue

        with open(labels_dir + img_label_json_file_name) as f:
            data = json.load(f)

        records.extend(_feature_records_for_image(img_label_json_file_name, data))

    # Build the frame once from all records instead of appending row by row
    # (row-wise DataFrame.append is quadratic and no longer exists in pandas 2).
    df = pd.DataFrame(records, columns=record_columns)
    df.to_csv(f'{img_dir}/{img_dir}_post_disaster_images_damage_levels.csv.gz', compression='gzip', index=False)

    # reindex guarantees a column for every damage level, even one that never
    # occurs in this directory, so the column selection below cannot fail.
    indicators = pd.get_dummies(df['damage_level'], dtype=int).reindex(columns=damage_levels, fill_value=0)
    df2 = pd.concat([df, indicators], axis=1)

    df3 = df2.groupby(['file_name', 'disaster', 'disaster_type'])[damage_levels].sum().reset_index()
    # Plain list instead of DataFrame.apply so an empty frame gets an empty column
    df3['final_label'] = [get_max_label(row) for row in df3.to_dict('records')]
    df3.to_csv(f'{img_dir}/{img_dir}_post_disaster_images_max_labels.csv.gz', compression='gzip', index=False)

    return df3

In [None]:
def check_all_images_in_folder(img_dir):
    """Collect the image base names known to the labels CSV and to the image folder.

    The caller compares the two returned sets to confirm that every labeled
    image has a file in `<img_dir>/post-disaster-images/` (and vice versa).

    :param img_dir: Directory holding both the max-labels CSV and the
    `post-disaster-images` folder.
    :return: Tuple `(json_img_names, png_img_names)` of sets; each name is the
    part of the file name before its first '.'.
    """
    # Base names recorded in the labels dataset
    labels_csv = f'{img_dir}/{img_dir}_post_disaster_images_max_labels.csv.gz'
    labeled_file_names = pd.read_csv(labels_csv, low_memory=False)['file_name'].unique()
    json_img_names = {name.split('.')[0] for name in labeled_file_names}

    # Base names of the files actually present in the images folder
    png_img_names = {name.split('.')[0] for name in os.listdir(img_dir + '/post-disaster-images/')}

    return json_img_names, png_img_names

In [None]:
os.getcwd()

'/content/drive/MyDrive/COMS E6998 PDL Sys Perf/Project'

In [None]:
# Absolute path (under the '/content/drive' mount point) so this cell works
# no matter what the current working directory is, and can be safely re-run.
MY_PATH = "/content/drive/MyDrive/COMS E6998 PDL Sys Perf/Project/"
os.chdir(MY_PATH)

In [None]:
os.getcwd()

'/content/drive/MyDrive/COMS E6998 PDL Sys Perf/Project'

## Data from downloaded "test" post-disaster images folder
- Not really using these images as our test data; this is just the data we decided to download first
- 476 "no-damage" images, 107 "destroyed" images

In [None]:
test_imgs_max_labels = save_image_damage_levels_and_labels(img_dir='test', verbose_idx=100)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800




In [None]:
test_imgs_max_labels.head()

Unnamed: 0,file_name,disaster,disaster_type,destroyed,major-damage,minor-damage,no-damage,un-classified,none,final_label
0,guatemala-volcano_00000003_post_disaster.json,guatemala-volcano,volcano,1.0,0.0,2.0,0.0,0.0,0.0,minor-damage
1,guatemala-volcano_00000005_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,0.0,0.0,1.0,0.0,un-classified
2,guatemala-volcano_00000009_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,0.0,3.0,1.0,0.0,no-damage
3,guatemala-volcano_00000011_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,1.0,0.0,0.0,0.0,minor-damage
4,guatemala-volcano_00000021_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,0.0,23.0,0.0,0.0,no-damage


In [None]:
test_imgs_max_labels['final_label'].value_counts()

no-damage                             476
none                                  181
destroyed                             107
major-damage                           81
minor-damage                           42
un-classified                          22
no-damage, destroyed                    8
major-damage, destroyed                 5
minor-damage, major-damage              3
no-damage, minor-damage                 2
no-damage, un-classified                2
destroyed, un-classified                2
major-damage, un-classified             1
no-damage, major-damage, destroyed      1
Name: final_label, dtype: int64

In [None]:
test_json_img_names, test_png_img_names = check_all_images_in_folder(img_dir='test')

In [None]:
test_json_img_names == test_png_img_names

True

## Data from downloaded "hold" post-disaster images folder

In [None]:
hold_imgs_max_labels = save_image_damage_levels_and_labels(img_dir='hold', verbose_idx=100)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800




In [None]:
hold_imgs_max_labels.head()

Unnamed: 0,file_name,disaster,disaster_type,destroyed,major-damage,minor-damage,no-damage,un-classified,none,final_label
0,guatemala-volcano_00000004_post_disaster.json,guatemala-volcano,volcano,4.0,8.0,6.0,2.0,0.0,0.0,major-damage
1,guatemala-volcano_00000012_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,0.0,33.0,0.0,0.0,no-damage
2,guatemala-volcano_00000014_post_disaster.json,guatemala-volcano,volcano,1.0,5.0,0.0,0.0,1.0,0.0,major-damage
3,guatemala-volcano_00000020_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,8.0,27.0,1.0,0.0,no-damage
4,guatemala-volcano_00000022_post_disaster.json,guatemala-volcano,volcano,2.0,1.0,0.0,4.0,0.0,0.0,no-damage


In [None]:
hold_imgs_max_labels['final_label'].value_counts()

no-damage                                 437
none                                      170
destroyed                                 125
major-damage                               92
minor-damage                               50
un-classified                              25
no-damage, un-classified                    5
major-damage, destroyed                     5
no-damage, minor-damage                     4
minor-damage, destroyed                     4
no-damage, destroyed                        4
major-damage, un-classified                 3
minor-damage, major-damage                  3
destroyed, un-classified                    3
no-damage, major-damage, destroyed          1
no-damage, major-damage, un-classified      1
no-damage, destroyed, un-classified         1
Name: final_label, dtype: int64

In [None]:
hold_imgs_max_labels[hold_imgs_max_labels['final_label'] == 'destroyed'].head()

Unnamed: 0,file_name,disaster,disaster_type,destroyed,major-damage,minor-damage,no-damage,un-classified,none,final_label
104,hurricane-florence_00000440_post_disaster.json,hurricane-florence,flooding,4.0,0.0,0.0,0.0,0.0,0.0,destroyed
130,hurricane-harvey_00000031_post_disaster.json,hurricane-harvey,flooding,7.0,6.0,0.0,0.0,0.0,0.0,destroyed
134,hurricane-harvey_00000050_post_disaster.json,hurricane-harvey,flooding,3.0,0.0,0.0,0.0,0.0,0.0,destroyed
147,hurricane-harvey_00000124_post_disaster.json,hurricane-harvey,flooding,13.0,3.0,0.0,0.0,0.0,0.0,destroyed
246,hurricane-matthew_00000123_post_disaster.json,hurricane-matthew,wind,32.0,17.0,15.0,6.0,13.0,0.0,destroyed


In [None]:
hold_json_img_names, hold_png_img_names = check_all_images_in_folder(img_dir='hold')

In [None]:
hold_json_img_names == hold_png_img_names

True

## Data from downloaded "train" post-disaster images folder

In [None]:
train_imgs_max_labels = save_image_damage_levels_and_labels(img_dir='train', verbose_idx=100)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500




In [None]:
train_imgs_max_labels.head()

Unnamed: 0,file_name,disaster,disaster_type,destroyed,major-damage,minor-damage,no-damage,un-classified,none,final_label
0,guatemala-volcano_00000000_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,0.0,10.0,0.0,0.0,no-damage
1,guatemala-volcano_00000001_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,4.0,0.0,0.0,0.0,minor-damage
2,guatemala-volcano_00000002_post_disaster.json,guatemala-volcano,volcano,1.0,0.0,0.0,0.0,0.0,0.0,destroyed
3,guatemala-volcano_00000006_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,0.0,97.0,0.0,0.0,no-damage
4,guatemala-volcano_00000007_post_disaster.json,guatemala-volcano,volcano,0.0,0.0,0.0,9.0,0.0,0.0,no-damage


In [None]:
train_imgs_max_labels['final_label'].value_counts()

no-damage                                               1370
none                                                     516
destroyed                                                357
major-damage                                             261
minor-damage                                             157
un-classified                                             70
no-damage, destroyed                                      11
minor-damage, major-damage                                 7
no-damage, minor-damage                                    7
no-damage, major-damage                                    6
major-damage, un-classified                                6
minor-damage, destroyed                                    6
major-damage, destroyed                                    5
no-damage, un-classified                                   5
minor-damage, un-classified                                4
no-damage, minor-damage, major-damage                      3
no-damage, destroyed, un

In [None]:
train_imgs_max_labels[train_imgs_max_labels['final_label'] == 'destroyed'].head()

Unnamed: 0,file_name,disaster,disaster_type,destroyed,major-damage,minor-damage,no-damage,un-classified,none,final_label
2,guatemala-volcano_00000002_post_disaster.json,guatemala-volcano,volcano,1.0,0.0,0.0,0.0,0.0,0.0,destroyed
6,guatemala-volcano_00000010_post_disaster.json,guatemala-volcano,volcano,2.0,0.0,0.0,0.0,0.0,0.0,destroyed
40,hurricane-florence_00000043_post_disaster.json,hurricane-florence,flooding,1.0,0.0,0.0,0.0,0.0,0.0,destroyed
101,hurricane-florence_00000147_post_disaster.json,hurricane-florence,flooding,4.0,0.0,0.0,0.0,0.0,0.0,destroyed
295,hurricane-florence_00000475_post_disaster.json,hurricane-florence,flooding,28.0,0.0,0.0,7.0,0.0,0.0,destroyed


In [None]:
train_json_img_names, train_png_img_names = check_all_images_in_folder(img_dir='train')

In [None]:
train_json_img_names == train_png_img_names

True

# Save images to label directories 

In [None]:
import datetime
import os
import pandas as pd
import shutil
import traceback

def save_images_to_label_dirs(img_dir, specified_labels=("no-damage", "destroyed"), verbose_idx=100):
    """Saves images to directory given their max labels, only for specified labels.

    Creates `<img_dir>/ground-truth-labels-<timestamp>/<label>/` for every
    requested label and copies into it each PNG from
    `<img_dir>/post-disaster-images` whose `final_label` in the max-labels CSV
    equals that label exactly (images with tied, comma-joined labels are
    therefore not copied).

    If a copy fails with an OS-level error (e.g. a missing image), the
    traceback is printed and the remaining images of that label are skipped;
    the next label is still processed.

    :param img_dir: Can be any of the following: ["train", "test", "hold"].
    :param specified_labels: A list of the labels for which we want to save
    images to directory.
    :param verbose_idx: Will print index of image after every `verbose_idx`
    images. Values <= 0 disable the progress output.
    """

    # Get images directory for specified dir
    img_loc_path = img_dir + '/' + "post-disaster-images"

    # Read in max labels (ground truth labels) for specified dir
    df_labels = pd.read_csv(
        img_dir + '/' + f'{img_dir}_post_disaster_images_max_labels.csv.gz',
        low_memory=False)

    # Create directory for labels; the timestamp suffix keeps each run separate
    dir_suffix = '-'.join(str(datetime.datetime.today()).split(' '))
    dir_to_save_labeled_imgs = img_dir + '/' + f'ground-truth-labels-{dir_suffix}'
    os.mkdir(dir_to_save_labeled_imgs)
    print(f'Saving {img_dir} images to label directories in {dir_to_save_labeled_imgs} ...')
    print(f"Selected labels: {', '.join(specified_labels)}")

    # Save images to label dirs (only for specified labels)
    for label in specified_labels:

        # Create new label dir
        new_label_dir = dir_to_save_labeled_imgs + '/' + label
        os.mkdir(new_label_dir)

        # Only the file names are needed, so select that column directly
        label_file_names = df_labels.loc[df_labels['final_label'] == label, 'file_name']
        print(f'Saving {len(label_file_names)} {label} images to {new_label_dir}...')

        img_ctr = 0
        for json_file_name in label_file_names:
            img_src = img_loc_path + '/' + json_file_name.split('.')[0] + '.png'
            try:
                shutil.copy2(img_src, new_label_dir)
            except OSError:
                # Catch only copy failures; anything else (including Ctrl-C)
                # should propagate instead of being silently swallowed.
                traceback.print_exc()
                break

            img_ctr += 1
            if verbose_idx > 0 and img_ctr % verbose_idx == 0:
                print(img_ctr)

        print(f'Saved {img_ctr} {label} images to {new_label_dir}.')

    print('Done.')
        

In [None]:
save_images_to_label_dirs(img_dir='train')

Saving train images to label directories in train/ground-truth-labels-2022-04-20-18:39:35.114357 ...
Selected labels: no-damage, destroyed
Saving 1370 no-damage images to train/ground-truth-labels-2022-04-20-18:39:35.114357/no-damage...
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
Saved 1370 no-damage images to train/ground-truth-labels-2022-04-20-18:39:35.114357/no-damage.
Saving 357 destroyed images to train/ground-truth-labels-2022-04-20-18:39:35.114357/destroyed...
100
200
300
Saved 357 destroyed images to train/ground-truth-labels-2022-04-20-18:39:35.114357/destroyed.
Done.


In [None]:
save_images_to_label_dirs(img_dir='hold')

Saving hold images to label directories in hold/ground-truth-labels-2022-04-20-18:50:25.371671 ...
Selected labels: no-damage, destroyed
Saving 437 no-damage images to hold/ground-truth-labels-2022-04-20-18:50:25.371671/no-damage...
100
200
300
400
Saved 437 no-damage images to hold/ground-truth-labels-2022-04-20-18:50:25.371671/no-damage.
Saving 125 destroyed images to hold/ground-truth-labels-2022-04-20-18:50:25.371671/destroyed...
100
Saved 125 destroyed images to hold/ground-truth-labels-2022-04-20-18:50:25.371671/destroyed.
Done.


In [None]:
save_images_to_label_dirs(img_dir='test')

Saving test images to label directories in test/ground-truth-labels-2022-04-20-18:54:12.256069 ...
Selected labels: no-damage, destroyed
Saving 476 no-damage images to test/ground-truth-labels-2022-04-20-18:54:12.256069/no-damage...
100
200
300
400
Saved 476 no-damage images to test/ground-truth-labels-2022-04-20-18:54:12.256069/no-damage.
Saving 107 destroyed images to test/ground-truth-labels-2022-04-20-18:54:12.256069/destroyed...
100
Saved 107 destroyed images to test/ground-truth-labels-2022-04-20-18:54:12.256069/destroyed.
Done.


# Appendix

In [None]:
import json
import os
import pandas as pd

In [None]:
labels_dir = 'labels/'

In [None]:
# Reading in one JSON to see structure
file_name = "socal-fire_00001384_post_disaster.json"

# The `with` block closes the file on exit, so no explicit close() is needed
with open(labels_dir + file_name) as f:
    data = json.load(f)

In [None]:
data
# Looks like this image has no identified buildings
# The list "lng_lat" is empty

{'features': {'lng_lat': [], 'xy': []},
 'metadata': {'capture_date': '2018-11-14T18:42:58.000Z',
  'catalog_id': '103001008513F200',
  'disaster': 'socal-fire',
  'disaster_type': 'fire',
  'gsd': 2.5700748,
  'height': 1024,
  'id': 'MjU4MTA1Ng.EOdtzG5Wlt9Emjh2Lxz3vT2wIQ4',
  'img_name': 'socal-fire_00001384_post_disaster.png',
  'off_nadir_angle': 32.6742,
  'original_height': 1024,
  'original_width': 1024,
  'pan_resolution': 0.64383847,
  'provider_asset_type': 'WORLDVIEW02',
  'sensor': 'WORLDVIEW02',
  'sun_azimuth': 163.31218,
  'sun_elevation': 35.885086,
  'target_azimuth': 334.1289,
  'width': 1024}}

In [None]:
# Can use "disaster" and "disaster_type" features to check for the types of disasters

In [None]:
data.keys()

dict_keys(['features', 'metadata'])

In [None]:
data['features'].keys()

dict_keys(['lng_lat', 'xy'])

In [None]:
data['features']['lng_lat']

[{'properties': {'feature_type': 'building',
   'subtype': 'minor-damage',
   'uid': '88703461-a33d-4327-9244-a0d4e2242ede'},
  'wkt': 'POLYGON ((-90.83554484998086 14.43845885230631, -90.83537287728352 14.4384423422973, -90.83538037422657 14.43831945352924, -90.83540560603248 14.43831968526952, -90.83540995792454 14.43824243758745, -90.83550367603263 14.43824329830721, -90.83550436486316 14.43826526135382, -90.83554937919234 14.43827006610289, -90.83554796853291 14.43832274927794, -90.83562901982741 14.43832876323568, -90.83562374139244 14.43840863722861, -90.83556340771383 14.43840369180101, -90.83556321239615 14.43842389019052, -90.83554879421308 14.43842375777654, -90.83554484998086 14.43845885230631))'},
 {'properties': {'feature_type': 'building',
   'subtype': 'destroyed',
   'uid': 'e168e405-3479-44ee-849a-7af2ed32dee1'},
  'wkt': 'POLYGON ((-90.83658244456636 14.43748886352666, -90.8364608085529 14.43748599046975, -90.83645960893244 14.43742362233364, -90.83658299625326 14.437

In [None]:
data['features']['lng_lat'][0]

{'properties': {'feature_type': 'building',
  'subtype': 'minor-damage',
  'uid': '88703461-a33d-4327-9244-a0d4e2242ede'},
 'wkt': 'POLYGON ((-90.83554484998086 14.43845885230631, -90.83537287728352 14.4384423422973, -90.83538037422657 14.43831945352924, -90.83540560603248 14.43831968526952, -90.83540995792454 14.43824243758745, -90.83550367603263 14.43824329830721, -90.83550436486316 14.43826526135382, -90.83554937919234 14.43827006610289, -90.83554796853291 14.43832274927794, -90.83562901982741 14.43832876323568, -90.83562374139244 14.43840863722861, -90.83556340771383 14.43840369180101, -90.83556321239615 14.43842389019052, -90.83554879421308 14.43842375777654, -90.83554484998086 14.43845885230631))'}

In [None]:
data['features']['lng_lat'][0]['properties']

{'feature_type': 'building',
 'subtype': 'minor-damage',
 'uid': '88703461-a33d-4327-9244-a0d4e2242ede'}

In [None]:
data['features']['lng_lat'][0]['properties']['subtype']

'minor-damage'

In [None]:
# To loop through all the features (buildings, etc.) in "lng_lat" list for an image:
for feature in data['features']['lng_lat']:
    properties = feature['properties']
    feature_type = properties['feature_type']
    damage_level = properties['subtype']

In [None]:
# Prototype: collect one record per labeled feature from the first few label
# files, then build the DataFrame in a single step (row-wise DataFrame.append
# is quadratic and no longer exists in pandas 2).
records = []
for img_json_idx, img_label_json_file_name in enumerate(os.listdir(labels_dir)):

    # Only look at directory entries 0-5. Checked before the filter below so
    # the loop still stops when entry 5 is not a post-disaster file.
    if img_json_idx > 5:
        break

    # Make sure it's a post-disaster image
    if 'post_disaster' not in img_label_json_file_name:
        continue

    with open(labels_dir + img_label_json_file_name) as f:
        data = json.load(f)

    for idx, feature in enumerate(data['features']['lng_lat']):
        properties = feature['properties']
        records.append({
            'file_name': img_label_json_file_name,
            'feature_index': int(idx),
            'disaster': data['metadata']['disaster'],
            'disaster_type': data['metadata']['disaster_type'],
            'feature_type': properties['feature_type'],
            'damage_level': properties['subtype'],
        })

df = pd.DataFrame(
    records,
    columns=['file_name', 'feature_index', 'disaster', 'disaster_type', 'feature_type', 'damage_level'])

In [None]:
file_name = "hurricane-matthew_00000135_post_disaster.json"

# The `with` block closes the file on exit, so no explicit close() is needed
with open(labels_dir + file_name) as f:
    data = json.load(f)

In [None]:
len(data['features']['lng_lat'])

22

In [None]:
df

Unnamed: 0,file_name,feature_index,disaster,disaster_type,feature_type,damage_level
0,hurricane-matthew_00000135_post_disaster.json,0.0,hurricane-matthew,wind,building,minor-damage
1,hurricane-matthew_00000135_post_disaster.json,1.0,hurricane-matthew,wind,building,destroyed
2,hurricane-matthew_00000135_post_disaster.json,2.0,hurricane-matthew,wind,building,major-damage
3,hurricane-matthew_00000135_post_disaster.json,3.0,hurricane-matthew,wind,building,destroyed
4,hurricane-matthew_00000135_post_disaster.json,4.0,hurricane-matthew,wind,building,major-damage
5,hurricane-matthew_00000135_post_disaster.json,5.0,hurricane-matthew,wind,building,minor-damage
6,hurricane-matthew_00000135_post_disaster.json,6.0,hurricane-matthew,wind,building,destroyed
7,hurricane-matthew_00000135_post_disaster.json,7.0,hurricane-matthew,wind,building,destroyed
8,hurricane-matthew_00000135_post_disaster.json,8.0,hurricane-matthew,wind,building,major-damage
9,hurricane-matthew_00000135_post_disaster.json,9.0,hurricane-matthew,wind,building,major-damage


In [None]:
df2 = pd.concat([df, pd.get_dummies(df['damage_level'])], axis=1)

In [None]:
df2

Unnamed: 0,file_name,feature_index,disaster,disaster_type,feature_type,damage_level,destroyed,major-damage,minor-damage,no-damage
0,hurricane-matthew_00000135_post_disaster.json,0.0,hurricane-matthew,wind,building,minor-damage,0,0,1,0
1,hurricane-matthew_00000135_post_disaster.json,1.0,hurricane-matthew,wind,building,destroyed,1,0,0,0
2,hurricane-matthew_00000135_post_disaster.json,2.0,hurricane-matthew,wind,building,major-damage,0,1,0,0
3,hurricane-matthew_00000135_post_disaster.json,3.0,hurricane-matthew,wind,building,destroyed,1,0,0,0
4,hurricane-matthew_00000135_post_disaster.json,4.0,hurricane-matthew,wind,building,major-damage,0,1,0,0
5,hurricane-matthew_00000135_post_disaster.json,5.0,hurricane-matthew,wind,building,minor-damage,0,0,1,0
6,hurricane-matthew_00000135_post_disaster.json,6.0,hurricane-matthew,wind,building,destroyed,1,0,0,0
7,hurricane-matthew_00000135_post_disaster.json,7.0,hurricane-matthew,wind,building,destroyed,1,0,0,0
8,hurricane-matthew_00000135_post_disaster.json,8.0,hurricane-matthew,wind,building,major-damage,0,1,0,0
9,hurricane-matthew_00000135_post_disaster.json,9.0,hurricane-matthew,wind,building,major-damage,0,1,0,0


In [None]:
# Select the indicator columns with a list (double brackets): passing several
# bare column names to a groupby is deprecated and rejected by newer pandas.
df3 = df2.groupby(['file_name', 'disaster', 'disaster_type'])[['destroyed', 'major-damage', 'minor-damage', 'no-damage']].sum().reset_index()
df3

  """Entry point for launching an IPython kernel.


Unnamed: 0,file_name,disaster,disaster_type,destroyed,major-damage,minor-damage,no-damage
0,hurricane-matthew_00000135_post_disaster.json,hurricane-matthew,wind,11,8,3,0
1,socal-fire_00000356_post_disaster.json,socal-fire,fire,5,1,0,14


In [None]:
def get_max_label(row):
    """Prototype: return the single damage level with the highest count in `row`.

    Only the four building damage levels are considered, and when several
    levels tie, the first one in the order below is returned.

    NOTE(review): this reuses the name of the tie-aware, six-level
    `get_max_label` defined earlier in the notebook, so running this cell
    replaces that version for the rest of the kernel session.

    :param row: Mapping-like object indexable by each damage-level name.
    :return: Name of the damage level with the largest count.
    """
    damage_levels = ('no-damage', 'minor-damage', 'major-damage', 'destroyed')
    # max() keeps the first maximal element, i.e. ties resolve by list order
    return max(damage_levels, key=lambda level: row[level])

In [None]:
df3['final_label'] = df3.apply(lambda row: get_max_label(row), axis=1)

In [None]:
df3

Unnamed: 0,file_name,disaster,disaster_type,destroyed,major-damage,minor-damage,no-damage,final_label
0,hurricane-matthew_00000135_post_disaster.json,hurricane-matthew,wind,11,8,3,0,destroyed
1,socal-fire_00000356_post_disaster.json,socal-fire,fire,5,1,0,14,no-damage


# Appendix: Create image labels for Google Cloud AutoML Vision

In [None]:
import os
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Absolute path (under the '/content/drive' mount point) so this cell works
# no matter what the current working directory is, and can be safely re-run.
MY_PATH = "/content/drive/MyDrive/COMS E6998 PDL Sys Perf/Project/"
os.chdir(MY_PATH)

In [None]:
os.getcwd()

'/content/drive/MyDrive/COMS E6998 PDL Sys Perf/Project'

In [None]:
ls

 create_image_labels.ipynb
 df2_gcloud_img_locs.csv
 df_gcloud_img_locs.csv
 [0m[01;34mdl-project[0m/
'Final Presentation notes.gdoc'
 [01;34mgt-labels[0m/
 [01;34mhold[0m/
 [01;34mold[0m/
 [01;34mtest[0m/
 [01;34mtrain[0m/
 using-autogluon.ipynb
'Using AutoML to Classify Damage Levels in Natural Disaster Images.gslides'


In [None]:
# Create CSV as follows for training/testing data bucket locations

# [set,]image_path[,label]
# TRAIN,gs://My_Bucket/sample1.jpg,cat
# TEST,gs://My_Bucket/sample2.jpg,dog

In [None]:
df_gcloud_img_locs = pd.DataFrame(columns=['set', 'image_path', 'label'])
df_gcloud_img_locs

Unnamed: 0,set,image_path,label


In [None]:
MAIN_BUCKET = "gs://images2-disaster-classify-project"
GDRIVE_IMG_FOLDER = "dl-project"
# bucket_path = "gs://images_disaster_classify_project/train/no-damage/"

In [None]:
# Collect one row per image, then build the frame in a single step. Assigning
# a fresh DataFrame (instead of appending to the existing one) means re-running
# this cell does not duplicate rows.
gcloud_rows = []
for img_set in os.listdir(GDRIVE_IMG_FOLDER):  # train, validation or test
    if img_set not in ['train', 'validation', 'test']:
        continue
    for label in os.listdir("/".join([GDRIVE_IMG_FOLDER, img_set])):  # no-damage or destroyed
        if label not in ['no-damage', 'destroyed']:
            continue
        for img in os.listdir("/".join([GDRIVE_IMG_FOLDER, img_set, label])):
            gcloud_rows.append({
                'set': img_set.upper(),
                'image_path': "/".join([MAIN_BUCKET, img_set, label, img]),
                'label': label.replace('-', '_')
            })

df_gcloud_img_locs = pd.DataFrame(gcloud_rows, columns=['set', 'image_path', 'label'])

In [None]:
df_gcloud_img_locs.shape

(2872, 3)

In [None]:
df_gcloud_img_locs['set'].value_counts()

TRAIN         1727
TEST           583
VALIDATION     562
Name: set, dtype: int64

In [None]:
df_gcloud_img_locs

Unnamed: 0,set,image_path,label
0,TEST,gs://images2-disaster-classify-project/test/de...,destroyed
1,TEST,gs://images2-disaster-classify-project/test/de...,destroyed
2,TEST,gs://images2-disaster-classify-project/test/de...,destroyed
3,TEST,gs://images2-disaster-classify-project/test/de...,destroyed
4,TEST,gs://images2-disaster-classify-project/test/de...,destroyed
...,...,...,...
2867,VALIDATION,gs://images2-disaster-classify-project/validat...,no_damage
2868,VALIDATION,gs://images2-disaster-classify-project/validat...,no_damage
2869,VALIDATION,gs://images2-disaster-classify-project/validat...,no_damage
2870,VALIDATION,gs://images2-disaster-classify-project/validat...,no_damage


In [None]:
df_gcloud_img_locs.loc[0]['image_path']

'gs://images2-disaster-classify-project/test/destroyed/hurricane-florence_00000029_post_disaster.png'

In [None]:
df_gcloud_img_locs.to_csv('df2_gcloud_img_locs.csv', index=False)

In [None]:
df2_gcloud_img_locs = pd.read_csv('df2_gcloud_img_locs.csv')

In [None]:
df2_gcloud_img_locs[df2_gcloud_img_locs['set'] == 'TEST']['label'].value_counts()

no_damage    476
destroyed    107
Name: label, dtype: int64

In [None]:
'no-damage/'[:-1].replace('-', '_')

'no_damage'

In [None]:
"/".join([GDRIVE_IMG_FOLDER, img_set])

'dl-project/test'