In [1]:
import os
import glob
import json

import numpy as np
import pandas as pd

# Parse existing data and map images to towns

In [2]:
# Integer id assigned to each town; the keys are the town names used in the
# geojson file names below.
town_map = {
    "mixco_1_and_ebenezer": 0,
    "mixco_3": 1,
    "borde_soacha": 2,
    "borde_rural": 3,
    "dennery": 4,
    "castries": 5,
    # NOTE(review): gros_islet shares id 5 with castries -- confirm this
    # grouping is intentional and not a typo for 6.
    "gros_islet": 5,
}


def _feature_ids(geojson_path):
    """Return the ``id`` of every feature in the GeoJSON file at ``geojson_path``.

    Raises FileNotFoundError if the file does not exist and KeyError if the
    document has no ``features`` list or a feature has no ``id``.
    """
    # JSON interchange text is UTF-8; be explicit instead of relying on the
    # platform's locale encoding.
    with open(geojson_path, encoding="utf-8") as geojson_file:
        return [feature["id"] for feature in json.load(geojson_file)["features"]]


# feature id -> town id, collected from the train and (when present) test files
file_map = {}

for town, town_id in town_map.items():
    train_file = "data/geojsons/train-%s.geojson" % town
    test_file = "data/geojsons/test-%s.geojson" % town

    # The train file is required: a missing file raises.
    file_map.update(dict.fromkeys(_feature_ids(train_file), town_id))

    # The test file is optional: towns without one are skipped.
    if os.path.exists(test_file):
        file_map.update(dict.fromkeys(_feature_ids(test_file), town_id))

In [3]:
# Load the training labels, using the `id` column as the index.
df = pd.read_csv("data/train_labels.csv", index_col="id")

# Keep only the rows whose `verified` flag equals True.
is_verified = df["verified"].eq(True)
verified = df.loc[is_verified]
verified.head()

Unnamed: 0_level_0,verified,concrete_cement,healthy_metal,incomplete,irregular_metal,other
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7a3f2a10,True,1.0,0.0,0.0,0.0,0.0
7a1f731e,True,0.0,0.0,0.0,1.0,0.0
7a424ad8,True,0.0,1.0,0.0,0.0,0.0
7a3edc5e,True,0.0,1.0,0.0,0.0,0.0
7a303a6e,True,0.0,1.0,0.0,0.0,0.0


In [4]:
# Ids of the verified rows, in table order.
files = verified.index.tolist()

# Every column after the first one (`verified`) is treated as a per-class
# score; the position of the row-wise maximum becomes the integer label.
class_columns = df.columns[1:]
labels = verified.loc[:, class_columns].to_numpy().argmax(axis=1)

# Town id of each verified row, looked up by its id.
train_ids = [file_map[image_id] for image_id in files]

# Class distribution in percent, then the number of rows per town id.
label_counts = np.bincount(labels)
print(label_counts / len(labels) * 100)
print("Training town distribution : ", np.bincount(train_ids))

[ 9.32750504 49.63685272  4.49226631 35.24546066  1.29791527]
Training town distribution :  [3351  159 5995 4361 1004]


# Generate dataframes augmented with town labels

In [5]:
# One row per verified id: its integer class label and its town id.
train_columns = {"label": labels, "filelabel": train_ids}
new_train_ids = pd.DataFrame(
    data=train_columns,
    index=files,
    columns=["label", "filelabel"],
)

# Persist the table; the index is written under the header "id".
new_train_ids.to_csv("new_train.csv", index_label="id")
new_train_ids.head()

Unnamed: 0,label,filelabel
7a3f2a10,0,3
7a1f731e,3,3
7a424ad8,1,3
7a3edc5e,1,3
7a303a6e,1,3


In [6]:
# Test ids are the entry names in the test image folder with the extension
# removed. os.path.splitext drops whatever extension is present instead of
# assuming it is exactly four characters long.
# Note: os.listdir returns entries in arbitrary order, so the row order of
# new_test.csv follows the directory listing.
test_files = [
    os.path.splitext(fname)[0]
    for fname in os.listdir("data/test_aligned/test_images/")
]

# Town id of each test id; an id missing from file_map raises KeyError.
test_ids = [file_map[fid] for fid in test_files]

print("Testing town distribution : ", np.bincount(test_ids))

# One row per test id holding its town id; the index is written under "id".
new_test_ids = pd.DataFrame(data=test_ids, index=test_files, columns=["filelabel"])
new_test_ids.to_csv("new_test.csv", index_label="id")
new_test_ids.head()

Testing town distribution :  [1568   58 3017 2160  522]


Unnamed: 0,filelabel
7a459b34,3
7a4b61ea,4
7a4e8258,3
7a511018,2
7a4f5a34,0
