# Pre-Processing

In [1]:
import os
import requests
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt

In [2]:
# Project-relative root used for the CSV inputs/outputs under `{path}/data`.
path = "."

# Root of the image dataset; the three split folders below live directly under it.
dataset_root = 'D:/Dataset/aidea-farmland-crops-autumn'

training_folder = F'{dataset_root}/train'

public_folder = F'{dataset_root}/public'

private_folder = F'{dataset_root}/private'

# Names of the immediate sub-folders of the training folder (one per label).
# The fallback tuple keeps `labels` a list (empty) when the folder does not exist;
# the previous fallback put None in the directory slot, so `labels` became None.
labels = next(os.walk(training_folder), (None, [], []))[1]

## Helper Functions

In [3]:
# NOTE(review): the original note here said PIL automatically corrects width/height
# according to the EXIF orientation. `image.size` is recorded below exactly as PIL
# reports it; Pillow normally applies the orientation only via
# `ImageOps.exif_transpose` -- confirm before relying on width/height being rotated.
def get_image_information_without_loading(path, folder=None, limit=None):
    """Collect size and selected EXIF fields for the files in a directory.

    Every entry returned by ``os.listdir(path)`` (truncated to the first
    ``limit`` entries when ``limit`` is given) is opened with PIL and
    described by one row.

    Parameters
    ----------
    path : str
        Directory whose entries are opened as images.
    folder : str, optional
        Value stored in the ``label`` column for every row.
    limit : int, optional
        Maximum number of directory entries to process; ``None`` means all.

    Returns
    -------
    pandas.DataFrame
        Columns ``file``, ``label``, ``shape``, ``height``, ``width``,
        ``taken_datetime``, ``make``, ``model`` and ``angle``. The EXIF-based
        columns are ``None`` when the image has no EXIF data or lacks the tag:
        36867 (taken_datetime, parsed with ``%Y:%m:%d %H:%M:%S``), 271 (make),
        272 (model) and 274 (angle, stored as the raw EXIF code).
    """
    columns = ['file', 'label', 'shape', 'height', 'width', 'taken_datetime', 'make', 'model', 'angle']
    data = {key: [] for key in columns}
    for file in os.listdir(path)[:limit]:
        filedir = F'{path}/{file}'
        # The context manager closes the underlying file handle; without it a
        # scan over a large folder leaks one open file per image.
        with Image.open(filedir) as image:
            size = image.size
            # Not every PIL image class provides `_getexif`; treat that as "no EXIF".
            read_exif = getattr(image, '_getexif', None)
            # Parse the EXIF block once instead of once per looked-up tag.
            exif = (read_exif() if read_exif is not None else None) or {}
        data['file'].append(file)
        data['label'].append(folder)
        data['shape'].append(size)
        data['width'].append(size[0])
        data['height'].append(size[1])
        data['taken_datetime'].append(
            pd.to_datetime(exif[36867], format="%Y:%m:%d %H:%M:%S") if 36867 in exif else None)
        data['make'].append(exif.get(271))
        data['model'].append(exif.get(272))
        data['angle'].append(exif.get(274))
    return pd.DataFrame(data)

# Note: go_through_folders
def go_through_folders_to_get_image_information(path, limit=None, verbose=0):
    """Describe the images in every immediate sub-folder of ``path``.

    Each sub-folder name is passed on as the ``folder`` argument of
    ``get_image_information_without_loading``, and the per-folder frames are
    stacked into one table.

    Parameters
    ----------
    path : str
        Directory whose immediate sub-folders are scanned.
    limit : int, optional
        Forwarded to ``get_image_information_without_loading`` for each folder.
    verbose : int, optional
        When truthy, print one progress line per folder.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-folder frames with a fresh 0..n-1 index, or an
        empty DataFrame when ``path`` has no sub-folders.
    """
    # Only the top level is needed: take the first os.walk entry (if any).
    _, folders, _ = next(os.walk(path), (path, [], []))
    frames = []
    for idx, folder in enumerate(folders):
        folderdir = F'{path}/{folder}'
        if verbose:
            print(F'{idx+1}/{len(folders)}, folderdir: {folderdir}')
        frames.append(get_image_information_without_loading(folderdir, folder, limit=limit))
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per folder.
    return pd.concat(frames).reset_index(drop=True)

In [4]:
def get_angle_from_exif(path):
    """Return the raw EXIF tag 274 value of the image at ``path``.

    Parameters
    ----------
    path : str
        Image file to open with PIL.

    Returns
    -------
    The value stored under EXIF tag 274, or ``None`` when the image has no
    EXIF data or does not contain that tag.
    """
    # Close the file handle as soon as the metadata has been read.
    with Image.open(path) as image:
        # Not every PIL image class provides `_getexif`; treat that as "no EXIF".
        read_exif = getattr(image, '_getexif', None)
        # Parse the EXIF block once rather than once per check.
        exif = read_exif() if read_exif is not None else None
    return exif.get(274) if exif else None

## Read Image and EXIF
讀取圖片的基本資料，需要注意並非每張圖片都有 EXIF 資訊

### training

In [None]:
# Build the per-image description table for the training set and save it as CSV.
train_tag_loc_coor = pd.read_csv(F'{path}/data/train_tag_loc_coor.csv')
# Scan every image (limit=None). A leftover limit=5 only sampled five files per
# label folder, which cannot produce the full table that later cells load.
train_describe = go_through_folders_to_get_image_information(training_folder, limit=None, verbose=1)
# Get Datetime from EXIF (parsed once and reused for the three derived columns)
taken_datetime = pd.to_datetime(train_describe['taken_datetime'])
train_describe['taken_month'] = taken_datetime.dt.month
train_describe['taken_year'] = taken_datetime.dt.year
train_describe['taken_hour'] = taken_datetime.dt.hour
# Get Angle from EXIF: codes 1/3/6/8 become 0/180/270/90; any other value is kept as-is
exif_code_to_degrees = {1: 0, 3: 180, 6: 270, 8: 90}
train_describe['angle'] = train_describe['angle'].apply(lambda x: exif_code_to_degrees.get(x, x))
# Inner join on the file name with the tag/location/coordinate table.
train_describe = pd.merge(train_describe, train_tag_loc_coor, on='file')
# Written under data/ so that it is the same file the following cell reads back
# (and consistent with where the public/private tables are saved).
train_describe.to_csv(F'{path}/data/train_tag_loc_coor_describe.csv', index=False)

In [5]:
# Load the saved training description table and show summary statistics
# for its numeric columns.
train_describe_csv = F'{path}/data/train_tag_loc_coor_describe.csv'
train_describe = pd.read_csv(train_describe_csv)
train_describe.describe()

Unnamed: 0,taken_month,taken_year,taken_hour,target_fid,target_x,target_y,town_x,town_y,width,height,angle
count,55096.0,55096.0,55096.0,89514.0,89514.0,89514.0,89514.0,89514.0,89514.0,89514.0,84356.0
mean,7.317174,2019.442428,12.25294,44756.5,0.65077,-33.431352,120.563448,23.622252,2783.064236,3355.647094,75.44739
std,3.112334,0.55025,3.143825,25840.610335,22.651154,92.33117,0.289508,0.576954,1026.27296,1464.427506,121.155522
min,1.0,2019.0,4.0,0.0,-1462.0,-2683.0,120.099205,22.049339,750.0,640.0,0.0
25%,4.0,2019.0,10.0,22378.25,0.0,0.0,120.364716,23.404182,1633.0,1600.0,0.0
50%,9.0,2019.0,12.0,44756.5,0.0,0.0,120.483185,23.778591,3000.0,4000.0,0.0
75%,10.0,2020.0,15.0,67134.75,0.0,0.0,120.665039,23.955526,3120.0,4208.0,270.0
max,12.0,2022.0,20.0,89513.0,1004.0,2043.0,121.760269,25.083782,8000.0,8000.0,270.0


In [7]:
# Check how much data has exif
invalid_takens = train_describe[(train_describe['taken_datetime'].isnull())]
print(F'Invalid Takens: {len(invalid_takens)} ({len(invalid_takens)/len(train_describe)*100:.2f}%)')

Invalid Takens: 34418 (38.45%)


### public

In [None]:
# Build the per-image description table for the public set and save it as CSV.
public_tag_loc_coor = pd.read_csv(F'{path}/data/public_tag_loc_coor.csv')
# Scan the public images. This previously scanned private_folder, so the inner
# merge below was matching private file names against the public tag table.
public_describe = get_image_information_without_loading(public_folder)
# Get Angle from EXIF: codes 1/3/6/8 become 0/180/270/90; any other value is kept as-is
exif_code_to_degrees = {1: 0, 3: 180, 6: 270, 8: 90}
public_describe['angle'] = public_describe['angle'].apply(lambda x: exif_code_to_degrees.get(x, x))
# Inner join on the file name with the tag/location/coordinate table.
public_describe = pd.merge(public_describe, public_tag_loc_coor, on='file')
public_describe.to_csv(F'{path}/data/public_tag_loc_coor_describe.csv', index=False)

### private

In [29]:
# Build the per-image description table for the private set and save it as CSV.
private_tag_loc_coor = pd.read_csv(F'{path}/data/private_tag_loc_coor.csv')
private_exif = get_image_information_without_loading(private_folder)
# Get Angle from EXIF: codes 1/3/6/8 become 0/180/270/90; any other value is kept as-is
degrees_by_exif_code = {1: 0, 3: 180, 6: 270, 8: 90}
private_exif['angle'] = private_exif['angle'].apply(
    lambda code: degrees_by_exif_code[code] if code in degrees_by_exif_code else code
)
# Inner join on the file name with the tag/location/coordinate table.
private_describe = pd.merge(private_exif, private_tag_loc_coor, on='file')
private_describe.to_csv(F'{path}/data/private_tag_loc_coor_describe.csv', index=False)

In [32]:
# Check how much data has exif
invalid_takens = private_describe[(private_describe['taken_datetime'].isnull())]
print(F'Invalid Takens: {len(invalid_takens)} ({len(invalid_takens)/len(private_describe)*100:.2f}%)')

Invalid Takens: 4300 (38.53%)


In [46]:
# Disabled statements kept for reference (not executed); presumably an earlier
# way of producing the CSV that is read below -- TODO confirm or remove.
# private_tag_loc_coor = pd.read_csv(F'{path}/data/private_tag_loc_coor.csv')
# private = pd.merge(private_tag_loc_coor, private, on='file')
# private.to_csv(F'{path}/data/private_tag_loc_coor_describe.csv', index=False)

# Load the saved private description table and count missing values per column.
private_describe_csv = F'{path}/data/private_tag_loc_coor_describe.csv'
private = pd.read_csv(private_describe_csv)
private.isnull().sum()

target_fid            0
file                  0
target_x              0
target_y              0
county_name           0
town_name             0
town_x                0
town_y                0
label             11160
shape                 0
height                0
width                 0
taken_datetime     4300
make               4206
model              4206
angle               734
taken_month        4300
taken_year         4300
taken_hour         4300
dtype: int64

## Get Elevation of Towns
取得各個鄉鎮的海拔資訊

In [9]:
# Script for returning elevation from lat, long, based on open elevation data, which in turn is based on SRTM
def get_elevation(lat, long, timeout=30):
    """Look up the elevation of one coordinate via the Open-Elevation API.

    Parameters
    ----------
    lat, long :
        Latitude and longitude, inserted into the query string as
        ``locations={lat},{long}``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout an unresponsive server would block the notebook indefinitely.

    Returns
    -------
    The ``elevation`` field of the first entry in the response's ``results``.

    Raises
    ------
    requests.HTTPError
        When the service answers with an error status.
    """
    query = ('https://api.open-elevation.com/api/v1/lookup'
             f'?locations={lat},{long}')
    response = requests.get(query, timeout=timeout)
    # Fail with the HTTP error itself rather than a confusing JSON/KeyError later.
    response.raise_for_status()
    payload = response.json()  # json object, various ways you can extract value
    # one approach is to use pandas json functionality:
    elevation = pd.json_normalize(payload, 'results')['elevation'].values[0]
    return elevation

In [47]:
# One row per distinct (county, town, town_x, town_y) combination with the
# number of training rows in it, ordered by ascending count.
town_key_columns = ['county_name', 'town_name', 'town_x', 'town_y']
group_by_town_x_y = (
    train_describe
    .groupby(town_key_columns)
    .size()
    .reset_index(name='counts')
    .sort_values(by='counts')
    .reset_index(drop=True)
)
len(group_by_town_x_y)

211

In [55]:
# Query the elevation of every town row (one get_elevation call per row,
# passing town_y as latitude and town_x as longitude), then save the table.
def _elevation_for_town_row(row):
    return get_elevation(row['town_y'], row['town_x'])

group_by_town_x_y['town_z'] = group_by_town_x_y.apply(_elevation_for_town_row, axis=1)
group_by_town_x_y.to_csv(F'{path}/data/train_groupby_town_elevation.csv', index=None)

## Apply Elevation to Dataset
將海拔資訊加入到資料集中

In [47]:
def get_elevation_from_dataframe_by_county_town(df, county_name, town_name):
    """Return the mean ``town_z`` of the rows matching a county and a town.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``county_name``, ``town_name`` and ``town_z`` columns.
    county_name, town_name :
        Values compared for equality against the two name columns.

    Returns
    -------
    Mean of ``town_z`` over the matching rows (NaN when nothing matches).
    """
    in_county = df['county_name'].eq(county_name)
    in_town = df['town_name'].eq(town_name)
    return df.loc[in_county & in_town, 'town_z'].mean()

In [48]:
# Load the saved town table (with its town_z elevation column) from CSV.
town_elevation_csv = F'{path}/data/train_groupby_town_elevation.csv'
group_by_town_x_y = pd.read_csv(town_elevation_csv)

In [59]:
# Attach each row's town elevation to the training table and save it.
# The mean town_z per (county_name, town_name) is computed once and looked up
# for every row, instead of re-filtering the town table once per image row.
# Rows whose county/town pair is absent from the town table get NaN.
town_z_by_county_town = group_by_town_x_y.groupby(['county_name', 'town_name'])['town_z'].mean()
train_town_keys = pd.MultiIndex.from_frame(train_describe[['county_name', 'town_name']])
train_describe['town_z'] = town_z_by_county_town.reindex(train_town_keys).to_numpy()
train_describe.to_csv(F'{path}/data/train_tag_loc_coor_describe_elevation.csv', index=None)

In [49]:
# Attach each row's town elevation to the public table and save it.
# The mean town_z per (county_name, town_name) is computed once and looked up
# for every row, instead of re-filtering the town table once per image row.
# Rows whose county/town pair is absent from the town table get NaN.
town_z_by_county_town = group_by_town_x_y.groupby(['county_name', 'town_name'])['town_z'].mean()
public_town_keys = pd.MultiIndex.from_frame(public_describe[['county_name', 'town_name']])
public_describe['town_z'] = town_z_by_county_town.reindex(public_town_keys).to_numpy()
public_describe.to_csv(F'{path}/data/public_tag_loc_coor_describe_elevation.csv', index=None)

In [None]:
# Attach each row's town elevation to the private table and save it.
# The mean town_z per (county_name, town_name) is computed once and looked up
# for every row, instead of re-filtering the town table once per image row.
# Rows whose county/town pair is absent from the town table get NaN.
town_z_by_county_town = group_by_town_x_y.groupby(['county_name', 'town_name'])['town_z'].mean()
private_town_keys = pd.MultiIndex.from_frame(private_describe[['county_name', 'town_name']])
private_describe['town_z'] = town_z_by_county_town.reindex(private_town_keys).to_numpy()
private_describe.to_csv(F'{path}/data/private_tag_loc_coor_describe_elevation.csv', index=None)