In [1]:
#hide
import os
from pathlib import Path
import matplotlib.pyplot as plt
import glob
import time

import pandas as pd
pd.options.mode.chained_assignment=None

from fastai.vision.all import *

In [2]:
# Root folder of the downloaded GeoLifeCLEF 2021 files -- edit to match your machine
BASE_PATH = Path("/storage/geolifeclef-2021/")
DATA_PATH = BASE_PATH.joinpath("data")

# Folder that receives the generated submission files (created if missing)
SUBMISSION_PATH = Path("submissions")
SUBMISSION_PATH.mkdir(parents=True, exist_ok=True)

In [3]:
import sys

sys.path.append( '../' )
from GLC.data_loading.common import load_patch
from GLC.plotting import visualize_observation_patch
from GLC.metrics import top_30_error_rate
from GLC.metrics import top_k_error_rate_from_sets
from GLC.metrics import predict_top_30_set, predict_top_k_set

# Sanity check: load the patches of one sample observation (id 10000000) with
# the project helper `load_patch`, then report how many arrays came back and
# the shape and dtype of each.
# PATCHES_PATH is also read later by get_x(), so it must be defined first.
PATCHES_PATH = DATA_PATH / "patches/"
patch = load_patch(10000000, PATCHES_PATH)

print("Number of data sources: {}".format(len(patch)))
print("Arrays shape: {}".format([p.shape for p in patch]))
print("Data types: {}".format([p.dtype for p in patch]))

Number of data sources: 4
Arrays shape: [(256, 256, 3), (256, 256), (256, 256), (256, 256)]
Data types: [dtype('uint8'), dtype('uint8'), dtype('int16'), dtype('uint8')]


In [4]:
def get_x(r):
    """Build the path of the RGB patch image that belongs to one observation row.

    Only the path is assembled; the file is neither opened nor checked for
    existence.

    Parameters
    ----------
    r : mapping-like (e.g. a pandas row)
        Must provide an ``'observation_id'`` entry. The string form of that id
        has to start with "1" (region "fr") or "2" (region "us").

    Returns
    -------
    rgb_filename : pathlib.Path
        ``PATCHES_PATH / <region> / <last 2 chars of id> / <the 2 chars before
        those> / <id>_rgb.jpg``, where ``PATCHES_PATH`` is the module-level
        patches folder.

    Raises
    ------
    ValueError
        If the observation id is empty or does not start with "1" or "2".
    """
    observation_id = str(r['observation_id'])

    # Slice rather than index so that an empty id reaches the ValueError below
    # instead of failing with an unrelated IndexError.
    region_id = observation_id[:1]
    if region_id == "1":
        region = "fr"
    elif region_id == "2":
        region = "us"
    else:
        raise ValueError("Incorrect 'observation_id' {}, can not extract region id from it".format(observation_id))

    # Patches are sharded into two folder levels taken from the end of the id.
    subfolder1 = observation_id[-2:]
    subfolder2 = observation_id[-4:-2]

    filename = Path(PATCHES_PATH) / region / subfolder1 / subfolder2 / observation_id

    return filename.with_name(filename.stem + "_rgb.jpg")

def get_y(r):
    """Return the label of an observation row, i.e. its ``'species_id'`` entry."""
    return r['species_id']

def splitter(df):
    """Split a frame's index labels by the ``'subset'`` column.

    Returns a ``(train, valid)`` pair of lists: the index labels of the rows
    whose ``'subset'`` equals ``'train'``, and the labels of all other rows.
    """
    is_train = df['subset'] == 'train'
    train_labels = df.index[is_train].tolist()
    valid_labels = df.index[~is_train].tolist()
    return train_labels, valid_labels

def get_observations(data_path):
    """Read the French and US training observations into a single frame.

    Parameters
    ----------
    data_path : pathlib.Path
        Folder that contains the ``observations`` sub-folder.

    Returns
    -------
    pandas.DataFrame
        French rows followed by US rows, indexed by ``observation_id``.
    """
    observations_dir = data_path / "observations"
    frames = [
        pd.read_csv(observations_dir / csv_name, sep=";", index_col="observation_id")
        for csv_name in ("observations_fr_train.csv", "observations_us_train.csv")
    ]
    return pd.concat(frames)

In [5]:
# Load the DataLoaders object from 'dataloaders.pkl' (relative path, so it is
# resolved against the current working directory).
# NOTE(review): presumably the pickled object refers to get_x / get_y / splitter
# by name, which would be why they are defined before this cell -- confirm.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
dls = load_pickle('dataloaders.pkl')

In [6]:
# Load the held-out observations frame. The file is named "..._test_df" but the
# frame is used below as the labelled validation set (its species_id column is
# scored against the predictions).
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df_val = load_pickle('observations_test_df.pkl')

In [7]:
# Build a learner on the loaded DataLoaders with a resnet34 architecture; the
# trained weights are loaded from disk in the next cell.
learn = cnn_learner(dls, resnet34)

In [8]:
# Load the saved state named 'geolife-21-cnn_0' into the learner.
# NOTE(review): presumably resolved inside the learner's model directory -- confirm
# where the file is expected to live.
learn = learn.load(file='geolife-21-cnn_0')

In [9]:
# Ground-truth species ids of the validation observations, cast to int.
y_val = df_val.species_id.astype(int)

In [10]:
# Wrap the validation frame in a test DataLoader (batch size 128) and run
# inference on it. The second value returned by get_preds is discarded.
dl_t = learn.dls.test_dl(df_val, bs=128)
preds, _ = learn.get_preds(dl=dl_t)

In [11]:
# Turn each prediction into a set of 30 candidates and translate them through
# the DataLoaders vocab before scoring against the true species ids.
# NOTE(review): presumably predict_top_k_set returns class indices and the vocab
# maps them back to species ids -- confirm against GLC.metrics.
preds_ids = array([learn.dls.vocab[pred] for pred in predict_top_k_set(preds, 30)])
score_val = top_k_error_rate_from_sets(y_val, preds_ids)
# Printed as a percentage with one decimal place.
print("Top-30 error rate: {:.1%}".format(score_val))

Top-30 error rate: 81.1%


In [15]:
def get_test_observations(data_path):
    """Read the French and US test observations into a single frame.

    Parameters
    ----------
    data_path : str / pathlib.Path
        Folder that contains the ``observations`` sub-folder. This argument is
        used for the lookup; it was previously ignored in favour of the global
        ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        French rows followed by US rows. ``observation_id`` is kept as a string
        column. Each file keeps its own row index, so index labels repeat
        across the two regions.
    """
    observations_dir = Path(data_path) / "observations"

    df_fr_test = pd.read_csv(observations_dir / "observations_fr_test.csv", sep=";",
                             low_memory=False, dtype={'observation_id': str})
    df_us_test = pd.read_csv(observations_dir / "observations_us_test.csv", sep=";",
                             low_memory=False, dtype={'observation_id': str})

    df_test = pd.concat((df_fr_test, df_us_test))
    return df_test

def generate_submission_file(filename, corrected_observation_ids, s_pred):
    """Write a two-column submission CSV (``Id``, ``Predicted``).

    Parameters
    ----------
    filename : path-like
        Destination CSV file.
    corrected_observation_ids : array-like
        One identifier per row, written to the ``Id`` column.
    s_pred : iterable of iterables
        One collection of predicted values per row; each collection is written
        to the ``Predicted`` column as a space-separated string.
    """
    predicted_column = []
    for species_ids in s_pred:
        predicted_column.append(" ".join(str(species_id) for species_id in species_ids))

    submission = pd.DataFrame({"Id": corrected_observation_ids, "Predicted": predicted_column})
    submission.to_csv(filename, index=False)

In [None]:
# Load the test observations of both regions, plus the separate CSV that
# provides the "Id" column written to the submission file.
df_test = get_test_observations(DATA_PATH)
df_test_obs_id_mapping = pd.read_csv(BASE_PATH / "test_observation_ids_mapping.csv", sep=";")

In [17]:
# Run inference on the test observations (batch size 256).
# NOTE: dl_t, preds and preds_ids are the same names used for the validation
# run above, so this cell overwrites the validation results.
dl_t = learn.dls.test_dl(df_test, bs=256)
preds, _ = learn.get_preds(dl=dl_t)
preds_ids = array([learn.dls.vocab[pred] for pred in predict_top_k_set(preds, 30)])

# Generate the submission file
# NOTE(review): ids come from df_test_obs_id_mapping while predictions follow
# df_test row order -- this assumes both are in the same order; confirm.
generate_submission_file(SUBMISSION_PATH/"fastai_cnn_rbg.csv", df_test_obs_id_mapping["Id"], preds_ids)

In [18]:
# Upload the generated CSV to the geolifeclef-2021 competition with the kaggle CLI.
# The file name must match the one written by generate_submission_file above.
!kaggle competitions submit -c geolifeclef-2021 -f {SUBMISSION_PATH/"fastai_cnn_rbg.csv"} -m "fastai cnn submission"

100%|██████████████████████████████████████| 5.67M/5.67M [00:00<00:00, 10.8MB/s]
Successfully submitted to GeoLifeCLEF 2021 - LifeCLEF 2021 x FGVC8