# [Thanks to the starter kernel provided by @tlorieul](https://www.kaggle.com/code/tlorieul/geolifeclef2022-data-loading-and-visualization)

In [None]:
# Ask CUDA to launch kernels synchronously so that device-side errors are
# reported at the call that caused them. Assigning a plain Python variable has
# no effect on CUDA: the value has to live in the process environment, and it
# has to be set before CUDA is initialised (hence this very first cell).
import os

CUDA_LAUNCH_BLOCKING = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = CUDA_LAUNCH_BLOCKING

In [None]:
!pip install pretrainedmodels

In [None]:
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn.functional as F
import pretrainedmodels

In [None]:
%pylab inline --no-import-all

import os
from pathlib import Path

import pandas as pd


# Root folder of the competition data; adapt it to where the data was downloaded
DATA_PATH = Path("../input") / "geolifeclef-2022-lifeclef-2022-fgvc9"

# Folder receiving the submission files (created if it does not exist yet)
SUBMISSION_PATH = Path("submissions")
SUBMISSION_PATH.mkdir(parents=True, exist_ok=True)

# Clone the GitHub repository
!rm -rf GLC
!git clone https://github.com/maximiliense/GLC

In [None]:
import skimage.io
from skimage.io import imread
import tifffile 
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from tqdm.notebook import tqdm

import cv2
import shutil, json

import tensorflow as tf

import glob, os
import seaborn as sns
import gc, pandas as pd, numpy as np
import warnings
from warnings import WarningMessage, filterwarnings

In [None]:
import torch
# Use the first CUDA device when one is available, otherwise fall back to CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


In [None]:
# Load a capped sample (first 10000 rows per region) of the French and US
# training observations, indexed by observation id.
df_obs_fr_train = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_train.csv",
    sep=";",
    index_col="observation_id",
    nrows=10000,
)
df_obs_us_train = pd.read_csv(
    DATA_PATH / "observations" / "observations_us_train.csv",
    sep=";",
    index_col="observation_id",
    nrows=10000,
)

df_obs_train = pd.concat((df_obs_fr_train, df_obs_us_train))

obs_id_train = df_obs_train.index.values

# These are training observations (the message previously said "testing")
print("Number of observations for training: {}".format(len(df_obs_train)))

df_obs_train.head()

In [None]:
# Count the training observations whose longitude is missing
df_obs_train["longitude"].isna().sum()

In [None]:
from GLC.data_loading.environmental_raster import PatchExtractor
from pathlib import Path
#from GLC.data_loading.common import load_patch


from torch.utils.data import Dataset, DataLoader

# plot environmental rasters

In [None]:
# Patch extractor over the raster folder (size=256), loaded with every
# bioclimatic raster plus the 'sndppt' raster.
extractor_bio = PatchExtractor(DATA_PATH / "rasters", size=256)
extractor_bio.add_all_bioclimatic_rasters()
extractor_bio.append('sndppt')

n_rasters = len(extractor_bio)
print(f"Number of rasters: {n_rasters}")

In [None]:
# Plot every loaded raster around (lat, lon) = (43.61, 3.88)
location = (43.61, 3.88)
fig = plt.figure(figsize=(14, 10))
extractor_bio.plot(location, fig=fig)

In [None]:
# Same raster plot for a second (lat, lon) position
location = (46.783695, 2.072855)
fig = plt.figure(figsize=(14, 10))
extractor_bio.plot(location, fig=fig)

In [None]:
# Extract the patches of all loaded rasters at one (lat, lon) position
bio_batches = extractor_bio[46.783695, 2.072855]


In [None]:
# Report the shape and dtype of every extracted raster patch
patch_shapes = [patch.shape for patch in bio_batches]
patch_dtypes = [patch.dtype for patch in bio_batches]
print(f"Arrays shape: {patch_shapes}")
print(f"Data types: {patch_dtypes}")

In [None]:
#bio_tuple = tuple(bio_batches)

In [None]:
# Display the first extracted raster patch as an image
plt.imshow(bio_batches[0])

In [None]:
#gc.collect()

**BDTICM**: Absolute depth to bedrock cm

**BLDFIE**: Bulk density (fine earth) kg/m3

**CECSOL**: Cation Exchange Capacity of soil cmolc/kg

**CLYPPT**: Weight percentage of the clay particles (<0.0002 mm) percentage

**ORCDRC**: Soil organic carbon content permille

**PHIHOX**: pH index measured in water solution pH

**SLTPPT**: Weight percentage of the silt particles (0.0002–0.05 mm) percentage

**SNDPPT**: Weight percentage of the sand particles (0.05–2 mm) percentage

# We can use either extractor_bio or extractor_p

In [None]:
"""
extractor_p = PatchExtractor(DATA_PATH / "rasters", size=256)
extractor_p.append('bdticm')
extractor_p.append('bldfie')
extractor_p.append('cecsol')
extractor_p.append('clyppt')

extractor_p.append('phihox')
extractor_p.append('orcdrc')
extractor_p.append('sltppt')

#extractor_p.append('sndppt')
"""

In [None]:
#extractor_p.append('sndppt')

# Pedologic factors for Montpellier region, Fr

In [None]:
#fig = plt.figure(figsize=(10, 10))
#extractor_p.plot((43.61, 3.88), fig=fig)

# build dataset, dataloader

In [None]:
from albumentations import (
    HorizontalFlip, VerticalFlip, IAAPerspective, ShiftScaleRotate, CLAHE, RandomRotate90,
    Transpose, ShiftScaleRotate, Blur, OpticalDistortion, GridDistortion, HueSaturationValue,
    IAAAdditiveGaussianNoise, GaussNoise, MotionBlur, MedianBlur, IAAPiecewiseAffine, RandomResizedCrop,
    IAASharpen, IAAEmboss, RandomBrightnessContrast, Flip, OneOf, Compose, Normalize, Cutout, CoarseDropout, ShiftScaleRotate, CenterCrop, Resize
)

from albumentations.pytorch import ToTensorV2

def get_train_transforms():
    """Build the albumentations pipeline used for training samples.

    The pipeline applies random horizontal and vertical flips (each with
    probability 0.5), normalises three channels with fixed mean/std values
    (the usual ImageNet statistics) and converts the result to a tensor.
    Other augmentations (random resized crop, transpose, shift/scale/rotate,
    colour jitter, coarse dropout, cutout) are intentionally left out.

    Returns
    -------
    Compose
        The composed transform, always applied (p=1).
    """
    normalize = Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
        max_pixel_value=255.0,
        p=1.0,
    )
    steps = [
        HorizontalFlip(p=0.5),
        VerticalFlip(p=0.5),
        normalize,
        ToTensorV2(p=1.0),
    ]
    return Compose(steps, p=1.)
  
        
def get_valid_transforms():
    """Build the albumentations pipeline used for validation samples.

    The pipeline takes a 256x256 centre crop, normalises three channels with
    fixed mean/std values (the usual ImageNet statistics) and converts the
    result to a tensor. No random augmentation is applied.

    Returns
    -------
    Compose
        The composed transform, always applied (p=1).
    """
    steps = [
        CenterCrop(256, 256, p=1.),
        Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0,
        ),
        ToTensorV2(p=1.0),
    ]
    return Compose(steps, p=1.)

In [None]:
def load_patch(
    observation_id,
    patches_path,
    *,
    data="all",
    landcover_mapping=None,
    return_arrays=True
):
    """Loads the patch data associated to an observation id

    Parameters
    ----------
    observation_id : integer
        Identifier of the observation. Its first digit selects the region
        ("1" -> France, "2" -> US).
    patches_path : string / pathlib.Path
        Path to the folder containing all the patches.
    data : string or list of string
        Specifies what data to load, possible values: 'all', 'rgb', 'near_ir', 'landcover' or 'altitude'.
        A single string is treated as a one-element list.
    landcover_mapping : 1d array-like
        Facultative mapping of landcover codes, useful to align France and US codes.
    return_arrays : boolean
        If True, returns all the patches as Numpy arrays (no PIL.Image returned).

    Returns
    -------
    patches : list of 2d array-like objects
        The requested patches, always in the order RGB, Near-IR, altitude,
        landcover (types that were not requested are simply absent).

    Raises
    ------
    ValueError
        If the region cannot be derived from ``observation_id``.
    """
    observation_id = str(observation_id)

    # The leading digit of the id encodes the region
    region = {"1": "patches-fr", "2": "patches-us"}.get(observation_id[:1])
    if region is None:
        raise ValueError(
            "Incorrect 'observation_id' {}, can not extract region id from it".format(
                observation_id
            )
        )

    # Patches are sharded into two folder levels taken from the id's last digits
    subfolder1 = observation_id[-2:]
    subfolder2 = observation_id[-4:-2]
    filename = Path(patches_path) / region / subfolder1 / subfolder2 / observation_id

    if data == "all":
        data = ["rgb", "near_ir", "landcover", "altitude"]
    elif isinstance(data, str):
        # Avoid substring matching on a bare string such as "rgb"
        data = [data]

    def _load_image(suffix):
        # Imported here because the notebook never imports PIL at module level,
        # which made the original `Image.open` calls fail with a NameError.
        from PIL import Image

        image_path = filename.with_name(filename.stem + suffix)
        if not return_arrays:
            # The caller receives the (lazily loaded) PIL image and owns it
            return Image.open(image_path)
        # Read the pixels eagerly and release the file handle right away
        with Image.open(image_path) as image:
            return np.array(image)

    patches = []

    if "rgb" in data:
        patches.append(_load_image("_rgb.jpg"))

    if "near_ir" in data:
        patches.append(_load_image("_near_ir.jpg"))

    if "altitude" in data:
        altitude_filename = filename.with_name(filename.stem + "_altitude.tif")
        patches.append(tifffile.imread(altitude_filename))

    if "landcover" in data:
        landcover_filename = filename.with_name(filename.stem + "_landcover.tif")
        landcover_patch = tifffile.imread(landcover_filename)
        if landcover_mapping is not None:
            # Remap raw landcover codes through the provided lookup table
            landcover_patch = landcover_mapping[landcover_patch]
        patches.append(landcover_patch)

    return patches

In [None]:
#https://github.com/maximiliense/GLC/blob/master/data_loading/pytorch_dataset.py
class GeoLifeCLEF2022Dataset(Dataset):
    """Pytorch dataset handler for GeoLifeCLEF 2022 dataset.

    Adapted from
    https://github.com/maximiliense/GLC/blob/master/data_loading/pytorch_dataset.py

    Parameters
    ----------
    root : string or pathlib.Path
        Root directory of dataset.
    subset : string, either "train", "val", "train+val" or "test"
        Use the given subset ("train+val" is the complete training data).
    region : string, either "both", "fr" or "us"
        Load the observations of both France and US or only a single region.
    patch_data : string or list of string
        Specifies what type of patch data to load, possible values: 'all', 'rgb', 'near_ir', 'landcover' or 'altitude'.
    use_rasters : boolean (optional)
        If True, extracts patches from environmental rasters.
    patch_extractor : PatchExtractor object (optional)
        Patch extractor to use if rasters are used.
    transform : callable (optional)
        A function/transform that takes the sample tensor and returns a transformed version.
    target_transform : callable (optional)
        A function/transform that takes in the target and transforms it.
    nrows : integer or None (optional)
        Maximum number of rows read from each region's observation file
        (``None`` reads everything). Defaults to 50000.
    """

    def __init__(
        self,
        root,
        subset,
        *,
        region="both",
        patch_data="all",
        use_rasters=True,
        patch_extractor=None,
        transform=None,
        target_transform=None,
        nrows=50000
    ):
        self.root = Path(root)
        self.subset = subset
        self.region = region
        self.patch_data = patch_data
        self.transform = transform
        self.target_transform = target_transform

        possible_subsets = ["train", "val", "train+val", "test"]
        if subset not in possible_subsets:
            raise ValueError(
                "Possible values for 'subset' are: {} (given {})".format(
                    possible_subsets, subset
                )
            )

        possible_regions = ["both", "fr", "us"]
        if region not in possible_regions:
            raise ValueError(
                "Possible values for 'region' are: {} (given {})".format(
                    possible_regions, region
                )
            )

        # "train", "val" and "train+val" all live in the *_train.csv files
        self.training_data = subset != "test"
        subset_file_suffix = "train" if self.training_data else "test"

        # Only read the observation files of the requested region(s)
        region_codes = ["fr", "us"] if region == "both" else [region]
        frames = [
            pd.read_csv(
                self.root
                / "observations"
                / "observations_{}_{}.csv".format(code, subset_file_suffix),
                sep=";",
                index_col="observation_id",
                nrows=nrows,
            )
            for code in region_codes
        ]
        df = pd.concat(frames) if len(frames) > 1 else frames[0]

        # The training files carry a "subset" column separating train from val
        if self.training_data and subset != "train+val":
            df = df.loc[df["subset"] == subset]

        self.observation_ids = df.index
        self.coordinates = df[["latitude", "longitude"]].values

        if self.training_data:
            self.targets = df.species_id.values
        else:
            self.targets = None

        # FIXME: add back landcover one hot encoding?

        if use_rasters:
            if patch_extractor is None:
                patch_extractor = PatchExtractor(self.root / "rasters", size=256)
                patch_extractor.add_all_rasters()

            self.patch_extractor = patch_extractor
        else:
            self.patch_extractor = None

    def __len__(self):
        """Number of observations in the selected subset/region."""
        return len(self.observation_ids)

    def __getitem__(self, index):
        """Return the sample at ``index``.

        Returns ``(patches, target)`` for training data and ``patches`` alone
        for test data. Errors raised while loading the patches (for instance a
        ``ValueError`` for a malformed observation id) are propagated: they
        used to be swallowed, which only postponed the failure to an
        ``UnboundLocalError`` on the next line.
        """
        latitude = self.coordinates[index][0]
        longitude = self.coordinates[index][1]
        observation_id = self.observation_ids[index]

        patches = load_patch(observation_id, self.root, data=self.patch_data)
        # Stack the loaded patches into one float32 tensor (first axis = patch)
        patches = torch.as_tensor(np.asarray(patches), dtype=torch.float32)

        # Extracting patch from rasters
        if self.patch_extractor is not None:
            # All rasters held by the extractor at this (lat, long) position
            environmental_patches = self.patch_extractor[(latitude, longitude)]
            # NOTE(review): this is an element-wise (broadcasting) sum, not a
            # channel concatenation: with a single landcover patch the result
            # keeps the rasters' shape and the landcover values are added to
            # every raster. It is kept as is because the downstream model is
            # built for that channel count; switching to torch.cat would also
            # require changing the model's input channels.
            patches = patches + torch.as_tensor(
                np.asarray(environmental_patches), dtype=torch.float32
            )

        # Drop the leading axis when there is a single patch
        if len(patches) == 1:
            patches = patches[0]

        if self.transform:
            patches = self.transform(patches)

        if self.training_data:
            target = self.targets[index]

            if self.target_transform:
                target = self.target_transform(target)

            return patches, target
        else:
            return patches

#### note : landcover + 20 rasters

In [None]:

# Training subset for both regions: landcover patches combined with the
# rasters held by `extractor_bio`. No transform is applied
# (get_train_transforms() is available but currently disabled).
dataset = GeoLifeCLEF2022Dataset(
    DATA_PATH,
    subset="train",
    region="both",
    patch_data="landcover",
    use_rasters=True,
    transform=None,
    patch_extractor=extractor_bio,
)


In [None]:
# 80 % of the samples go to training, the remainder to validation
n_samples = len(dataset)
train_size = int(0.8 * n_samples)
val_size = n_samples - train_size

In [None]:
#len(np.unique(df_obs.species_id[:200].values))

In [None]:
#len(np.unique(dataset.targets))

In [None]:
# Seed the split so that the train/validation partition is the same on every
# "Restart & Run All" (it used to change on each run).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

train_loader = DataLoader(
    train_dataset, batch_size=16, num_workers=0, shuffle=True, drop_last=True
)
# Keep the last (possibly smaller) validation batch: dropping it silently
# excluded samples from the validation loss and accuracy.
val_loader = DataLoader(
    val_dataset, batch_size=16, num_workers=0, shuffle=False, drop_last=False
)

In [None]:
# Visual sanity check: show the first channel of (up to) 16 training samples,
# titled with their species id.
# `next(iter(...))` replaces `iter(...).next()`, which is not valid Python 3
# and is not available on current PyTorch DataLoader iterators.
image_patch, target = next(iter(train_loader))
print(image_patch.shape)
print(target.shape)

plt.figure(figsize=(10, 12))
# Never index past the end of the batch, whatever the batch size is
for i in range(min(16, len(image_patch))):
    ax = plt.subplot(4, 4, i + 1)
    plt.imshow(image_patch[i][0, :, :])
    plt.title(str(target[i].item()))
    plt.axis("off")
plt.tight_layout()


# Simple model

In [None]:
#df_obs_fr['species_id'][:50]

In [None]:
from torchvision.models.resnet import ResNet, BasicBlock

In [None]:
# Size of the classification head.
# NOTE(review): presumably the number of species labels — confirm against the data.
N_classes = 17036


class ResNetGeolife(ResNet):
    """torchvision ResNet (BasicBlock, layers [3, 4, 6, 3]) for multi-channel patches.

    The stock first convolution is replaced so that the network accepts
    ``in_channels`` input channels; the replacement uses stride 1 and a 7x7
    kernel with padding 3.

    Parameters
    ----------
    in_channels : int
        Number of channels of the input tensor (default 20).
    num_classes : int
        Number of output classes (default ``N_classes``).
    """

    def __init__(self, in_channels=20, num_classes=N_classes):
        super().__init__(BasicBlock, [3, 4, 6, 3], num_classes=num_classes)

        self.conv1 = nn.Conv2d(
            in_channels, 64, kernel_size=7, stride=1, padding=3, bias=False
        )


net = ResNetGeolife().to(device)

In [None]:
#len(dataset.targets)-1

In [None]:
#model =Resnet18(in_channels = 3, pretrained=False,  num_classes =17036)#len(np.unique(dataset.targets))-1)

In [None]:
# Pick the device, make sure the network lives on it, then build the optimizer.
# `model` and `net` are the same module object (Module.to returns the module).
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model = net.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def loss_fn(preds, labels):
    """Mean cross-entropy between raw class scores and integer class labels.

    Parameters
    ----------
    preds : torch.Tensor
        Unnormalised class scores (logits).
    labels : torch.Tensor
        Target class indices.

    Returns
    -------
    torch.Tensor
        Scalar loss tensor.
    """
    return F.cross_entropy(preds, labels)

In [None]:
#import torchvision.transforms as transforms

In [None]:
#https://www.kaggle.com/code/drcapa/esc-50-eda-pytorch
def _standardize_batch(inputs, eps=1e-8):
    """Cast a batch to float and standardise it with its own mean and std."""
    inputs = inputs.float()
    # Clamp the std so a constant batch does not turn into NaNs
    return (inputs - inputs.mean()) / inputs.std().clamp_min(eps)


def train(model, optimizer,  train_loader, val_loader, epochs=2, device='cpu'):
    """Train ``model`` and evaluate it on ``val_loader`` after every epoch.

    Adapted from https://www.kaggle.com/code/drcapa/esc-50-eda-pytorch

    Each batch is standardised with its own mean/std before the forward pass.
    The same cross-entropy criterion (``ignore_index=-1``) is used for
    training and validation. Reported losses are per-sample averages over the
    samples actually seen (batch-mean loss weighted by batch size), not a sum
    of batch means divided by the dataset size.

    Parameters
    ----------
    model : torch.nn.Module
        Network to optimise; expected to already live on ``device``.
    optimizer : torch.optim.Optimizer
        Optimiser bound to ``model``'s parameters.
    train_loader, val_loader : iterable of (inputs, targets)
        Batches for training and validation.
    epochs : int
        Number of passes over ``train_loader``.
    device : str or torch.device
        Device the batches are moved to.

    Returns
    -------
    list of dict
        One entry per epoch with keys ``epoch``, ``train_loss``,
        ``valid_loss`` and ``accuracy``.
    """
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    history = []

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss_sum = 0.0
        train_seen = 0
        for inputs, targets in train_loader:
            inputs = _standardize_batch(inputs).to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()

            batch_size = inputs.size(0)
            train_loss_sum += loss.item() * batch_size
            train_seen += batch_size
        training_loss = train_loss_sum / max(train_seen, 1)

        # ---- validation pass (no gradients needed) ----
        model.eval()
        valid_loss_sum = 0.0
        num_correct = 0
        num_examples = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = _standardize_batch(inputs).to(device)
                targets = targets.to(device)

                output = model(inputs)
                batch_size = inputs.size(0)
                valid_loss_sum += criterion(output, targets).item() * batch_size
                # argmax of the logits equals argmax of their softmax
                num_correct += (output.argmax(dim=1) == targets).sum().item()
                num_examples += batch_size
        valid_loss = valid_loss_sum / max(num_examples, 1)
        accuracy = num_correct / num_examples if num_examples else 0

        print('Epoch: {}, Training Loss: {:.2f}, Validation Loss: {:.2f}, '
              'accuracy = {:.2f}'.format(epoch+1, training_loss, valid_loss, accuracy))
        history.append({
            "epoch": epoch + 1,
            "train_loss": training_loss,
            "valid_loss": valid_loss,
            "accuracy": accuracy,
        })

    return history


In [None]:
# Force a garbage-collection pass before starting the training run
gc.collect()

In [None]:
# Run two training epochs, validating after each one
train(model.to(device), optimizer, train_loader, val_loader, epochs=2, device=device)

Let's find the cause of the error above

In [None]:
# Debugging aid: look up one observation by id (other id tried: 20050902).
# The result is only displayed if this line is the last expression of the cell.
df_obs_train[df_obs_train.index==20065615] #20065615 #20050902

fig = plt.figure(figsize=(14, 10))
# Rasters at the position of observation 20065615:
#extractor_bio.plot((40.87572 , -124.07787), fig=fig) #20065615
# Rasters at the position of observation 20064477
extractor_bio.plot((40.856445, -124.097336), fig = fig) #20064477 

# Rasters at the position of observation 10228153:
#extractor_bio.plot((51.017408, 2.133926), fig=fig) # obs id 10228153

In [None]:
# Persist the trained weights (state dict only) next to the notebook
PATH = './torchvision_resnet_lc_envrasters.bin'
trained_weights = net.state_dict()
torch.save(trained_weights, PATH)

# Loading test patch

In [None]:
# Test subset for both regions, built with the same inputs as the training
# dataset: landcover patches combined with the rasters of `extractor_bio`.
test_dataset = GeoLifeCLEF2022Dataset(
    DATA_PATH,
    subset="test",
    region="both",
    patch_data="landcover",
    use_rasters=True,
    transform=None,
    patch_extractor=extractor_bio,
)

In [None]:
# One test sample per batch, in dataset order
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
#test_loader.sampler.num_samples

In [None]:
# Visual sanity check of the first test sample: show (up to) its first 16
# channels. The test loader yields one sample per batch, so iterating over 16
# batch entries (as before) ran past the end of the batch.
# `next(iter(...))` replaces `iter(...).next()`, which is not valid Python 3.
image_patch = next(iter(test_loader))
print(image_patch.shape)

# assumes the batch is laid out as (batch, channel, height, width) — TODO confirm
sample = image_patch[0]

plt.figure(figsize=(10, 12))
for i in range(min(16, sample.shape[0])):
    ax = plt.subplot(4, 4, i + 1)
    plt.imshow(sample[i])
    plt.title("channel {}".format(i))
    plt.axis("off")
plt.tight_layout()



In [None]:
def test_inference (model, dl):
  correct_prediction = 0
  total_prediction = 0
  PREDS = []
  #LABELS = []
  model.eval()
  # Disable gradient updates
  with torch.no_grad():
    for data in tqdm(iter(dl)):
      # Get the input features , and put them on the GPU
      inputs = data[0]
      inputs = inputs.float()
      # Normalize the inputs
      #inputs_m, inputs_s = inputs.mean(), inputs.std()
      #inputs = (inputs - inputs_m) / inputs_s
      #print (inputs.shape)
      inputs = np.repeat(inputs[..., np.newaxis], 3, -1)
      inputs = inputs.unsqueeze(0)
      #print (inputs.shape)
      if inputs.size(1) > 3:
        inputs = inputs.permute(0, 3, 1,2)

        inputs = inputs.to(device)
      # Get predictions
      outputs = model(inputs)

      # Get the predicted class with the highest score
      _, prediction = torch.max(outputs,1)
      PREDS.append(prediction.view(-1).cpu().detach().numpy())


      
  PREDS = np.concatenate(PREDS)
  #LABELS = np.concatenate(LABELS)
  
  #preds_df = pd.DataFrame({'song_id':LABELS, 'genre_id':PREDS})
  return (PREDS)

In [None]:
# Optional: reload saved weights before predicting
#model.to(device)
#model.load_state_dict(torch.load(file))
#print (f'Predicting test set using weight ....  {file}')
# Top-1 prediction for every sample of the test loader
preds = test_inference(model, test_loader)

In [None]:
#first_30_species = np.arange(30)
#s_pred = np.tile(first_30_species[None], (len(df_obs_test), 1))

In [None]:
# Load a capped sample (first 10000 rows per region) of the test observations.
# NOTE(review): the dataset class reads up to 50000 rows per region, so this
# frame may not line up with the predictions — confirm before submitting.
observations_dir = DATA_PATH / "observations"
df_obs_fr_test = pd.read_csv(
    observations_dir / "observations_fr_test.csv",
    sep=";",
    index_col="observation_id",
    nrows=10000,
)
df_obs_us_test = pd.read_csv(
    observations_dir / "observations_us_test.csv",
    sep=";",
    index_col="observation_id",
    nrows=10000,
)

df_obs_test = pd.concat((df_obs_fr_test, df_obs_us_test))

obs_id_test = df_obs_test.index.values

print(f"Number of observations for testing: {len(df_obs_test)}")

df_obs_test.head()

In [None]:
from GLC.submission import generate_submission_file
help(generate_submission_file)

In [None]:
# Compute baseline on the test set
#s_pred = batch_predict(predict_func, X_test, batch_size=1024)

# Generate the submission file
#generate_submission_file(SUBMISSION_PATH / "random_forest_on_environmental_vectors.csv", df_obs_test.index, s_pred)

* Version 1 : near_ir
* Version 2 : landcover
* Version 3 : altitude   10000 images, 80-20 split, 2 epochs, nn.CrossEntropyLoss, lr = 0.001
* Version 4 : rasters - bioclimatic + pedologic 
* version 5 : - error
