In [1]:
from pathlib import Path

In [2]:
import pandas as pd

In [3]:
from sklearn.impute import SimpleImputer

In [4]:
import numpy as np

In [5]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

In [6]:
from models.pl_trainer.PLTrainer import EnvVectorModelTrainer

In [7]:
from models.pl_trainer.PLTrainer import ResNetBasedModelTrainer

In [8]:
from sklearn.preprocessing import LabelEncoder

## Global Variables ##

In [9]:
# Root folder of the GeoLifeCLEF 2022 data, given relative to the working directory.
# Every CSV read below (observations, metadata, pre-extracted) is resolved against it.
DATA_PATH = Path("./data/geolifeclef-2022-lifeclef-2022-fgvc9/")

In [10]:
# Echo the configured dataset root so the normalised path is visible in the output.
DATA_PATH

PosixPath('data/geolifeclef-2022-lifeclef-2022-fgvc9')

## Load Data ##

#### Training Data ####

In [11]:
# Training observations ship as one CSV per region (`fr` and `us`);
# read both and stack them into a single frame.
_train_obs_dir = DATA_PATH / "observations"
df_obs_fr = pd.read_csv(_train_obs_dir / "observations_fr_train.csv", sep=";")
df_obs_us = pd.read_csv(_train_obs_dir / "observations_us_train.csv", sep=";")

# NOTE: the original row labels are kept, so index values repeat across regions.
df_obs = pd.concat([df_obs_fr, df_obs_us])

print("Number of observations for training: {}".format(len(df_obs)))

df_obs.head()

Number of observations for training: 1627475


Unnamed: 0,observation_id,latitude,longitude,species_id,subset
0,10561949,45.705116,1.424622,241,train
1,10131188,45.146973,6.416794,101,train
2,10799362,46.783695,-2.072855,700,train
3,10392536,48.604866,-2.825003,1456,train
4,10335049,48.815567,-0.161431,157,train


In [12]:
# Row counts per value of the `subset` column (train / val).
df_obs["subset"].value_counts()

train    1587395
val        40080
Name: subset, dtype: int64

In [13]:
# Number of distinct species ids present in the training observations.
len(df_obs["species_id"].unique())

17037

#### Test Data ####

In [14]:
# Test observations follow the same per-region layout as the training files.
_test_obs_dir = DATA_PATH / "observations"
df_obs_fr_test = pd.read_csv(_test_obs_dir / "observations_fr_test.csv", sep=";")
df_obs_us_test = pd.read_csv(_test_obs_dir / "observations_us_test.csv", sep=";")

# Stack both regions; as the preview shows, these rows carry no species_id column.
df_obs_test = pd.concat([df_obs_fr_test, df_obs_us_test])

print("Number of observations for testing: {}".format(len(df_obs_test)))

df_obs_test.head()

Number of observations for testing: 36421


Unnamed: 0,observation_id,latitude,longitude
0,10782781,43.601788,6.940195
1,10364138,46.241711,0.683586
2,10692017,45.181095,1.533459
3,10222322,46.93845,5.298678
4,10241950,45.017433,0.960736


#### Species Info ####

In [15]:
# Species metadata table (semicolon-separated), one row per species_id.
df_species = pd.read_csv(
    DATA_PATH.joinpath("metadata", "species_details.csv"),
    sep=";",
)

In [16]:
# Preview the first rows of the species metadata table.
df_species.head()

Unnamed: 0,species_id,GBIF_species_id,GBIF_species_name,GBIF_genus_name,GBIF_family_name,GBIF_kingdom_name
0,0,5356345,Laburnum anagyroides,Laburnum,Fabaceae,Plantae
1,1,2874515,Cucurbita maxima,Cucurbita,Cucurbitaceae,Plantae
2,2,2874569,Cucumis sativus,Cucumis,Cucurbitaceae,Plantae
3,3,3033363,Ranunculus acris,Ranunculus,Ranunculaceae,Plantae
4,4,3034830,Heracleum sphondylium,Heracleum,Apiaceae,Plantae


#### Env Vectors ####

In [17]:
# Pre-extracted environmental vectors, keyed by observation_id.
df_env = pd.read_csv(
    DATA_PATH.joinpath("pre-extracted", "environmental_vectors.csv"),
    sep=";",
)
df_env.head()

Unnamed: 0,observation_id,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,bio_9,...,bio_18,bio_19,bdticm,bldfie,cecsol,clyppt,orcdrc,phihox,sltppt,sndppt
0,10000000,1.420833,6.908333,29.272598,614.1493,15.1,-8.5,23.6,-1.0,9.183333,...,248.0,358.0,2082.0,988.0,29.0,13.0,63.0,62.0,34.0,53.0
1,10000001,8.8375,9.858334,37.771393,586.8139,23.8,-2.3,26.099998,6.016667,16.383333,...,226.0,288.0,1816.0,1142.0,20.0,22.0,39.0,58.0,41.0,36.0
2,10000002,6.241667,8.35,32.239384,632.8609,21.0,-4.9,25.9,3.033333,14.2,...,268.0,317.0,1346.0,1075.0,29.0,22.0,54.0,59.0,40.0,38.0
3,10000003,12.554167,9.525001,40.189877,541.80865,25.9,2.2,23.699999,6.85,19.35,...,157.0,257.0,1227.0,1383.0,21.0,28.0,18.0,71.0,46.0,25.0
4,10000004,8.029167,10.075,36.636364,633.0175,23.7,-3.8,27.5,4.616667,16.083334,...,214.0,280.0,2833.0,1202.0,24.0,25.0,33.0,69.0,38.0,37.0


In [18]:
# Feature columns used as tabular model input: every df_env column except the
# first one (the observation_id key), followed by the two coordinate columns
# that come from the observation tables.
env_columns = list(df_env.columns[1:]) + ['latitude', 'longitude']

#### Landcover Mapping ####

In [19]:
# Suggested landcover code alignment table; only the suggested code column is
# kept, as a plain array, for the model's data pipeline.
df_suggested_landcover_alignment = pd.read_csv(
    DATA_PATH.joinpath("metadata", "landcover_suggested_alignment.csv"),
    sep=";",
)
landcover_mapping = df_suggested_landcover_alignment.loc[:, "suggested_landcover_code"].values

## Prepare Data ##

In [20]:
# Split the labelled observations using the `subset` column.
_is_train = df_obs["subset"].eq("train")
_is_val = df_obs["subset"].eq("val")
train_df = df_obs[_is_train]
val_df = df_obs[_is_val]

In [21]:
# df_obs_test[df_obs_test.observation_id.isin(df_env[df_env.isnull().any(axis=1)].observation_id.unique())]

In [22]:
# Mean-impute missing environmental values, column by column.
# NOTE(review): the means are taken over every row of df_env, not only the
# training observations - confirm this is intended.
_env_column_means = df_env.mean(axis=0)
df_env = df_env.fillna(value=_env_column_means)

In [23]:
# Join each split with its environmental vector on observation_id.
# `validate="many_to_one"` makes pandas raise if df_env holds more than one row
# per observation_id, which would otherwise silently duplicate observations.
_n_train_before, _n_val_before = len(train_df), len(val_df)

train_df = train_df.merge(df_env, on='observation_id', validate='many_to_one')
val_df = val_df.merge(df_env, on='observation_id', validate='many_to_one')
test_df = df_obs_test.merge(df_env, on='observation_id', validate='many_to_one')

# merge() defaults to an inner join, so observations without a row in df_env
# are dropped without any warning. Fail loudly instead of training on, or
# predicting for, a silently reduced set.
assert len(train_df) == _n_train_before, "train observations lost while merging env vectors"
assert len(val_df) == _n_val_before, "val observations lost while merging env vectors"
assert len(test_df) == len(df_obs_test), "test observations lost while merging env vectors"

In [24]:
# Show any test rows that still contain a missing value (an empty frame means none).
test_df[test_df.isna().any(axis="columns")]

Unnamed: 0,observation_id,latitude,longitude,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,...,bio_18,bio_19,bdticm,bldfie,cecsol,clyppt,orcdrc,phihox,sltppt,sndppt


In [25]:
# Fit the encoder on the species ids of all labelled observations (train and
# val together) so both splits share one label space.
_all_species_ids = df_obs["species_id"].values
label_encoder = LabelEncoder()
label_encoder.fit(_all_species_ids)

LabelEncoder()

In [26]:
# Shape of the fitted class array, i.e. how many distinct species ids the encoder learned.
label_encoder.classes_.shape

(17037,)

In [27]:
# X_train = train_df[env_columns].values
# X_val = val_df[env_columns].values
# X_test = test_df[env_columns].values

# y_train = label_encoder.transform(train_df["species_id"].values)
# y_val = label_encoder.transform(val_df["species_id"].values)

# n_val = len(val_df)
# print("Validation set size: {} ({:.1%} of train observations)".format(n_val, n_val / len(df_obs)))

In [28]:
# Replace the raw species ids with the encoder's class indices, in place.
# NOTE: not idempotent - running this cell twice re-encodes values that are
# already encoded.
for _split_df in (train_df, val_df):
    _split_df["species_id"] = label_encoder.transform(_split_df["species_id"].values)

# The share is computed against all rows of df_obs (train and val subsets together).
n_val = len(val_df)
_val_share = n_val / len(df_obs)
print("Validation set size: {} ({:.1%} of train observations)".format(n_val, _val_share))

Validation set size: 40080 (2.5% of train observations)


In [29]:
# Inspect the merged training frame: observation columns followed by the environmental features.
train_df.head()

Unnamed: 0,observation_id,latitude,longitude,species_id,subset,bio_1,bio_2,bio_3,bio_4,bio_5,...,bio_18,bio_19,bdticm,bldfie,cecsol,clyppt,orcdrc,phihox,sltppt,sndppt
0,10561949,45.705116,1.424622,241,train,11.229167,8.724999,37.286324,556.81506,24.5,...,211.0,287.0,1678.0,1381.0,13.0,20.0,26.0,58.0,36.0,44.0
1,10131188,45.146973,6.416794,101,train,4.5875,9.058333,33.302696,664.6022,19.9,...,265.0,362.0,1771.0,1219.0,28.0,18.0,49.0,61.0,38.0,45.0
2,10799362,46.783695,-2.072855,700,train,12.625,6.65,34.635418,466.4396,23.4,...,137.0,253.0,1808.0,1351.0,21.0,24.0,31.0,72.0,34.0,42.0
3,10392536,48.604866,-2.825003,1456,train,11.579166,6.525,37.5,406.30518,21.4,...,160.0,282.0,1667.0,1277.0,18.0,22.0,24.0,63.0,37.0,40.0
4,10335049,48.815567,-0.161431,157,train,10.295834,7.475,36.286407,487.0713,22.1,...,152.0,235.0,1379.0,1321.0,19.0,25.0,26.0,65.0,56.0,20.0


#### Fill missing values ####

In [30]:
# imp = SimpleImputer(
#     missing_values=np.nan,
#     strategy="constant",
#     fill_value=np.finfo(np.float32).min,
# )
# imp.fit(X_train)

In [31]:
# X_train = imp.transform(X_train)
# X_val = imp.transform(X_val)
# X_test = imp.transform(X_test)

## Training ##

In [32]:
# Training hyper-parameters.
EPOCHS = 5
BATCHSIZE = 4 * 32
# One output class per label known to the fitted encoder.
NUM_CLASSES = label_encoder.classes_.shape[0]

In [33]:
# Display the number of output classes.
NUM_CLASSES

17037

In [34]:
# Number of tabular feature columns; passed to the model below as `input_dim`.
INPUT_DIM = len(env_columns)

In [35]:
# Keep the two best checkpoints, ranked by the monitored validation top-30
# error (lower is better).
#
# The filename placeholder must use the exact key of the monitored metric.
# Lightning substitutes 0 for any placeholder that is not a logged metric, so
# the previous `{top_30_error:.2f}` always rendered as 0.00. Because the real
# key contains "/", automatic "name=" insertion is switched off and the labels
# are written out by hand, which keeps "/" out of the checkpoint file name.
# NOTE(review): assumes the module logs the metric under exactly the monitored
# key - confirm in models/pl_trainer/PLTrainer.
checkpoint_callback = ModelCheckpoint(
    monitor="valid/epoch/top_30_error",
    dirpath="checkpoints",
    filename="resnet50-env-vector-mlp-epoch={epoch:02d}-top_30_error={valid/epoch/top_30_error:.2f}",
    auto_insert_metric_name=False,
    save_top_k=2,
    mode="min",
)



In [36]:
# model = EnvVectorModelTrainer(dropout_mlp=0.1,
#                       input_dim=INPUT_DIM,
#                       n_classes = NUM_CLASSES,
#                       hidden_dims=[512, 256, 128],
#                       l_rate=0.01,
#                       batch_size=BATCHSIZE,
#                       X_train = X_train,
#                       y_train = y_train,
#                       X_val = X_val,
#                       y_val = y_val,
#                       num_workers = 6
#                       )

In [37]:
# Build the Lightning module that combines ResNet image features with the
# environmental vector. The printed architecture shows the first linear layer
# taking res_net_out_dim + INPUT_DIM inputs.
model = ResNetBasedModelTrainer(
    # architecture
    res_net_out_dim=1000,
    input_dim=INPUT_DIM,
    hidden_dims=[512, 256],
    n_classes=NUM_CLASSES,
    dropout_mlp=0.1,
    # optimisation
    l_rate=0.01,
    batch_size=BATCHSIZE,
    # data
    train_df=train_df,
    val_df=val_df,
    env_vec_columns=env_columns,
    landcover_mapping=landcover_mapping,
    data_path=DATA_PATH,
    num_workers=15,
)

Using cache found in /home/rohit/.cache/torch/hub/pytorch_vision_v0.10.0
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /home/rohit/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [38]:
# Print the module tree to check the layer sizes.
print(model)

ResNetBasedModelTrainer(
  (model): ResnetBasedModel(
    (mlp_nn): Sequential(
      (0): Linear(in_features=1029, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=256, bias=True)
      (3): ReLU()
      (4): Linear(in_features=256, out_features=17037, bias=True)
    )
    (m): LogSoftmax(dim=1)
    (res_model): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=

In [39]:
# Single-GPU trainer that runs for EPOCHS epochs and saves checkpoints through
# checkpoint_callback.
# NOTE(review): newer PyTorch Lightning releases replace `gpus=` with
# `accelerator=`/`devices=` - revisit when upgrading.
trainer = Trainer(
    gpus=1,
    max_epochs=EPOCHS,
    callbacks=[checkpoint_callback],
    fast_dev_run=False,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Start training. No dataloaders are passed here; presumably the module builds
# them from the train_df/val_df it was given - confirm in PLTrainer.
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | model     | ResnetBasedModel | 30.6 M
1 | criterion | NLLLoss          | 0     
-----------------------------------------------
30.6 M    Trainable params
0         Non-trainable params
30.6 M    Total params
122.377   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Top-30 error rate: 98.3% Validation Set


Training: 0it [00:00, ?it/s]

In [None]:
# Completion marker for a top-to-bottom run of the notebook.
print('done')