In this notebook I try to predict the bird species using only the metadata.

This is an experiment I did out of curiosity, so it is not meant to improve the leaderboard (LB) ranking.

In [None]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm._tqdm_notebook import tqdm_notebook
from matplotlib import pyplot as plt

Load the metadata.

In [None]:
# Location of the competition's training metadata table.
train_metadata_path = "/kaggle/input/birdclef-2021/train_metadata.csv"
train_metadata_df = pd.read_csv(train_metadata_path)

Build the target labels.

In [None]:
# Give every distinct primary label an integer id, in order of first appearance.
unique_labels = train_metadata_df["primary_label"].unique()
label_dic = dict(zip(unique_labels, range(len(unique_labels))))
n_labels = len(label_dic)
# Integer-encoded target vector, one entry per metadata row.
y_train = train_metadata_df["primary_label"].map(label_dic).values

Convert the date column into year / month / day features.

The year contains erroneous values such as '0199'.


In [None]:
def check_year(year):
    """Convert a year string to an int, mapping invalid values to -1.

    A year is accepted only when it is a string of exactly four decimal
    digits beginning with "19" or "20".

    Parameters
    ----------
    year : str or None
        Year component of a date string.

    Returns
    -------
    int
        The parsed year, or -1 when the value is missing or malformed
        (there are mistaken labels like 0000, 0201, 0199, ...).
    """
    # Missing values (None / NaN) are not strings and cannot be sliced.
    if not isinstance(year, str):
        return -1
    # Requiring four digits rejects truncated values such as "199" or "19",
    # which would otherwise pass the prefix test below.
    if len(year) == 4 and year.isdecimal() and year[:2] in ("19", "20"):
        return int(year)
    return -1

In [None]:
# Split each date string on "-" into its year / month / day parts.
date_parts = train_metadata_df["date"].str.split("-").tolist()
date_df = pd.DataFrame(date_parts, columns=["year", "month", "day"])
# Month and day are cast straight to int; the year goes through check_year,
# which maps invalid values to -1.
for date_col in ("month", "day"):
    date_df[date_col] = date_df[date_col].astype(int)
date_df["year"] = date_df["year"].map(check_year)
date_df.head()

Convert the time column into hour / minute features.

It contains erroneous values such as 'xx'.

In [None]:
def check_hhmm(m):
    """Convert an hour or minute token to an int, mapping invalid values to -1.

    Any "am" / "pm" marker is stripped before parsing. The marker is only
    removed, not interpreted: no 12-hour offset is applied.

    Parameters
    ----------
    m : str or None
        One ":"-separated component of a time string.

    Returns
    -------
    int
        The parsed number, or -1 when the token is missing or is not an
        integer (e.g. "?", "xx", "night", "dawn", "").
    """
    # Missing components arrive as None (or NaN), which have no string methods.
    if not isinstance(m, str):
        return -1
    m = m.lower().replace("am", "").replace("pm", "")
    try:
        return int(m)
    except ValueError:
        # Any placeholder or free-text token falls back to the -1 sentinel,
        # so an unforeseen value cannot abort the whole column conversion.
        return -1

In [None]:
# Split each time string on ":" into hour / minute / second parts.
time_parts = train_metadata_df["time"].str.split(":").tolist()
time_df = pd.DataFrame(time_parts, columns=["hour", "minute", "second"])
# Only hour and minute are kept as features; both go through check_hhmm.
for time_col in ("hour", "minute"):
    time_df[time_col] = time_df[time_col].map(check_hhmm)
time_df = time_df.drop(columns="second")
time_df.head()

Frequent authors (more than 100 recordings) each get their own ID as a feature.

All other authors are grouped together under -1.

In [None]:
# Number of metadata rows per author.
author_counts = train_metadata_df["author"].value_counts()
# Authors with more than 100 rows each get their own integer id.
is_frequent = author_counts > 100
frequent_author = {
    name: idx for idx, name in enumerate(author_counts[is_frequent].index)
}
# Every author without an id of their own collapses into the shared id -1.
author_ids = train_metadata_df["author"].map(frequent_author).fillna(-1)
author_df = pd.DataFrame(author_ids.values, columns=["author_id"])
author_df.head()

Assemble the feature table.

In [None]:
# Metadata columns used as-is, without any conversion.
org_features = ["latitude", "longitude", "rating"]
# Place the raw columns next to the derived date, time and author features.
feature_frames = [train_metadata_df[org_features], date_df, time_df, author_df]
feature_df = pd.concat(feature_frames, axis="columns")
feature_df.head()

In [None]:
# Feature names, in the same order as the columns of X_train.
features = list(feature_df.columns)
X_train = feature_df.to_numpy()

Train with LightGBM.

In [None]:
# The hyper-parameters are the same for every fold, so build them once.
param = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'num_class': n_labels,
    'verbosity': -1,
    'boosting_type': 'gbdt',
}

# Out-of-fold predicted class ids, filled in fold by fold.
oof = np.zeros(len(y_train), dtype=int)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=416)
for train_index, valid_index in skf.split(X_train, y_train):

    dtrain = lgb.Dataset(X_train[train_index, :], label=y_train[train_index])
    d_eval = lgb.Dataset(X_train[valid_index, :], label=y_train[valid_index])

    # Early stopping is passed as a callback: the `early_stopping_rounds`
    # keyword of `lgb.train` was removed in LightGBM 4.0, while the callback
    # form is accepted by both older and newer releases.
    model = lgb.train(param,
                      dtrain,
                      valid_sets=[d_eval],
                      callbacks=[lgb.early_stopping(stopping_rounds=10)])

    # predict() returns one probability per class; keep the most likely class.
    pred_y = model.predict(X_train[valid_index, :])
    oof[valid_index] = pred_y.argmax(1)

print("---------------------------")
score = f1_score(y_train, oof, average='micro')
print(f"F1 micro = {score:0.4}")