# GeoLifeCLEF2022 - Exploratory Data Analysis

Ongoing exploratory data analysis (work in progress).

In [None]:
%pylab inline --no-import-all

from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap

In [None]:
# Fetch the GLC helper repository that later cells import from (GLC.plotting, GLC.data_loading).
# Any existing GLC directory is deleted first, then the repository is cloned again.
!rm -rf GLC
!git clone https://github.com/maximiliense/GLC

In [None]:
# Root directory of the competition data; every file read in this notebook is resolved relative to it.
DATA_PATH = Path("../input/geolifeclef-2022-lifeclef-2022-fgvc9/")

# Tools

In [None]:
from GLC.plotting import plot_map

def plot_observations_distribution(ax, df_obs, df_obs_test=None, **kwargs):
    """Scatter observation coordinates onto ``ax``.

    ``df_obs`` is drawn in blue. When ``df_obs_test`` is given, it is drawn
    afterwards in red. Both inputs must expose ``longitude`` and ``latitude``
    attributes.

    Extra keyword arguments are forwarded to ``ax.scatter`` and take
    precedence over the defaults ``zorder=1``, ``alpha=0.1`` and ``s=0.5``.
    """
    # Caller-supplied options win over the defaults.
    scatter_opts = {"zorder": 1, "alpha": 0.1, "s": 0.5, **kwargs}

    layers = [(df_obs, "blue")]
    if df_obs_test is not None:
        layers.append((df_obs_test, "red"))

    for frame, colour in layers:
        ax.scatter(frame.longitude, frame.latitude, color=colour, **scatter_opts)

# Observations

In [None]:
# Read the four observation tables (FR/US x train/test), indexed by observation_id,
# then stack the two regions into one training frame and one test frame.
obs_dir = DATA_PATH / "observations"
obs_csv_options = {"sep": ";", "index_col": "observation_id"}

df_obs_fr = pd.read_csv(obs_dir / "observations_fr_train.csv", **obs_csv_options)
df_obs_us = pd.read_csv(obs_dir / "observations_us_train.csv", **obs_csv_options)
df_obs_fr_test = pd.read_csv(obs_dir / "observations_fr_test.csv", **obs_csv_options)
df_obs_us_test = pd.read_csv(obs_dir / "observations_us_test.csv", **obs_csv_options)

df_obs = pd.concat([df_obs_fr, df_obs_us])
df_obs_test = pd.concat([df_obs_fr_test, df_obs_us_test])

print(f"Number of observations for training: {len(df_obs)}")
print(f"Number of observations for testing: {len(df_obs_test)}")

df_obs.head()

# Target Distribution

In [None]:
# Number of training observations per species_id, most frequent first.
freq = df_obs.species_id.value_counts()
prop_filter = 0.95
top_k = 30

# Cumulative share of all training observations covered by the i most frequent species.
cum_share = freq.cumsum() / len(df_obs)

print(f'{len(freq)} unique species')

print('top 5 species:')
print(freq.head())

print(f'Top {top_k} proportion of total: {cum_share.values[top_k-1]:.2%}')
print(f'{(cum_share<prop_filter).mean():.2%} targets cumulates to {prop_filter:.2%} of observations')

plt.plot(cum_share.values);
plt.xlabel('number of most frequent species');
plt.ylabel('cumulative share of observations');
# Horizontal marker at the prop_filter threshold.
plt.axhline(y=prop_filter, color='r', linestyle='-');
# Vertical marker at the top_k cut-off; it follows top_k instead of a hard-coded 30.
plt.axvline(x=top_k, color='k', linestyle='-');

# Target Types

In [None]:
# Species metadata table from the competition's metadata directory.
species_details = pd.read_csv(DATA_PATH / "metadata" / "species_details.csv", sep=";")
# Attach the number of training observations per species. The counts are computed
# from df_obs here rather than read from the global `freq`, because `freq` is
# rebound to per-family counts further down; re-running this cell afterwards
# would silently map the wrong table.
species_details['count'] = species_details.species_id.map(df_obs.species_id.value_counts())

In [None]:
# Preview the species metadata, including the `count` column added in the previous cell.
species_details.head()

# Kingdoms

In [None]:
# Sum of the per-species observation counts within each GBIF kingdom, shown as a pie chart.
gb = species_details.groupby('GBIF_kingdom_name')['count'].sum()
kingdom_totals = gb.to_numpy().ravel().astype(int)
plt.pie(kingdom_totals, labels=gb.index);

# Families

In [None]:
# Sum of the per-species observation counts within each GBIF family.
gb = species_details.groupby('GBIF_family_name')['count'].sum()

# Stored under its own name: rebinding `freq` here would overwrite the per-species
# counts computed in the "Target Distribution" section and make earlier cells
# misbehave when re-run.
family_freq = gb.sort_values(ascending=False)
prop_filter = 0.95
top_k = 30

# Cumulative share of all training observations covered by the i largest families.
family_cum_share = family_freq.cumsum() / len(df_obs)

print(f'{len(family_freq)} unique families')
print('top 5 families:')
print(family_freq.head())

print(f'Top {top_k} families proportion of total: {family_cum_share.values[top_k-1]:.2%}')
print(f'{(family_cum_share<prop_filter).mean():.2%} families cumulates to {prop_filter:.2%} of observations')

plt.plot(family_cum_share.values);
plt.xlabel('number of largest families');
plt.ylabel('cumulative share of observations');
plt.axhline(y=prop_filter, color='r', linestyle='-');
plt.axvline(x=top_k, color='k', linestyle='-');

Lots of plants at the top.

# Unique Species

In [None]:
specie_id = 5045

# All US training observations are drawn in blue; those of the selected species are overlaid in red.
is_selected_species = df_obs_us.species_id == specie_id
df_obs_us_selected = df_obs_us[is_selected_species]

fig = plt.figure(figsize=(10, 5.5))
ax = plot_map(region="us")
plot_observations_distribution(ax, df_obs_us, df_obs_us_selected)
ax.set_title(f"Observations distribution (US) - specie {specie_id}")

This species does not appear in France, so there seems to be a strong geographic correlation.

# patch data

The patch data is voluminous and only accessible through a dedicated loading function.
It might need some work before being usable.

In [None]:
from GLC.data_loading.common import load_patch
from GLC.plotting import visualize_observation_patch

# Observation whose patch is inspected; named once so both load_patch calls stay in sync.
PATCH_OBSERVATION_ID = 10171444

# First load: raw patch, used only to report how many arrays it holds and their shapes/dtypes.
patch = load_patch(PATCH_OBSERVATION_ID, DATA_PATH)
print("Number of data sources: {}".format(len(patch)))
print("Arrays shape: {}".format([p.shape for p in patch]))
print("Data types: {}".format([p.dtype for p in patch]))


df_suggested_landcover_alignment = pd.read_csv(DATA_PATH / "metadata" / "landcover_suggested_alignment.csv", sep=";")
# A bare `.head()` in the middle of a cell is evaluated and thrown away; print it so the preview is shown.
print(df_suggested_landcover_alignment.head())

# Second load: same observation, with the suggested land-cover code mapping passed to load_patch.
landcover_mapping = df_suggested_landcover_alignment["suggested_landcover_code"].values
patch = load_patch(PATCH_OBSERVATION_ID, DATA_PATH, landcover_mapping=landcover_mapping)

# patch[0] is indexed with three axes; the last axis is split into the planes named R, G, B.
R = patch[0][:,:,0]
G = patch[0][:,:,1]
B = patch[0][:,:,2]
# The remaining entries are kept whole under the names used in this notebook.
IR = patch[1]
Altitude = patch[2]
land_cover = patch[3]

# Environmental rasters


In [None]:
# Pre-extracted environmental vectors (a ';'-separated CSV under "pre-extracted").
env_vectors_csv = DATA_PATH / "pre-extracted" / "environmental_vectors.csv"
df_env = pd.read_csv(env_vectors_csv, sep=";")
df_env.head()

In [None]:
# Histograms of the df_env columns (100 bins each) on a 20x20 figure; the trailing ';' hides the returned axes repr.
df_env.hist(bins=100,figsize=(20,20));

In [None]:
# Pairwise correlations between the df_env columns, drawn as a square heatmap
# on a fixed [-1, 1] colour scale centred at 0.
corr = df_env.corr()

fig, ax = plt.subplots(figsize=(16, 16))

diverging_cmap = sns.diverging_palette(20, 220, n=200)
sns.heatmap(corr, ax=ax, cmap=diverging_cmap, vmin=-1, vmax=1, center=0, square=True);

# Re-apply the x tick labels rotated by 45 degrees and right-aligned.
x_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(x_tick_labels, rotation=45, horizontalalignment='right');

In [None]:
# Keep the rows whose observation_id starts with "1".
# NOTE(review): the leading digit appears to encode the country; confirm which country "1" is.
# The mask is kept local instead of being written into df_env as a `country` column:
# mutating the shared frame leaked that column into the histogram/correlation cells on
# re-run and into the later merge with df_obs.
is_country_1 = df_env.observation_id.astype('str').str[0] == '1'
# errors='ignore' tolerates a `country` column left in df_env by an earlier kernel session.
df_explo = df_env[is_country_1].drop(columns=['observation_id', 'country'], errors='ignore')

# Fill missing values with the column mean, then standardise each column to zero mean and unit std.
df_explo = df_explo.fillna(df_explo.mean())
mean_explo = df_explo.mean()
std_explo = df_explo.std()
df_explo = (df_explo - mean_explo) / std_explo

In [None]:
SAMPLE = 0.1
# Fixed seed so that both the row sample and the UMAP layout are identical after Restart & Run All.
SEED = 42

# Fit UMAP on a random fraction of the standardised rows.
df_SAMPLE = df_explo.sample(frac=SAMPLE, random_state=SEED)

# NOTE(review): a fixed random_state may make umap-learn run single-threaded — confirm the run time is acceptable.
reducer = umap.UMAP(random_state=SEED)
embedding = reducer.fit_transform(df_SAMPLE)
embedding.shape

In [None]:
lim_q = 0.05

# One scatter of the 2-D embedding per feature, coloured by that feature's values.
# The colour range is limited to the [lim_q, 1 - lim_q] quantiles of the feature.
for feature_name in df_SAMPLE.columns:
    print(feature_name)
    feature_values = df_SAMPLE[feature_name]
    colour_min = feature_values.quantile(lim_q)
    colour_max = feature_values.quantile(1-lim_q)
    plt.scatter(
        embedding[:, 0],
        embedding[:, 1],
        s=0.01,
        c=feature_values,
        vmin=colour_min,
        vmax=colour_max,
    )
    plt.show()

# Target / Feature exploration

For a given species, we can check how the target behaves with respect to the different features.

In [None]:
specie_id = 5045

# Attach the environmental vectors to every training observation (left join on observation_id)
# and flag the rows that belong to the selected species.
df_merge = df_obs.merge(df_env.set_index('observation_id'), left_index=True, right_index=True, how='left')
df_merge['country'] = df_merge.index.astype('str').str[0]
df_merge['binary_target'] = df_merge.species_id == specie_id

# Number of quantile bins per feature.
nb_q = 20

features = [
    'latitude', 'longitude', 'bio_1', 'bio_2', 'bio_3', 'bio_4', 'bio_5',
       'bio_6', 'bio_7', 'bio_8', 'bio_9', 'bio_10', 'bio_11', 'bio_12',
       'bio_13', 'bio_14', 'bio_15', 'bio_16', 'bio_17', 'bio_18', 'bio_19',
       'bdticm', 'bldfie', 'cecsol', 'clyppt', 'orcdrc', 'phihox', 'sltppt',
       'sndppt'
]

# For each feature: split its values into quantile bins and plot the share of rows that
# belong to the selected species in each bin against the bin's mean feature value.
for c in features:
    print(c)
    # pd.qcut spans the full 0..1 quantile range and includes the minimum value.
    # The previous edges, np.arange(nb_q)/nb_q fed to pd.cut, stopped at the 95th
    # percentile and excluded the lowest value, silently dropping those rows.
    # The bins are kept in a local Series rather than a scratch column in df_merge.
    q_bins = pd.qcut(df_merge[c], q=nb_q, duplicates='drop').rename('q')
    by_bin = df_merge.groupby(q_bins, observed=True)
    avg_bin_target_by_q = by_bin['binary_target'].mean()
    avg_q_by_q = by_bin[c].mean()
    plt.scatter(avg_q_by_q.values, avg_bin_target_by_q.values)
    plt.xlabel(f'mean {c} per quantile bin')
    plt.ylabel(f'share of observations with species_id == {specie_id}')
    plt.show()