##### Old preprocessing steps

In [24]:
conservation_status[conservation_status["speccode"].isna()]

Unnamed: 0,speccode,english_name_spec,scientific_name,status


In [25]:
# Stuff that's in the conservation list that's not in the atlas
merged_df = pd.merge(
    conservation_status,
    species_lookup,
    on=["speccode", "english_name"],
    how="outer",
    indicator=True,
)

values_only_in_df1 = merged_df.loc[
    merged_df["_merge"] == "left_only", ["speccode", "english_name"]
]
values_only_in_df1

KeyError: 'english_name'

In [None]:
# Unify names - change conservation names to match atlas and take atlas names
cons_latin = pd.read_csv("data/conservation/conservation_status_latin.csv")
cons_latin

Unnamed: 0,speccode,english_name_spec,scientific_name,status
0,1,red-throated diver,gavia stellata,g
1,2,black-throated diver,gavia arctica,a
2,2,black-throated diver,gavia arctica,g
3,3,great northern diver,gavia immer,a
4,3,great northern diver,gavia immer,g
...,...,...,...,...
641,1583,rock/water pipit,anthus petrosus/spinoletta,g
642,1584,white-cheeked turaco,tauraco leucotis,g
643,1586,violet turaco,musophaga violacea,g
644,1603,house crow,corvus splendens,g


In [None]:
cons_latin[cons_latin.duplicated("speccode")]

Unnamed: 0,speccode,english_name_spec,scientific_name,status
2,2,black-throated diver,gavia arctica,g
4,3,great northern diver,gavia immer,g
10,8,red-necked grebe,podiceps grisegena,r
12,9,slavonian grebe,podiceps auritus,r
14,10,black-necked grebe,podiceps nigricollis,a
...,...,...,...,...
557,901,water pipit,anthus spinoletta,g
560,1008,balearic shearwater,puffinus mauretanicus,r
580,1079,lesser redpoll,acanthis cabaret,r
605,1282,yellow-legged gull,larus michahellis,g


In [None]:
conservation_status.sort_values(by="speccode")

Unnamed: 0,speccode,english_name,status
173,1,red-throated diver,g
142,2,black-throated diver,a
174,2,black-throated diver,g
145,3,great northern diver,a
175,3,great northern diver,g
...,...,...,...
633,1583,rock/water pipit,g
634,1584,white-cheeked turaco,g
635,1586,violet turaco,g
636,1603,house crow,g


In [None]:
cons_no_dupe_latin = pd.merge(
    cons_no_dupe, species_lookup, how="left", on="speccode", suffixes=["_cons", "_spec"]
)
cons_no_dupe_latin = cons_no_dupe_latin.sort_values(by="speccode").dropna(
    subset=["taxonomic_rank"]
)
cons = cons_no_dupe_latin[
    ["speccode", "english_name_spec", "scientific_name", "status"]
]

In [None]:
cons_no_dupe = conservation_status.drop_duplicates(subset="speccode")

##### Imports and files

In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn
from IPython.display import display, Markdown

In [3]:
dists = pd.read_csv("data/atlas_open_data_files/distributions.csv")
dist_changes = pd.read_csv("data/atlas_open_data_files/distribution_changes.csv")
grid_coords = pd.read_csv(
    "data/atlas_open_data_files/grid_square_coordinates_lookup.csv"
)
pct_bench_species = pd.read_csv(
    "data/atlas_open_data_files/percent_benchmark_species_detected.csv"
)
species_lookup = pd.read_csv(
    "data/atlas_open_data_files/species_lookup_nocase.csv", encoding="ISO-8859-1"
)
conservation_status = pd.read_csv("data/conservation/conservation_status_final.csv")

#### Misc stuff

In [4]:
# The number of atlases in the dataset
dists["period"].unique()

array(['1968-72', '2008-11', '2007/08-10/11', '1980/81-1982/83',
       '1988-91'], dtype=object)

In [5]:
# These are the top 10 birds which were spotted most in the Big Garden
# Birdwatch (2023) https://www.rspb.org.uk/whats-happening/big-garden-birdwatch
HOUSE_SPARROW = 459
BLUE_TIT = 436
STARLING = 457
WOODPIDGEON = 270
BLACKBIRD = 371
ROBIN = 345
GOLDFINCH = 471
GREAT_TIT = 437
MAGPIE = 450
LONG_TAILED_TIT = 431

# Ideas
- Regression of distribution changes of common garden birds?
- Classifying birds that need special concern?
- To what extent does the presence/movement of passerines affect the movement of birds of prey?
    - Regression where y = bird of prey and x = passerine

- **Feed data in to NN which predicts whether a bird will be on the latest Red List or not**

# Predicting conservation status using citizen science bird atlases

[Classifications are from the BTO birds of conservation concern](https://www.bto.org/our-science/publications/birds-conservation-concern)

## Data preprocessing

### Conservation status
- Got atlas from Gillings et al.
- Got conservation status from BTO
- Only birds which have data in the atlas are included in study
- Formatted conservation status with atlas species lookup codes in CSV
- Corrected erroneous records where no species code was filled in caused by difference in bird names between lists
- The Great Auk went extinct in the 19th century -- there's no data for it in the atlases!
    - This record has been removed from the former breeding birds list
- Removed species aggregates from lookup
- Added latin names which serve as unique identifiers
- Eng names taken from species lookup

**The first red list was published in 1996.**

### Bird distribution data
- Problem: Different number of species studied in different atlases

In [118]:
# Remove records in dist_changes corresponding to species aggregates
display(Markdown("#### Before removal of species aggregates"))
display(len(dist_changes["speccode"].unique()))

species_lookup_aggs = pd.read_csv(
    "data/atlas_open_data_files/species_lookup.csv", encoding="ISO-8859-1"
)
species_lookup_aggs_col = species_lookup_aggs[
    species_lookup_aggs["taxonomic_rank"] == "Species aggregate"
]["speccode"]

display(Markdown("#### After removal of species aggregates"))
dist_changes = dist_changes[~(dist_changes["speccode"].isin(species_lookup_aggs_col))]
len(dist_changes["speccode"].unique())

#### Before removal of species aggregates

446

#### After removal of species aggregates

446

In [6]:
dist_changes["interval"].unique()

array(['1970-2010', '1990-2010', '1980-2010', '1970-1990'], dtype=object)

Do something with 1970-1990 and 1990-2010?

In [90]:
display(Markdown("#### Birds in the 1970-1990 atlas"))
display(len(dist_changes[dist_changes["interval"] == "1970-1990"]["speccode"].unique()))

display(Markdown("#### Birds in the 1970-2010 atlas"))
display(len(dist_changes[dist_changes["interval"] == "1970-2010"]["speccode"].unique()))

#### Birds in the 1970-1990 atlas

278

#### Birds in the 1970-2010 atlas

300

#### Summary table
|interval|speccode|avg_tenkms_stable|avg_tenkms_gain|avg_tenkms_loss|
|-|-|-|-|-|
|a|a|a|a|a|

In [74]:
# Group by "speccode" and "interval", and calculate the mean for each group
grouped = dist_changes_two_ints.groupby(["speccode", "interval"]).agg(
    {"n_tenkms_stable": "mean", "n_tenkms_gain": "mean", "n_tenkms_loss": "mean"}
)
grouped = grouped.reset_index()
display(len(grouped["speccode"].unique()))
display(len(conservation_status["speccode"].unique()))
grouped[grouped["speccode"] == 459]

322

459

Unnamed: 0,speccode,interval,n_tenkms_stable,n_tenkms_gain,n_tenkms_loss
495,459,1970-1990,0.933771,0.004652,0.061576
496,459,1990-2010,0.934554,0.049085,0.016362
