# BiodivNER Data Preprocessing

Adapted from: Nora Abdelmageed's BiodivBERT (2023) <br>
Original code: https://github.com/fusion-jena/BiodivBERT/tree/main

## Imports and Configuration

In [2]:
%pip install matplotlib

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Collecting matplotlibNote: you may need to restart the kernel to use updated packages.

  Using cached matplotlib-3.8.2-cp311-cp311-win_amd64.whl (7.6 MB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.2.0-cp311-cp311-win_amd64.whl (187 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.46.0-cp311-cp311-win_amd64.whl (2.2 MB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.5-cp311-cp311-win_amd64.whl (56 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached Pillow-10.1.0-cp311-cp311-win_amd64.whl (2.6 MB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.1.1-py3-none-any.whl (103 kB)
Installing collected packages: pyparsing, pillow, kiwisolver, fonttools, cycler, contourpy, matplotlib
Successfully installed contourpy-1.2.0 cycler-0.12.1 fonttools-4.46.0 kiwisolver-1.4.5 matplotlib-3.

## Dataset Configurations

In [24]:
# Directory holding the BiodivNER CSV splits; loadData() joins it with the
# per-split file names below.
root_data_dir = "../Datasets/NER/BiodivNER/"

# NOTE(review): `dataset` is not read anywhere in the visible cells —
# presumably it selects the split processed later; confirm against downstream cells.
dataset = "train"
# File names of the individual splits, relative to root_data_dir.
train_csv_file_path = "train.csv"
val_csv_file_path = "dev.csv"
test_csv_file_path = "test.csv"

## Data Loading Utilities

In [4]:
def loadData(csv_file_path, root_dir=None):
  """Read one dataset split from CSV and forward-fill its missing cells.

  Args:
    csv_file_path: File name (or relative path) of the CSV file, resolved
      against ``root_dir``.
    root_dir: Directory that contains the CSV file. When omitted, the
      notebook-level ``root_data_dir`` is used.

  Returns:
    A pandas DataFrame read with latin1 encoding in which every missing value
    has been replaced by the last non-missing value above it in the same
    column.
  """
  if root_dir is None:
    # Fall back to the notebook-wide configuration value.
    root_dir = root_data_dir
  dataset_path = os.path.join(root_dir, csv_file_path)
  data = pd.read_csv(dataset_path, encoding="latin1")
  # DataFrame.ffill() is the supported spelling of fillna(method="ffill"),
  # which raises a FutureWarning in recent pandas releases.
  return data.ffill()

In [5]:
class SentenceGetter(object):
    """Group a token-level table into per-sentence lists of (word, tag) pairs.

    The input frame must provide the columns "Sentence #", "Word" and "Tag".

    Attributes:
        n_sent: Number of the sentence that the next get_next() call looks up.
        data: The frame passed to the constructor.
        empty: Always set to False by the constructor.
        grouped: One entry per distinct "Sentence #" value, each holding that
            sentence's list of (word, tag) tuples.
        sentences: The entries of ``grouped`` as a plain list.
    """

    def __init__(self, data):
        """Build the per-sentence grouping for ``data``."""
        # NOTE(review): the counter starts at 1, so get_next() begins with the
        # label "Sentence: 1"; a sentence labelled "Sentence: 0" is reachable
        # only through `sentences` / `grouped`.
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Select only the columns the aggregation reads, so apply() does not
        # operate on the grouping column (deprecated in recent pandas).
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "Tag"]]
            .apply(self._pair_words_with_tags)
        )
        self.sentences = list(self.grouped)

    @staticmethod
    def _pair_words_with_tags(group):
        """Return the (word, tag) tuples of one sentence group, in row order."""
        return list(zip(group["Word"].tolist(), group["Tag"].tolist()))

    def get_next(self):
        """Return the next sentence's (word, tag) list, or None if it is absent.

        The internal counter advances only when a sentence was found.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # No sentence carries this label; unrelated errors are no longer
            # swallowed as they were by the former bare `except:`.
            return None
        self.n_sent += 1
        return s

## Load Datasets

In [25]:
# Load the training split; loadData() forward-fills missing cells.
data = loadData(train_csv_file_path)

  data = data.fillna(method="ffill")


In [26]:
# Preview the first 10 token rows of the training split.
data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 0,Samplenr,O
1,Sentence: 0,Seedlingnr,O
2,Sentence: 0,Plot,O
3,Sentence: 0,Record,O
4,Sentence: 0,Date,O
5,Sentence: 0,Planted_Species,O
6,Sentence: 0,Density,B-Quality
7,Sentence: 0,Treatment,B-Phenomena
8,Sentence: 0,Dead,B-Quality
9,Sentence: 0,Height_P,O


In [48]:
# Load the validation (dev) split; loadData() forward-fills missing cells.
val_data = loadData(val_csv_file_path)

  data = data.fillna(method="ffill")


In [50]:
# NOTE(review): this assigns an ad-hoc attribute on the DataFrame object, not a
# column; pandas may not carry such attributes over to derived frames — confirm
# that downstream cells read it from this exact object.
# NOTE(review): 465 is an unexplained constant — presumably a sentence-index
# offset relative to the training split; verify and move it to the
# configuration cell.
val_data.offset = 465
# Preview the first 10 token rows of the validation split.
val_data.head(10)

Unnamed: 0,Sentence #,Word,Tag
0,Sentence: 0,For,O
1,Sentence: 0,live,B-Organism
2,Sentence: 0,snags,I-Organism
3,Sentence: 0,the,O
4,Sentence: 0,measurement,O
5,Sentence: 0,height,B-Quality
6,Sentence: 0,is,O
7,Sentence: 0,indicated,O
8,Sentence: 0,by,O
9,Sentence: 0,a,O
