In [14]:
from transformers import pipeline
import torch
from tqdm import tqdm
import pandas as pd

pipe = pipeline(model="MaRiOrOsSi/t5-base-finetuned-question-answering")

In [15]:
train = pd.read_csv('./bird_train.csv')
test = pd.read_csv('./bird_test.csv')

df = pd.concat((train,test))

df

Unnamed: 0,s1,s2
0,"The Iago sparrow (Passer iagoensis), also know...",Iago sparrow scientific name Passer iagoensis
1,The Ibadan malimbe (Malimbus ibadanensis) is a...,Ibadan malimbe scientific name Malimbus ibadan...
2,The Ibera seedeater (Sporophila iberaensis) is...,Ibera seedeater scientific name Sporophila ibe...
3,The Iberian chiffchaff (Phylloscopus ibericus)...,Iberian chiffchaff scientific name Phylloscopu...
4,The Iberian green woodpecker (Picus sharpei) i...,Iberian green woodpecker scientific name Picus...
...,...,...
173,The Guatemalan tyrannulet or paltry tyrannulet...,Guatemalan tyrannulet alternate name paltry ty...
174,The Guianan schiffornis or olivaceous schiffor...,Guianan schiffornis alternate name olivaceous ...
175,"The Guianan toucanet, or Guyana toucanet (Sele...",Guianan toucanet alternate name Guyana toucanet
176,Güldenstädt's redstart (Phoenicurus erythrogas...,Güldenstädt's redstart alternate name white-wi...


In [16]:
l=[]
with torch.no_grad():
    for context in tqdm(df['s1'].str.lower()):
        bird = pipe(f'question: what is the bird name here?  context: {context}')
        sname = pipe(f'question: what is the scientific name here?  context: {context}')
        alternate = pipe(f'question: what is the alternate name of {bird[0]["generated_text"]}?  context: {context}')

        l.append((bird[0]["generated_text"], sname[0]["generated_text"] ,alternate[0]["generated_text"]))

100%|██████████| 893/893 [16:52<00:00,  1.13s/it]


In [19]:
base_data = pd.read_csv('cleaned_annotations.csv')
base_data.fillna('', inplace=True)

In [None]:
base_data

In [32]:
l1 = [x[0] for x in l] 
l2 = [x[1] for x in l]
l3 = [x[2] for x in l]

df['pred_bird'] = l1
df['pred_sci'] = l2
df['pred_alt'] = l3


In [34]:
final = pd.merge(df, base_data, how='inner', left_on='s1', right_on='raw')

In [37]:
final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 893 entries, 0 to 892
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   s1               893 non-null    object
 1   s2               893 non-null    object
 2   pred_bird        893 non-null    object
 3   pred_sci         893 non-null    object
 4   pred_alt         893 non-null    object
 5   Unnamed: 0       893 non-null    int64 
 6   raw              893 non-null    object
 7   bird             893 non-null    object
 8   alternate_names  893 non-null    object
 9   scientific_name  893 non-null    object
dtypes: int64(1), object(9)
memory usage: 76.7+ KB


In [41]:
from sklearn.metrics import f1_score

f1_bird = f1_score([True for _ in range(len(final))], final['pred_bird'] == final['bird'])
f1_sci = f1_score([True for _ in range(len(final))], final['pred_sci'] == final['scientific_name'])
f1_alt = f1_score([True for _ in range(len(final))], final['pred_alt'] == final['alternate_names'])

In [42]:
f1_bird, f1_sci, f1_alt

(0.17008196721311478, 0.03736263736263736, 0.10201912858660997)

In [43]:
final.to_csv('pred_t5.csv',index=False)