In [47]:
import pandas as pd

In [49]:
# Alternative species -> group table (it also carries `robbie_group` and `comment` columns).
# It is bound to its own name because `species_to_group` is re-loaded from
# species_to_group.csv further down, and that later table is the one the
# group translation actually uses.
species_to_group_robbie = pd.read_csv("../infer_to_csv/species_to_group_Robbie.csv")
species_to_group_robbie.tail()

Unnamed: 0,species,group,robbie_group,comment
23,Hog_Red_River,Hog_Red_River,Ungulate,
24,Buffalo_African,Buffalo_African,Ungulate,
25,Duiker_Red,Duiker,Ungulate,
26,Duiker_Blue,Duiker,Ungulate,
27,Duiker_Yellow_Backed,Duiker,Ungulate,


In this notebook I produce an output.csv file using real predictions from the stage4a model of the MVP on the entire validation set. However, some missing data (datetimes, GPS coordinates, the folder structure) are faked.

The top-1 predicted species (`pred_1`) is aggregated into a larger group (`pred_group`) using the mapping file `../infer_to_csv/species_to_group.csv`.

In [55]:
# The structure of the folder is supposed to be:
# STATION_*
# ├── Check*
# │   ├── CAM*
# │   │   ├── * [typically date]
# │   │   │   ├── *.jpg
# │   │   │   ├── *.jpeg
# │   │   │   ├── *.png

In [56]:
from datetime import datetime

import os
from pathlib import Path

import pandas as pd

from fastai import *
from fastai.vision import *
import random

In [57]:
# Root of the local checkout of the training repository. Set the
# GABON_WILDLIFE_REPO environment variable to run the notebook on another
# machine; when it is unset the original location is used.
HOME_FOLDER_OF_REPO = Path(
    os.environ.get("GABON_WILDLIFE_REPO", "/Users/jedrzej/Projects/Gabon/gabon_wildlife_training")
)
PATH_TO_TRAIN_DF = HOME_FOLDER_OF_REPO / "inspect_data_split_validation"

# The fake GPS coordinates, datetimes and folder names generated below are all
# drawn from the `random` module; seeding it makes output.csv reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

## data and labels

In [58]:
# Load the train/validation table and collect the distinct species labels,
# in order of first appearance in the file.
train_valid_csv = PATH_TO_TRAIN_DF / "train_valid_df.csv"
df = pd.read_csv(train_valid_csv)
classes = df["species"].unique()
print(classes.size) # no "Mammal_Other"
classes

28


array(['Human', 'Blank', 'Elephant_African', 'Hog_Red_River', 'Buffalo_African', 'Leopard_African', 'Monkey',
       'Duiker_Red', 'Civet_African_Palm', 'Squirrel', 'Duiker_Blue', 'Bird', 'Mongoose_Black_Footed', 'Rodent',
       'Duiker_Yellow_Backed', 'Genet', 'Chimpanzee', 'Gorilla', 'Mongoose', 'Porcupine_Brush_Tailed', 'Pangolin',
       'Mandrillus', 'Chevrotain_Water', 'Cat_Golden', 'Rat_Giant', 'Guineafowl_Crested', 'Guineafowl_Black',
       'Rail_Nkulengu'], dtype=object)

## build preds_df

In [59]:
# Saved model predictions and the matching list of image paths.
# `preds[0]` is the per-class score tensor consumed below.
# NOTE(review): torch.load unpickles its input - only load files you trust.
training_dir = HOME_FOLDER_OF_REPO / "training"
preds = torch.load(training_dir / "preds-stage4a-5epochs-576_768-rescaled.pt")
images_list = np.load(training_dir / "images_list-stage4a-5epochs-576_768-rescaled.pt.npy")

In [60]:
# One row per image, one score column per class.
# NOTE(review): the column labels assume the model's output order matches the order in
# which species first appear in train_valid_df.csv (`classes`) - confirm against training.
df_preds = pd.DataFrame(preds[0].numpy(), columns=classes)
# df_preds["label"] = pd.Series(preds[1]).apply(lambda x: classes[x])
df_preds["img"] = images_list
# Capture the all-digit "<id>.jpg" file name at the end of the image path.
# The pattern is a raw string so "\d" and "\." are regex escapes rather than invalid
# Python string escapes; expand=False returns a Series instead of a one-column frame.
df_preds["uniqueName"] = df_preds["img"].str.extract(r"/(\d*\.jpg)$", expand=False)

In [61]:
# Score matrix only (rows = images, columns = classes).
df_preds_only = pd.DataFrame(preds[0].numpy(), columns=classes)

# Column indices of the three highest scores in every row, best first.
# argsort yields exactly one class per rank and per row even when scores tie.
# A dense rank can give several columns the same rank, which produces more
# labels than rows and silently misaligns them with df_preds.
class_names = np.asarray(classes)
score_matrix = df_preds_only.values
top3_cols = np.argsort(-score_matrix, axis=1, kind="stable")[:, :3]
# Look the scores up directly instead of a row-wise apply; store them as float64.
top3_scores = np.take_along_axis(score_matrix, top3_cols, axis=1).astype(np.float64)

# pred_1..pred_3: class names of the top three scores.
for k in range(3):
    df_preds["pred_%d" % (k + 1)] = class_names[top3_cols[:, k]]

# score_1..score_3: the scores that belong to pred_1..pred_3.
for k in range(3):
    df_preds["score_%d" % (k + 1)] = top3_scores[:, k]

## species to group translation

In [62]:
# Mapping table with one row per species and the group it belongs to
# (columns `species` and `group`).
group_mapping_csv = "../infer_to_csv/species_to_group.csv"
species_to_group = pd.read_csv(group_mapping_csv)
species_to_group.tail()

Unnamed: 0,species,group
23,Hog_Red_River,Ungulate
24,Buffalo_African,Ungulate
25,Duiker_Red,Ungulate
26,Duiker_Blue,Ungulate
27,Duiker_Yellow_Backed,Ungulate


In [63]:
# Number of species mapped to each group.
species_to_group["group"].value_counts()

Ungulate            5
Small carnivore     4
Primate             4
Bird                4
Rodent              4
Cat                 2
Blank               1
Pangolin            1
Human               1
Elephant_African    1
Chevrotain_Water    1
Name: group, dtype: int64

In [64]:
# species -> group lookup built in a single pass over the table, instead of
# re-filtering the whole frame once per species.
# keep="first" means a species listed more than once maps to the group of its
# first row, which is what taking `.values[0]` of the filtered rows selected.
species_to_group_dic = (
    species_to_group
    .drop_duplicates(subset="species", keep="first")
    .set_index("species")["group"]
    .to_dict()
)
species_to_group_dic

{'Bird': 'Bird',
 'Guineafowl_Crested': 'Bird',
 'Guineafowl_Black': 'Bird',
 'Rail_Nkulengu': 'Bird',
 'Blank': 'Blank',
 'Leopard_African': 'Cat',
 'Cat_Golden': 'Cat',
 'Chevrotain_Water': 'Chevrotain_Water',
 'Elephant_African': 'Elephant_African',
 'Human': 'Human',
 'Pangolin': 'Pangolin',
 'Monkey': 'Primate',
 'Chimpanzee': 'Primate',
 'Gorilla': 'Primate',
 'Mandrillus': 'Primate',
 'Squirrel': 'Rodent',
 'Rodent': 'Rodent',
 'Porcupine_Brush_Tailed': 'Rodent',
 'Rat_Giant': 'Rodent',
 'Civet_African_Palm': 'Small carnivore',
 'Mongoose_Black_Footed': 'Small carnivore',
 'Genet': 'Small carnivore',
 'Mongoose': 'Small carnivore',
 'Hog_Red_River': 'Ungulate',
 'Buffalo_African': 'Ungulate',
 'Duiker_Red': 'Ungulate',
 'Duiker_Blue': 'Ungulate',
 'Duiker_Yellow_Backed': 'Ungulate'}

In [65]:
# Translate every top-1 species into its group; a species missing from the
# mapping raises KeyError rather than producing a missing value.
df_preds["pred_group"] = df_preds["pred_1"].map(species_to_group_dic.__getitem__)

In [66]:
# Preview the first rows after adding the top-3 predictions and pred_group.
df_preds.head()

Unnamed: 0,Human,Blank,Elephant_African,Hog_Red_River,Buffalo_African,Leopard_African,Monkey,Duiker_Red,Civet_African_Palm,Squirrel,...,Rail_Nkulengu,img,uniqueName,pred_1,pred_2,pred_3,score_1,score_2,score_3,pred_group
0,3.570265e-05,0.001445,0.0002180603,9.789845e-06,1.021979e-05,1.123897e-05,1.040487e-05,0.0004403269,4.605485e-05,5.058151e-05,...,1.864563e-05,/data/Gabon_trainingData/1039759.jpg,1039759.jpg,Chimpanzee,Genet,Blank,0.994561,0.00178,0.001445,Primate
1,3.359222e-06,0.000893,2.08969e-05,2.785994e-07,1.902536e-05,2.143928e-06,3.144651e-06,1.606906e-05,2.828091e-06,6.575648e-07,...,2.622566e-05,/data/Gabon_trainingData/1195784.jpg,1195784.jpg,Cat_Golden,Blank,Guineafowl_Crested,0.998434,0.000893,0.00021,Cat
2,1.173622e-07,0.998352,8.825286e-07,1.125437e-07,7.130751e-07,3.140708e-07,1.095479e-07,3.303264e-07,7.548314e-07,1.651983e-07,...,2.676656e-08,/data/Gabon_trainingData/0880576.jpg,0880576.jpg,Blank,Chimpanzee,Duiker_Blue,0.998352,0.001,0.000634,Blank
3,0.001604006,0.850026,0.00927361,0.0001160529,0.0001545879,0.001047468,7.21229e-05,0.08633371,0.0006769311,1.903582e-05,...,0.004189791,/data/Gabon_trainingData/1194852.jpg,1194852.jpg,Blank,Duiker_Red,Duiker_Blue,0.850026,0.086334,0.031094,Blank
4,0.0003992095,7.3e-05,0.0001489591,3.351047e-05,4.849581e-05,0.0001795067,1.804623e-05,0.9280165,0.0005082623,1.02194e-05,...,5.552495e-05,/data/Gabon_trainingData/1201480.jpg,1201480.jpg,Duiker_Red,Mandrillus,Genet,0.928016,0.060248,0.004929,Ungulate


## add GPS - pick random coords from a few parks

In [67]:
# Bounding boxes, in decimal degrees, used to draw fake GPS coordinates.
# Every box satisfies lat_min <= lat_max and lon_min <= lon_max; the longitude
# bounds of "loango" and "bateke" were previously stored the wrong way round.
# Insertion order is kept (lope, waka, loango, bateke) because it determines the
# order of `parks_list` and therefore which park a seeded random.choice picks.
parks_gps = {
    "lope": {"lat_max": -0.2,
             "lat_min": -1.0,
             "lon_max": 11.7,
             "lon_min": 11.4},
    "waka": {"lat_max": -1.16,
             "lat_min": -1.42,
             "lon_max": 11.2,
             "lon_min": 11.0},
    "loango": {"lat_max": -1.95,
               "lat_min": -2.16,
               "lon_max": 9.72,
               "lon_min": 9.57},
    "bateke": {"lat_max": -2.06,
               "lat_min": -2.45,
               "lon_max": 14.14,
               "lon_min": 13.89},
}

In [68]:
# Park names in the insertion order of parks_gps.
parks_list = [park_name for park_name in parks_gps]

In [69]:
def get_random_gps(parks_gps, parks_list):
    """Draw a random coordinate pair inside one randomly chosen park.

    parks_gps  -- dict mapping a park name to a dict with the keys
                  "lat_min", "lat_max", "lon_min" and "lon_max".
    parks_list -- sequence of park names to choose from.

    Returns a two-element list [latitude, longitude].
    """
    bounds = parks_gps[random.choice(parks_list)]
    # Latitude is drawn before longitude so a seeded run is reproducible.
    latitude = random.uniform(bounds["lat_min"], bounds["lat_max"])
    longitude = random.uniform(bounds["lon_min"], bounds["lon_max"])
    return [latitude, longitude]

#### generate a coordinates list with the same length as df_preds

In [70]:
# One random [lat, lon] pair per prediction row ...
gps_coords_unt = [get_random_gps(parks_gps, parks_list) for _ in range(len(df_preds))]
# ... transposed into two parallel lists: [all latitudes, all longitudes].
gps_coords = np.asarray(gps_coords_unt).transpose().tolist()

In [71]:
# gps_coords holds exactly two lists: latitudes first, longitudes second.
fake_latitudes, fake_longitudes = gps_coords
df_preds["exif_gps_lat"] = fake_latitudes
df_preds["exif_gps_long"] = fake_longitudes

In [72]:
# Preview the first rows after adding the fake GPS columns.
df_preds.head()

Unnamed: 0,Human,Blank,Elephant_African,Hog_Red_River,Buffalo_African,Leopard_African,Monkey,Duiker_Red,Civet_African_Palm,Squirrel,...,uniqueName,pred_1,pred_2,pred_3,score_1,score_2,score_3,pred_group,exif_gps_lat,exif_gps_long
0,3.570265e-05,0.001445,0.0002180603,9.789845e-06,1.021979e-05,1.123897e-05,1.040487e-05,0.0004403269,4.605485e-05,5.058151e-05,...,1039759.jpg,Chimpanzee,Genet,Blank,0.994561,0.00178,0.001445,Primate,-2.327886,14.049852
1,3.359222e-06,0.000893,2.08969e-05,2.785994e-07,1.902536e-05,2.143928e-06,3.144651e-06,1.606906e-05,2.828091e-06,6.575648e-07,...,1195784.jpg,Cat_Golden,Blank,Guineafowl_Crested,0.998434,0.000893,0.00021,Cat,-0.656819,11.684987
2,1.173622e-07,0.998352,8.825286e-07,1.125437e-07,7.130751e-07,3.140708e-07,1.095479e-07,3.303264e-07,7.548314e-07,1.651983e-07,...,0880576.jpg,Blank,Chimpanzee,Duiker_Blue,0.998352,0.001,0.000634,Blank,-2.140234,9.715895
3,0.001604006,0.850026,0.00927361,0.0001160529,0.0001545879,0.001047468,7.21229e-05,0.08633371,0.0006769311,1.903582e-05,...,1194852.jpg,Blank,Duiker_Red,Duiker_Blue,0.850026,0.086334,0.031094,Blank,-2.056609,9.604216
4,0.0003992095,7.3e-05,0.0001489591,3.351047e-05,4.849581e-05,0.0001795067,1.804623e-05,0.9280165,0.0005082623,1.02194e-05,...,1201480.jpg,Duiker_Red,Mandrillus,Genet,0.928016,0.060248,0.004929,Ungulate,-2.309895,13.998762


## add datetime - random from last 6 months

In [73]:
# from random import randrange
from datetime import timedelta

def random_date(start, end):
    """
    Return a random datetime between two datetime objects, formatted as a
    "%Y:%m:%d %H:%M:%S" string.

    The offset from `start` is a whole number of seconds drawn uniformly from
    [0, end - start), so `end` itself is never returned. If the window is
    shorter than one second (for example `start == end`), `start` is returned.

    Raises ValueError if `end` is earlier than `start`.
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    if int_delta < 0:
        raise ValueError("end must not be earlier than start")
    # random.randrange(0) raises "empty range", so a zero-length window
    # is handled explicitly instead of crashing.
    random_second = random.randrange(int_delta) if int_delta else 0
    return (start + timedelta(seconds=random_second)).strftime("%Y:%m:%d %H:%M:%S")

In [74]:
# Sampling window for the fake timestamps: 2019-10-01 03:14:15 to 2020-03-30 03:14:15.
start = datetime(2019, 10, 1, 3, 14, 15)
end = datetime(2020, 3, 30, 3, 14, 15)
# Show one sample draw.
random_date(start, end)

'2019:11:20 09:28:05'

In [75]:
# One independently drawn fake timestamp per prediction row.
df_preds["exif_datetime"] = [random_date(start, end) for _ in range(len(df_preds))]

In [76]:
# Preview the first rows after adding the fake exif_datetime column.
df_preds.head()

Unnamed: 0,Human,Blank,Elephant_African,Hog_Red_River,Buffalo_African,Leopard_African,Monkey,Duiker_Red,Civet_African_Palm,Squirrel,...,pred_1,pred_2,pred_3,score_1,score_2,score_3,pred_group,exif_gps_lat,exif_gps_long,exif_datetime
0,3.570265e-05,0.001445,0.0002180603,9.789845e-06,1.021979e-05,1.123897e-05,1.040487e-05,0.0004403269,4.605485e-05,5.058151e-05,...,Chimpanzee,Genet,Blank,0.994561,0.00178,0.001445,Primate,-2.327886,14.049852,2019:10:31 20:58:04
1,3.359222e-06,0.000893,2.08969e-05,2.785994e-07,1.902536e-05,2.143928e-06,3.144651e-06,1.606906e-05,2.828091e-06,6.575648e-07,...,Cat_Golden,Blank,Guineafowl_Crested,0.998434,0.000893,0.00021,Cat,-0.656819,11.684987,2020:03:13 13:12:50
2,1.173622e-07,0.998352,8.825286e-07,1.125437e-07,7.130751e-07,3.140708e-07,1.095479e-07,3.303264e-07,7.548314e-07,1.651983e-07,...,Blank,Chimpanzee,Duiker_Blue,0.998352,0.001,0.000634,Blank,-2.140234,9.715895,2019:10:07 13:46:08
3,0.001604006,0.850026,0.00927361,0.0001160529,0.0001545879,0.001047468,7.21229e-05,0.08633371,0.0006769311,1.903582e-05,...,Blank,Duiker_Red,Duiker_Blue,0.850026,0.086334,0.031094,Blank,-2.056609,9.604216,2020:02:28 10:21:12
4,0.0003992095,7.3e-05,0.0001489591,3.351047e-05,4.849581e-05,0.0001795067,1.804623e-05,0.9280165,0.0005082623,1.02194e-05,...,Duiker_Red,Mandrillus,Genet,0.928016,0.060248,0.004929,Ungulate,-2.309895,13.998762,2019:10:04 17:14:48


## add path, station, check, camera - fake from lists

In [77]:
# Fake folder components, one value per prediction row. All stations are drawn
# first, then all checks, then all cameras, so a seeded run is reproducible.
stations_names = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "kappa"]
stations = [random.choice(stations_names) for _ in df_preds.index]
checks = [str(random.randint(1, 10)) for _ in df_preds.index]    # check number 1..10, as text
cameras = [str(random.randint(100, 200)) for _ in df_preds.index]  # camera id 100..200, as text

In [78]:
# Attach the fake folder components as columns, in the original order.
for column_name, column_values in (("station", stations),
                                   ("check", checks),
                                   ("camera", cameras)):
    df_preds[column_name] = column_values

In [79]:
# Relative path of every image: STATION_<station>/Check <check>/CAM<camera>/<file name>.
df_preds["path"] = df_preds.apply(
    lambda row: "/".join([
        "STATION_" + row["station"],
        "Check " + row["check"],
        "CAM" + row["camera"],
        row["uniqueName"],
    ]),
    axis=1,
)

In [80]:
# Preview the first rows after adding station, check, camera and path.
df_preds.head()

Unnamed: 0,Human,Blank,Elephant_African,Hog_Red_River,Buffalo_African,Leopard_African,Monkey,Duiker_Red,Civet_African_Palm,Squirrel,...,score_2,score_3,pred_group,exif_gps_lat,exif_gps_long,exif_datetime,station,check,camera,path
0,3.570265e-05,0.001445,0.0002180603,9.789845e-06,1.021979e-05,1.123897e-05,1.040487e-05,0.0004403269,4.605485e-05,5.058151e-05,...,0.00178,0.001445,Primate,-2.327886,14.049852,2019:10:31 20:58:04,gamma,7,138,STATION_gamma/Check 7/CAM138/1039759.jpg
1,3.359222e-06,0.000893,2.08969e-05,2.785994e-07,1.902536e-05,2.143928e-06,3.144651e-06,1.606906e-05,2.828091e-06,6.575648e-07,...,0.000893,0.00021,Cat,-0.656819,11.684987,2020:03:13 13:12:50,zeta,10,133,STATION_zeta/Check 10/CAM133/1195784.jpg
2,1.173622e-07,0.998352,8.825286e-07,1.125437e-07,7.130751e-07,3.140708e-07,1.095479e-07,3.303264e-07,7.548314e-07,1.651983e-07,...,0.001,0.000634,Blank,-2.140234,9.715895,2019:10:07 13:46:08,eta,1,108,STATION_eta/Check 1/CAM108/0880576.jpg
3,0.001604006,0.850026,0.00927361,0.0001160529,0.0001545879,0.001047468,7.21229e-05,0.08633371,0.0006769311,1.903582e-05,...,0.086334,0.031094,Blank,-2.056609,9.604216,2020:02:28 10:21:12,beta,1,160,STATION_beta/Check 1/CAM160/1194852.jpg
4,0.0003992095,7.3e-05,0.0001489591,3.351047e-05,4.849581e-05,0.0001795067,1.804623e-05,0.9280165,0.0005082623,1.02194e-05,...,0.060248,0.004929,Ungulate,-2.309895,13.998762,2019:10:04 17:14:48,beta,6,173,STATION_beta/Check 6/CAM173/1201480.jpg


In [81]:
def order(frame, var):
    """Return `frame` with the columns named in `var` moved to the front.

    frame -- a pandas DataFrame.
    var   -- a single column name (str) or any iterable of column names
             (list, tuple, ...); these become the leading columns, in the
             order given.

    The remaining columns follow in their existing order. A name that is not
    a column of `frame` raises KeyError from the column selection.
    """
    if isinstance(var, str):
        var = [var]  # let the command take a string or list
    else:
        var = list(var)  # tuples and other iterables can be concatenated below
    leading = set(var)
    varlist = [w for w in frame.columns if w not in leading]
    return frame[var + varlist]

In [82]:
# Metadata and summary columns come first; the per-class score columns follow.
leading_columns = [
    "path", "station", "check", "camera",
    "exif_datetime",
    "exif_gps_long", "exif_gps_lat",
    "pred_group",
    "pred_1", "score_1", "pred_2", "score_2", "pred_3", "score_3",
]
df_preds = order(df_preds, leading_columns)

In [83]:
# The source image path and bare file name were only needed to build `path`.
df_preds = df_preds.drop(["img", "uniqueName"], axis=1)

In [84]:
# Final column layout that will be written to output.csv.
df_preds.columns

Index(['path', 'station', 'check', 'camera', 'exif_datetime', 'exif_gps_long',
       'exif_gps_lat', 'pred_group', 'pred_1', 'score_1', 'pred_2', 'score_2',
       'pred_3', 'score_3', 'Human', 'Blank', 'Elephant_African',
       'Hog_Red_River', 'Buffalo_African', 'Leopard_African', 'Monkey',
       'Duiker_Red', 'Civet_African_Palm', 'Squirrel', 'Duiker_Blue', 'Bird',
       'Mongoose_Black_Footed', 'Rodent', 'Duiker_Yellow_Backed', 'Genet',
       'Chimpanzee', 'Gorilla', 'Mongoose', 'Porcupine_Brush_Tailed',
       'Pangolin', 'Mandrillus', 'Chevrotain_Water', 'Cat_Golden', 'Rat_Giant',
       'Guineafowl_Crested', 'Guineafowl_Black', 'Rail_Nkulengu'],
      dtype='object')

## Save to output.csv

In [85]:
# Write the table to output.csv in the current working directory, without the row index.
df_preds.to_csv("output.csv", index=False)