In [6]:
#Notwendige Imports für das Projekt
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

from GLC.metrics import top_30_error_rate, top_k_error_rate_from_sets, predict_top_30_set
from GLC.submission import generate_submission_file

In [3]:
# Root directory of the dataset; the observation CSV files are resolved relative to it.
DATA_PATH = Path("./data/")

In [4]:
# Load the observation tables. Both files are ";"-separated and use the
# "observation_id" column as the DataFrame index.
df_obs = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_train.csv",
    sep=";",
    index_col="observation_id",
)
df_obs_test = pd.read_csv(
    DATA_PATH / "observations" / "observations_fr_test.csv",
    sep=";",
    index_col="observation_id",
)

In [5]:
# Observation ids of the rows whose "subset" column marks them as training
# or validation data, selected with a boolean mask over the index values.
obs_id_train = df_obs.index.values[(df_obs["subset"] == "train").values]
obs_id_val = df_obs.index.values[(df_obs["subset"] == "val").values]

In [8]:
# Select rows and columns in a single .loc call. The chained form
# df.loc[rows][cols] first materialises every column of the selected rows
# and only then picks the wanted ones.

# Species ids (labels) of the training and validation observations.
y_train = df_obs.loc[obs_id_train, "species_id"].values
y_val = df_obs.loc[obs_id_val, "species_id"].values

# Latitude / longitude pairs of the training and validation observations.
X_train_loc = df_obs.loc[obs_id_train, ["latitude", "longitude"]].values
X_val_loc = df_obs.loc[obs_id_val, ["latitude", "longitude"]].values

In [9]:
PREDICTION_COUNT = 30


# Take the most frequent labels among the neighbours of one coordinate
def pred_coord(coord, y, k=None):
    """Rank the labels found at ``y[coord]`` by how often they occur.

    Parameters
    ----------
    coord : array of int
        Positions into ``y`` (the neighbours of one query point).
    y : np.ndarray
        Label array that ``coord`` indexes into.
    k : int, optional
        Maximum number of labels to return. Defaults to the current value of
        the module-level ``PREDICTION_COUNT``.

    Returns
    -------
    np.ndarray
        At most ``k`` distinct labels, most frequent first. Labels with equal
        frequency are returned in ascending label order.
    """
    if k is None:
        # Resolved at call time so that changing PREDICTION_COUNT still takes effect.
        k = PREDICTION_COUNT
    labels, frequencies = np.unique(y[coord], return_counts=True)
    # np.unique returns the labels sorted ascending. A stable sort keeps that
    # order among equally frequent labels, so the cut at k is deterministic;
    # the default argsort kind gives no such guarantee.
    ranking = np.argsort(-frequencies, kind="stable")
    return labels[ranking][:k]

# Make a prediction for an array of coordinates
def predict_NN(model, X_loc, y):
    """Return pred_coord's label ranking for every query row in ``X_loc``.

    ``model.kneighbors(X_loc)`` yields, per query row, the indices of its
    neighbours; each index row is passed to ``pred_coord`` together with ``y``.
    The per-row results are stacked with ``np.array``.

    NOTE(review): pred_coord may return fewer labels for some rows than for
    others; confirm that np.array can still stack the rows for your inputs.
    """
    _, neighbour_indices = model.kneighbors(X_loc)
    return np.array([pred_coord(row, y) for row in neighbour_indices])

In [11]:
# Ball-tree neighbour model over the training coordinates with k = 100.
nbrs_100 = NearestNeighbors(n_neighbors=100, algorithm='ball_tree')
nbrs_100.fit(X_train_loc)
# Query the training coordinates themselves: per row, the distances to and
# the indices of the 100 nearest training points.
distances_100, indices_100 = nbrs_100.kneighbors(X_train_loc)

In [10]:
# Ball-tree neighbour model over the training coordinates with k = 800.
nbrs_800 = NearestNeighbors(n_neighbors=800, algorithm='ball_tree')
nbrs_800.fit(X_train_loc)
# Query the training coordinates themselves: per row, the distances to and
# the indices of the 800 nearest training points.
distances_800, indices_800 = nbrs_800.kneighbors(X_train_loc)

In [17]:
# Translate neighbour indices into the species ids of those neighbours
# (one row of labels per query row) for both neighbourhood sizes.
species_100, species_800 = (
    y_train[neighbour_indices] for neighbour_indices in (indices_100, indices_800)
)

In [48]:
# Accumulator for the summary statistics: later cells append one
# [value for k=100, value for k=800] row each. Re-run this cell before
# re-running those cells, otherwise rows are appended twice.
df_data = []

In [24]:
def computeUniqueness(array):
    """Return a list holding, for each row of ``array``, its sorted unique values.

    Each element of the result is the ``np.unique`` output for the
    corresponding row, so the elements may differ in length.
    """
    return [np.unique(row) for row in array]

In [31]:
# Distinct species per neighbourhood, for both neighbourhood sizes.
uniqueArray_100, uniqueArray_800 = map(computeUniqueness, (species_100, species_800))

In [50]:
# Number of distinct species in every neighbourhood.
uniqueArray_100_sizes = [unique_species.size for unique_species in uniqueArray_100]
uniqueArray_800_sizes = [unique_species.size for unique_species in uniqueArray_800]

# Average number of distinct species per neighbourhood size.
avg_species_count_100 = np.mean(uniqueArray_100_sizes)
avg_species_count_800 = np.mean(uniqueArray_800_sizes)
avg_species_count = [avg_species_count_100, avg_species_count_800]

# Record the [k=100, k=800] pair as one row of the summary data.
df_data.append(avg_species_count)

[[77.09224655359208, 403.1786798890079]]

In [53]:
# Mean neighbour distance of every query row, for each neighbourhood size.
distances_100_avg = [np.mean(row) for row in distances_100]
distances_800_avg = [np.mean(row) for row in distances_800]

In [56]:
# Overall average of the per-row mean distances, as a [k=100, k=800] pair,
# recorded as one row of the summary data.
avg_distances = [np.mean(distances_100_avg), np.mean(distances_800_avg)]
df_data.append(avg_distances)

In [70]:
# For every neighbourhood, the occurrence count of each distinct species.
# Index 1 of np.unique(..., return_counts=True) is the counts array.
counts_100 = [
    np.unique(neighbour_species, return_counts=True)[1]
    for neighbour_species in species_100
]

counts_800 = [
    np.unique(neighbour_species, return_counts=True)[1]
    for neighbour_species in species_800
]

In [103]:
# Highest occurrence count of any single species in each neighbourhood,
# followed by its average over all neighbourhoods.
maximums_100 = [np.amax(neighbour_counts) for neighbour_counts in counts_100]
maxmimum_100 = np.mean(maximums_100)

maximums_800 = [np.amax(neighbour_counts) for neighbour_counts in counts_800]
maxmimum_800 = np.mean(maximums_800)

# Record the [k=100, k=800] pair as one row of the summary data.
maximums = [maxmimum_100, maxmimum_800]
df_data.append(maximums)

In [93]:
def count_ones(x):
    """Return how many elements of ``x`` are equal to 1.

    Parameters
    ----------
    x : np.ndarray or iterable of numbers
        Values to inspect, e.g. the occurrence counts of one neighbourhood.

    Returns
    -------
    int
        Number of elements that compare equal to 1 (0 for empty input).
    """
    # Non-array inputs, including one-shot iterators, are materialised first
    # so the comparison below can run vectorised instead of element by element.
    values = x if isinstance(x, np.ndarray) else np.asarray(list(x))
    return int(np.count_nonzero(values == 1))
    

# Per neighbourhood: number of species that occur exactly once.
count_1_observation_100 = [count_ones(neighbour_counts) for neighbour_counts in counts_100]
count_1_observation_800 = [count_ones(neighbour_counts) for neighbour_counts in counts_800]

In [107]:
# Average number of once-only species per neighbourhood, as a [k=100, k=800]
# pair, recorded as one row of the summary data.
avg_count_1 = [np.mean(count_1_observation_100), np.mean(count_1_observation_800)]
df_data.append(avg_count_1)

In [109]:
# Build the summary table directly from the four per-statistic lists rather
# than from the shared `df_data` accumulator. `df_data` grows every time one
# of the appending cells is re-run, after which assigning the four row labels
# fails with a length mismatch. Each list holds [value for k=100, value for k=800].
pd_neighbors_error_rate = pd.DataFrame(
    [avg_species_count, avg_distances, maximums, avg_count_1],
    index=[
        "Durchschnitt Anzahl Arten",
        "Durchschnitt Entfernungen",
        "Durchschnitt Maximales Vorkommen",
        "Durchschnitt Anzahl Arten, die nur einmal vorkommen",
    ],
    columns=["k=100", "k=800"],
)
pd_neighbors_error_rate

Unnamed: 0,k=100,k=800
Durchschnitt Anzahl Arten,77.092247,403.17868
Durchschnitt Entfernungen,0.025359,0.082746
Durchschnitt Maximales Vorkommen,5.735044,21.170098
"Durchschnitt Anzahl Arten, die nur einmal vorkommen",63.220171,235.975959
