# Initialization

In [None]:
# Uncomment on a fresh runtime if the `Levenshtein` module imported below is missing.
# !pip install python-Levenshtein

In [2]:
import json
from os import path

import numpy as np
import pandas as pd
import requests
import Levenshtein

from google.colab import drive

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Root prefix for every folder below; while it is empty, all paths are
# relative to the current working directory.
PROJECT_PATH = ''
SUBMISSION_PATH = f'{PROJECT_PATH}submission/'

# Each artefact kind gets a base folder plus a 'challenge/' sub-folder.
DATA_PATH = f'{PROJECT_PATH}data/'
CHALLENGE_DATA_PATH = f'{DATA_PATH}challenge/'

OUTPUT_PATH = f'{PROJECT_PATH}output/'
CHALLENGE_OUTPUT_PATH = f'{OUTPUT_PATH}challenge/'

MODEL_PATH = f'{PROJECT_PATH}model/'
CHALLENGE_MODEL_PATH = f'{MODEL_PATH}challenge/'

LOG_PATH = f'{PROJECT_PATH}log/'
CHALLENGE_LOG_PATH = f'{LOG_PATH}challenge/'

In [None]:
# Mount Google Drive at /content/drive, skipping the call when that path
# already exists (e.g. the cell is re-run within the same Colab runtime).
if not path.exists('/content/drive'):
  drive.mount('/content/drive')

In [18]:
def pprint(obj, level=-1):
    """Print ``obj`` one item per line, indenting nested lists / NumPy arrays.

    A list or ``np.ndarray`` is rendered as a ``[`` line, each element one
    level deeper, then a closing ``]`` line. Any other object is printed via
    ``str`` with two spaces of indentation per ``level``. A negative level
    produces no indentation, so with the default of ``-1`` the elements of
    the outermost container start at column 0.
    """
    # Leaf case first: anything that is not a list/array is printed as-is.
    if not isinstance(obj, (list, np.ndarray)):
        print('  ' * level + str(obj))
        return

    # Container case: the brackets are themselves printed as leaves.
    pprint('[', level)
    for item in obj:
        pprint(item, level + 1)
    pprint(']', level)

# Loading the train and eval DataFrames

In [14]:
# Read the train and eval CSV files from the challenge data folder.
df_train = pd.read_csv(
    filepath_or_buffer=CHALLENGE_DATA_PATH + 'geonames-generated-df-train-400.csv',
)
df_eval = pd.read_csv(
    filepath_or_buffer=CHALLENGE_DATA_PATH + 'geonames-generated-df-eval-76.csv',
)

In [15]:
# Independent (deep) copies of the frames as loaded, used by the cells below.
train_df = df_train.copy(deep=True)
eval_df = df_eval.copy(deep=True)

# Evaluate Results with Levenshtein and Substring Functions

In [10]:
def find_parent_levenstein(target, parents):
    """Return the entry of ``parents`` closest to ``target`` by Levenshtein distance.

    Ties go to the earliest entry in iteration order (``min`` semantics).
    An empty ``parents`` raises ``ValueError`` from ``min``.
    """
    def edit_distance(candidate):
        # Distance is always measured from the target to the candidate.
        return Levenshtein.distance(target, candidate)

    return min(parents, key=edit_distance)

def find_parent_substring(target, parents):
    """Return the first entry of ``parents`` that contains ``target`` as a substring.

    When no entry contains ``target`` (including when ``parents`` is empty),
    ``target`` itself is returned unchanged.
    """
    containing = (candidate for candidate in parents if target in candidate)
    return next(containing, target)

In [11]:
def _weighted_scores(y_true, y_pred):
    """Score one prediction series: distinct-label count plus weighted metrics."""
    return {
        'len': len(y_pred.unique()),
        'acc': accuracy_score(y_true, y_pred),
        'prc': precision_score(y_true, y_pred, average='weighted'),
        'rcl': recall_score(y_true, y_pred, average='weighted'),
        'f1s': f1_score(y_true, y_pred, average='weighted'),
    }


def evaluate(y_true, y_pred, parents=None):
    """Score raw predictions and two post-processed variants against ``y_true``.

    Three prediction series are scored:

    * ``'org'`` - ``y_pred`` as given;
    * ``'lvn'`` - each prediction replaced via ``find_parent_levenstein``;
    * ``'sub'`` - each prediction replaced via ``find_parent_substring``.

    Args:
        y_true: Gold labels, passed straight to the sklearn metric functions.
        y_pred: pandas Series of predicted labels (``.apply`` and ``.unique``
            are called on it).
        parents: Optional iterable of candidate labels handed to the two
            ``find_parent_*`` helpers. The function used to read an undefined
            free variable ``parents`` and failed with ``NameError`` on a fresh
            kernel. When omitted, a notebook-level ``parents`` variable is
            used if one exists (the old behaviour); otherwise the distinct
            non-null values of ``y_true`` are used, in order of appearance.

    Returns:
        dict with keys ``'org'``, ``'lvn'`` and ``'sub'``; each value is a
        dict with ``'len'`` (number of distinct predicted labels), ``'acc'``,
        ``'prc'``, ``'rcl'`` and ``'f1s'`` (the last three weighted-average).
    """
    if parents is None:
        # Backward compatibility: earlier runs relied on a global `parents`.
        parents = globals().get('parents')
    if parents is None:
        # No candidate set anywhere: fall back to the labels seen in y_true.
        parents = pd.Series(y_true).dropna().unique().tolist()

    variants = {
        'org': y_pred,
        'lvn': y_pred.apply(find_parent_levenstein, args=(parents,)),
        'sub': y_pred.apply(find_parent_substring, args=(parents,)),
    }

    return {tag: _weighted_scores(y_true, preds) for tag, preds in variants.items()}

def std_and_json(df, parent_col, child_col, extra_cols, json_path=None):
    """Standardise a results frame to ``parent`` / ``child`` columns, optionally exporting it.

    The columns in ``extra_cols`` are dropped first and the rename happens
    afterwards, so an existing ``'parent'`` column can be dropped and another
    column (e.g. a prediction column) renamed to take its place.

    Args:
        df: Input DataFrame; it is not modified.
        parent_col: Name of the column to rename to ``'parent'``.
        child_col: Name of the column to rename to ``'child'``.
        extra_cols: Column labels to drop before renaming.
        json_path: Optional destination. When given, the standardised frame
            is written there as JSON with ``orient='records'``.

    Returns:
        The standardised DataFrame. Previously the frame was built and then
        discarded, so the function returned ``None`` and had no effect.

    Raises:
        KeyError: If a label in ``extra_cols`` is missing from ``df``
            (raised by ``DataFrame.drop``).
    """
    std_df = (
        df
        .drop(columns=extra_cols)
        .rename(columns={parent_col: 'parent', child_col: 'child'})
    )

    # Column order is left as produced by drop/rename; missing values are kept.
    if json_path is not None:
        std_df.to_json(json_path, orient='records')

    return std_df

In [16]:
# Keep only the eval rows whose `label` equals True; the original index is retained.
test_df = eval_df.loc[eval_df['label'].eq(True)]

test_df

Unnamed: 0,parent,child,label
0,"stream, lake",abandoned canal,True
1,"stream, lake",sulphur spring(s),True
2,"parks, area",region,True
3,undersea,seachannel,True
4,"parks, area",amusement park,True
...,...,...,...
71,undersea,sill,True
72,"spot, building, farm",ruined bridge,True
73,"spot, building, farm",mall,True
74,"parks, area",continent,True


In [None]:
# Load the saved predictions from the challenge output folder.
df_name = 'llama-results-df.csv'
df = pd.read_csv(CHALLENGE_OUTPUT_PATH + df_name)

# Score the `prediction` column against the parents of the positive eval rows.
evaluate(y_true=test_df['parent'], y_pred=df['prediction'])
# std_and_json(df, 'prediction', 'child', ['parent', 'label'])