In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import safe_sparse_dot
from tqdm import tqdm

from src.data.ancestry import load_train_test
from src.metrics import metrics
from src.models import utils

In [None]:
# Print floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
# Register tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()

### Load and process data

In [None]:
# Only need to run this once
# It will split the data into train/test and persist the split on disk
# dataset.load_split_init(test_size=0.1)

In [None]:
# Load the train/test splits that were previously persisted on disk.
# The paths are plain strings (no interpolation), so no f-prefix is needed.
train, test = load_train_test(
    "../data/raw/records25k_data_train.csv",
    "../data/raw/records25k_data_test.csv",
)

# Each split is a 3-tuple, unpacked here by position.
input_names_train, weighted_actual_names_train, candidate_names_train = train
input_names_test, weighted_actual_names_test, candidate_names_test = test

# Full candidate pool: train candidates first, then test candidates.
candidate_names_all = np.concatenate((candidate_names_train, candidate_names_test))

### Model

In [None]:
# Character-level TF-IDF features:
#   analyzer="char_wb"  - character n-grams taken only from text inside word boundaries
#   ngram_range=(1, 3)  - uni-, bi- and tri-grams
#   min_df=10           - drop n-grams that occur in fewer than 10 fitted documents
#   max_df=0.5          - drop n-grams that occur in more than 50% of fitted documents
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer="char_wb", min_df=10, max_df=0.5)

In [None]:
# Learn the n-gram vocabulary and IDF weights from the training candidates only,
# then project the test candidates into that same feature space.
X_train = vectorizer.fit_transform(candidate_names_train)
X_test = vectorizer.transform(candidate_names_test)
# Stack train rows above test rows. candidate_names_all is concatenated in the
# same order, so row i of X_all is the vector for candidate_names_all[i].
X_all = vstack((X_train, X_test))

#### Similarity Function

In [None]:
def get_similars(name, k=10, demo_mode=False):
    """Return the ``k`` candidate names that score highest against ``name``.

    The score of a candidate is the dot product between the TF-IDF vector of
    ``name`` and that candidate's row in ``X_all``.

    Args:
        name: Name to look up.
        k: Maximum number of candidates to return.
        demo_mode: When True, ``name`` is passed through ``utils.add_padding``
            before it is vectorized, and every returned candidate is passed
            through ``utils.remove_padding``.

    Returns:
        A list of ``(candidate_name, score)`` pairs in descending score order.
    """
    query = utils.add_padding(name) if demo_mode else name
    query_vector = vectorizer.transform([query]).toarray()

    # One score per row of X_all (i.e. per entry of candidate_names_all).
    similarity = safe_sparse_dot(X_all, query_vector.T).flatten()

    # argsort is ascending, so reverse it before keeping the first k indices.
    top_idx = np.argsort(similarity)[::-1][:k]
    top_names = candidate_names_all[top_idx]
    top_scores = similarity[top_idx]

    if demo_mode:
        top_names = [utils.remove_padding(top_name) for top_name in top_names]

    return list(zip(top_names, top_scores))

#### Demo

In [None]:
get_similars("schumacher", 10, True)

### Generate candidates for all test names

In [None]:
k = 100  # Number of candidates to consider
# For every test input name: its top-k (candidate_name, score) pairs.
similar_names_scores_test = [get_similars(name, k=k) for name in tqdm(input_names_test)]
# 2D array of candidate names only: axis 0 = input names, axis 1 = its k candidates.
# The names are pulled out of the pairs directly; building one array from the mixed
# (str, float) pairs and slicing [:, :, 0] would first coerce every score to a string.
similar_names_test = np.array([[candidate for candidate, _ in row] for row in similar_names_scores_test])

In [None]:
len(similar_names_scores_test)

#### Ugh - how can I create a 3D array with (str, float) as the third axis without taking apart and re-assembling the array?

In [None]:
# Split the (name, score) pairs into two parallel 2D arrays so that each keeps
# its own dtype: axis 0 = input names, axis 1 = that input's k similar names.
names = np.array([[cell[0] for cell in row] for row in similar_names_scores_test], dtype="O")
scores = np.array([[cell[1] for cell in row] for row in similar_names_scores_test], dtype="f8")
# similar_names_scores_test is now a 3D array:
#   axis 0 = input names, axis 1 = k similar names, axis 2 = (name, score)
# NOTE(review): np.array(similar_names_scores_test, dtype="O") would likely build the
# same (n, k, 2) array in one step without the split/re-stack - confirm before switching.
similar_names_scores_test = np.dstack((names, scores))

### Evaluation

### Average precision @0.65

In [None]:
metrics.avg_precision_at_threshold(weighted_actual_names_test, similar_names_scores_test, 0.65)

### Average recall @0.65

In [None]:
metrics.avg_weighted_recall_at_threshold(weighted_actual_names_test, similar_names_scores_test, 0.65)

### Precision-Recall Curve

In [None]:
# minimum score threshold to test
min_threshold = 0.5
metrics.precision_weighted_recall_curve_at_threshold(
    weighted_actual_names_test, similar_names_scores_test, min_threshold
)

### Remove weights for mean average precision evaluations

In [None]:
# Keep only the first field of every (name, _, _) triple so that each test
# input maps to a plain list of its actual names.
actual_names_test = [
    [actual_name for actual_name, _, _ in weighted_names]
    for weighted_names in weighted_actual_names_test
]

### mAP@1

In [None]:
metrics.mean_avg_precision_k(actual_names_test, similar_names_test, 1)

### mAP@3

In [None]:
metrics.mean_avg_precision_k(actual_names_test, similar_names_test, 3)

### Precision-Recall Curve at k

In [None]:
# Number of rank cutoffs to test, i.e. precision_{i}, recall_{i} for i in (1, ..., N)
N = 100
metrics.precision_recall_curve_at_k(actual_names_test, similar_names_test, N)

### Test

In [None]:
input_names_test[251]

In [None]:
weighted_actual_names_test[251]

In [None]:
pd.DataFrame(similar_names_scores_test[251, 0:10], columns=["name", "score"])

In [None]:
metrics.weighted_recall_at_threshold(weighted_actual_names_test[251], similar_names_scores_test[251], 0.9)

In [None]:
metrics.weighted_recall_at_threshold(weighted_actual_names_test[251], similar_names_scores_test[251], 0.5)