In [None]:
%load_ext autoreload
%autoreload 2

# Analyze Triplets

- plot the positive scores, negative scores, and margins (positive score minus negative score)

- review common non-negatives that aren't represented in anchor-pos pairs
- review anchor-pos pairs that aren't represented in common non-negatives

In [None]:
import re

import pandas as pd

from src.data.filesystem import fopen

In [None]:
# Name type under analysis ("given" or "surname"); it selects the thresholds and files below.
given_surname = "surname"
# Not referenced by any cell visible in this notebook section.
sample_frac = 1.0
# Sizes of the "common" and "semi-common" name lists; given names use smaller cut-offs.
num_common_names, num_semi_common_names = (
    (1000, 1500) if given_surname == "given" else (2500, 4000)
)

# Input files, all keyed by the name type chosen above.
pref_path = f"s3://familysearch-names/processed/tree-preferred-{given_surname}-aggr.csv.gz"
triplets_path = f"../data/processed/tree-hr-{given_surname}-triplets-v2-1000.csv.gz"
common_non_negatives_path = f"../data/processed/common_{given_surname}_non_negatives.csv"

In [None]:
def stringify_pair(name1, name2):
    """Build the string key for a pair of names: ``"<name1>:<name2>"``.

    The order of the arguments is preserved, so swapping them gives a different key.
    """
    pair_key = f"{name1}:{name2}"
    return pair_key

## Load data

### Triplets

In [None]:
# Load the triplets. keep_default_na=False matches the other CSV loads in this notebook:
# without it pandas converts name strings such as "nan" or "null" into NaN, so those names
# can no longer be found by equality lookups on the anchor/positive columns.
# NOTE(review): this also stops blank cells from being parsed as NaN; assumes the score
# columns never contain blanks -- confirm against the triplets file.
triplets_df = pd.read_csv(triplets_path, keep_default_na=False)
print(len(triplets_df))
triplets_df.head(3)

In [None]:
# Every distinct (anchor, positive) combination in the triplets, stored as an
# "anchor:positive" string key.
anchor_pos_pairs = {
    stringify_pair(anchor_name, pos_name)
    for anchor_name, pos_name in zip(triplets_df['anchor'], triplets_df['positive'])
}
len(anchor_pos_pairs)

In [None]:
# Triplet rows in which 'zsuzsanna' appears as either the anchor or the positive.
is_anchor = triplets_df['anchor'] == 'zsuzsanna'
is_positive = triplets_df['positive'] == 'zsuzsanna'
triplets_df[is_anchor | is_positive]

In [None]:
# Triplet rows in which `name` appears as either the anchor or the positive.
name = 'quass'
name_mask = triplets_df['anchor'].eq(name) | triplets_df['positive'].eq(name)
triplets_df[name_mask]

### Common names

In [None]:
# keep_default_na=False keeps every name as a string instead of letting pandas parse some as NaN.
pref_df = pd.read_csv(pref_path, keep_default_na=False)
# From the first num_common_names rows, keep names that are longer than one character and
# consist only of the letters a-z.
# NOTE(review): presumably the file is ordered most-frequent first -- confirm.
common_names = {
    name
    for name in pref_df['name'].iloc[:num_common_names]
    if len(name) > 1 and re.fullmatch(r'[a-z]+', name)
}
len(common_names)

In [None]:
# Peek at one element of the set as a quick sanity check of its contents.
next(iter(common_names))

In [None]:
# "zsuzsanna" is the Hungarian form of "Susanna"; show its row in the preferred-names table.
pref_df[pref_df['name'] == 'zsuzsanna']

In [None]:
# Same filter as for common_names (more than one character, letters a-z only), applied to
# the larger slice of the first num_semi_common_names rows.
semi_common_names = {
    name
    for name in pref_df['name'].iloc[:num_semi_common_names]
    if len(name) > 1 and re.fullmatch(r'[a-z]+', name)
}
len(semi_common_names)

### Common non-negative pairs

In [None]:
common_non_negatives_df = pd.read_csv(common_non_negatives_path, keep_default_na=False)
# One (name1, name2) tuple per CSV row; the unpacking requires exactly two columns.
common_non_negatives = {
    (first_name, second_name)
    for first_name, second_name in common_non_negatives_df.values.tolist()
}
len(common_non_negatives)

## Analyze positive and negative score distributions

In [None]:
# Checkpoint: keep an independent copy of the triplets before later cells add columns to it.
save_df = triplets_df.copy()

In [None]:
# Restore triplets_df from the checkpoint so the cells below can be re-run from a clean frame.
triplets_df = save_df.copy()

In [None]:
# Rows where the negative scores strictly higher than the positive.
triplets_df[triplets_df['negative_score'] > triplets_df['positive_score']]

In [None]:
# Margin = positive score minus negative score; added to triplets_df as a new column.
triplets_df['margin'] = triplets_df['positive_score'].sub(triplets_df['negative_score'])

In [None]:
# Distribution of positive scores, in 20 bins.
triplets_df['positive_score'].hist(bins=20)

In [None]:
# Distribution of negative scores, in 20 bins.
triplets_df['negative_score'].hist(bins=20)

In [None]:
# Distribution of margins (positive score minus negative score), in 20 bins.
triplets_df['margin'].hist(bins=20)

In [None]:
# First 20 rows whose margin is strictly greater than 0.2.
triplets_df.loc[triplets_df['margin'].gt(0.2)].head(20)

## Review common anchor-pos pairs

In [None]:
# Print up to max_cnt anchor-positive pairs whose two names are both common names,
# then display how many such pairs exist in total.
max_cnt = 500
common_pairs = (
    (anchor, pos)
    for anchor, pos in (pair_key.split(':') for pair_key in anchor_pos_pairs)
    if anchor in common_names and pos in common_names
)
cnt = 0  # stays 0 when nothing matches
for cnt, (anchor, pos) in enumerate(common_pairs, start=1):
    if cnt <= max_cnt:
        print(anchor, pos)
cnt

## Review semi-common non-negatives that aren't represented in anchor-pos pairs

**TODO:** Ask someone to review these pairs and remove the ones that are not true non-negatives (i.e., the non-matches),
and then find a way to add the remaining matches when we augment the triplets in notebook 207.

In [None]:
# Print up to max_cnt non-negative pairs whose names are both semi-common and which do not
# occur, in either order, among the anchor-positive pairs; then display the total count.
max_cnt = 10000
cnt = 0
for name1, name2 in common_non_negatives:
    # Skip pairs with a name outside the semi-common list.
    if name1 not in semi_common_names or name2 not in semi_common_names:
        continue
    # Skip pairs already covered by an anchor-positive pair in either direction.
    if stringify_pair(name1, name2) in anchor_pos_pairs or \
            stringify_pair(name2, name1) in anchor_pos_pairs:
        continue
    cnt += 1
    if cnt <= max_cnt:
        print(f"{name1} {name2}")
cnt

## Review anchor-pos pairs that aren't represented in common non-negatives

In [None]:
# Print up to max_cnt anchor-positive pairs whose names are both common but which are missing
# from common_non_negatives, then display how many such pairs exist in total.
#
# The same (anchor, positive) pair may appear on several triplet rows, so de-duplicate first;
# otherwise a repeated pair is printed and counted once per row. dict.fromkeys keeps the
# first-seen order, so the output still follows the order of the triplets file.
# NOTE(review): only the (anchor, pos) direction is looked up in common_non_negatives --
# confirm whether the reversed pair should count as represented too.
max_cnt = 1000
unique_anchor_pos = dict.fromkeys(zip(triplets_df['anchor'], triplets_df['positive']))
cnt = 0
for anchor, pos in unique_anchor_pos:
    if anchor in common_names and pos in common_names and (anchor, pos) not in common_non_negatives:
        if cnt < max_cnt:
            print(anchor, pos)
        cnt += 1
cnt

## Review strange model errors

In [None]:
def find_pairs(name1, name2):
    """Find anchor-positive pairs whose two names start with the given prefixes.

    A pair matches when the anchor starts with ``name1`` and the positive starts with
    ``name2``, or the other way round. Pairs are read from the module-level
    ``anchor_pos_pairs`` collection of ``"anchor:positive"`` strings.

    Args:
        name1: prefix for one name of the pair.
        name2: prefix for the other name of the pair.

    Returns:
        The matching ``"anchor:positive"`` strings as a sorted list. Sorting makes the
        result reproducible: ``anchor_pos_pairs`` is a set, and its iteration order for
        strings can change from one kernel session to the next.

    Raises:
        ValueError: if a pair string does not split into exactly two parts on ``':'``.
    """
    result = []
    for pair_key in anchor_pos_pairs:
        anchor, pos = pair_key.split(':')
        forward = anchor.startswith(name1) and pos.startswith(name2)
        backward = anchor.startswith(name2) and pos.startswith(name1)
        if forward or backward:
            result.append(pair_key)
    return sorted(result)

In [None]:
# marie + annie: pairs where one name starts with 'mar' and the other with 'ann'
find_pairs('mar', 'ann')

In [None]:
# charles + frances: pairs where one name starts with 'charl' and the other with 'franc'
find_pairs('charl', 'franc')