In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes so edits to project code are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Generate n-grams 
Find a reasonable number of n-grams by running recursive feature elimination over the n-grams generated with `TfidfVectorizer`.

In [None]:
import math
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from tqdm.auto import tqdm

from src.data.filesystem import fopen

In [None]:
# Which name part this run is for; interpolated into every path below.
given_surname = "given"

# TfidfVectorizer settings: minimum document frequency and n-gram size range.
min_df = 2000
ngram_range = (1, 3)

# Fraction of the generated name pairs that is kept for fitting.
sample_frac = 0.05

# Input data (training pairs, triplets) and output location of the fitted vectorizer.
train_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-train-v2.csv.gz"
triplets_path = f"s3://familysearch-names/processed/tree-hr-{given_surname}-triplets.csv.gz"
tfidf_path = f"s3://nama-data/data/models/fs-{given_surname}-tfidf-v2.joblib"

## Load data

In [None]:
# Read the training data. keep_default_na=False stops pandas from converting
# strings such as "NA" or "null" into NaN.
df = pd.read_csv(filepath_or_buffer=train_path, keep_default_na=False)
print(df.shape)
df.head(n=3)

In [None]:
# Every distinct name that appears in either name column of the training data.
all_names = set(df['tree_name']).union(df['record_name'])
print(len(all_names))
# peek at one arbitrary element of the set
next(iter(all_names))

In [None]:
# Load the triplets table; the training pairs are built from it further down.
# NOTE(review): unlike the training-data load, this read uses pandas' default NA
# parsing - confirm no name in the file collides with an NA marker.
triplets_df = pd.read_csv(filepath_or_buffer=triplets_path)
print(triplets_df.shape[0])
triplets_df.head(n=3)

## Generate n-grams

In [None]:
# Ignore n-grams that occur in more than half of the names.
max_df = 0.5
# Character n-grams restricted to word boundaries ("char_wb"), fitted on every known name.
tfidf_vectorizer = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=ngram_range,
    min_df=min_df,
    max_df=max_df,
)
tfidf_vectorizer.fit(all_names)
vocab = tfidf_vectorizer.vocabulary_
# number of n-grams that survived the document-frequency limits
len(vocab)

## Generate training data

In [None]:
# Build (anchor, name, score) training pairs from the triplets.
# A fixed seed makes the sampled pairs - and every score computed from them - repeatable.
random_seed = 42

# positives: one row per distinct (anchor, positive, positive_score)
pos_df = (
    triplets_df[['anchor', 'positive', 'positive_score']]
    .drop_duplicates()
    .rename(columns={'positive': 'name', 'positive_score': 'score'})
)
print('pos', len(pos_df))

# negatives: one row per distinct (anchor, negative, negative_score)
neg_df = triplets_df[['anchor', 'negative', 'negative_score']].drop_duplicates()
print('raw neg', len(neg_df))
# drop the really-easy negatives
neg_df = neg_df[neg_df['negative_score'] > 0.03]
print('not too easy neg', len(neg_df))
# sample the remaining negatives so we have 50% more negatives than positives;
# cap the request at what is available, because sampling without replacement
# raises when asked for more rows than exist
n_neg = min(len(neg_df), int(len(pos_df) * 1.5))
neg_df = neg_df.sample(n=n_neg, random_state=random_seed)
print('sampled neg', len(neg_df))
neg_df = neg_df.rename(columns={'negative': 'name', 'negative_score': 'score'})

pairs_df = pd.concat([pos_df, neg_df], ignore_index=True)
# shuffle the pairs and keep only sample_frac of them
pairs_df = pairs_df.sample(frac=sample_frac, random_state=random_seed).reset_index(drop=True)
print('total', len(pairs_df))

In [None]:
# Vectorize both sides of every pair with two batched transform() calls instead of
# one call (and one dense np.matrix) per pair. Each row's tf-idf vector depends
# only on that row's text, so the values are the same as the per-pair version.
# The first and last character of every stored name are dropped before vectorizing
# (presumably begin/end markers - confirm against the triplets file).
anchors = [anchor[1:-1] for anchor in pairs_df['anchor']]
names = [name[1:-1] for name in pairs_df['name']]

# A and B stay lists of 1-D row vectors, which is what the stacking cell consumes.
A = list(tfidf_vectorizer.transform(anchors).toarray())
B = list(tfidf_vectorizer.transform(names).toarray())
# regression target: the score of each pair
y = list(pairs_df['score'])

In [None]:
# Turn the per-pair vectors into 2-D arrays with one row per pair.
A, B = np.stack(A), np.stack(B)
print(A.shape, B.shape, sep="\n")

In [None]:
# Per-feature share of each pair's cosine similarity:
#   X[i, j] = A[i, j] * B[i, j] / sqrt(sum(A[i]^2) * sum(B[i]^2))
norm = np.sqrt(np.sum(A * A, axis=1) * np.sum(B * B, axis=1))
# A name with no in-vocabulary n-grams has an all-zero vector, which would make
# this 0/0 = NaN. Floor the norm (same 0.00001 floor as the single-pair
# prediction cell) so such rows become all-zero features instead.
norm = np.maximum(norm, 0.00001)
X = (A * B) / norm[:, np.newaxis]
X.shape

## Test linear regression on all ngrams

In [None]:
# Ordinary least squares on the pair features built from all n-grams.
clf = LinearRegression()
clf = clf.fit(X, y)

In [None]:
# Intercept of the fitted linear model.
clf.intercept_

In [None]:
# Fitted coefficients, one per column of X.
clf.coef_

In [None]:
# R^2 of the model on X, y - the same data it was fit on, not a held-out score.
clf.score(X, y)

### Predict a single pair

In [None]:
# Show the first sampled pair.
pairs_df.head(1)

In [None]:
# Score one hand-picked pair with the fitted model.
anchor, name = 'aage', 'age'
Xs = tfidf_vectorizer.transform([anchor, name]).todense()
anchor_X, name_X = Xs[0], Xs[1]
# product of the two vector lengths, floored so the division below is always defined
norm = math.sqrt(np.square(anchor_X).sum() * np.square(name_X).sum())
norm = max(0.00001, norm)
x = np.multiply(anchor_X, name_X) / norm
clf.predict(np.asarray(x))

### Cross validation

In [None]:
# 5-fold cross-validated R^2 of a fresh linear model on the current X.
clf = LinearRegression()
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring='r2')
scores

In [None]:
# The cross-validation scores are R^2 values (scoring='r2'), not accuracies,
# so the summary is labelled accordingly.
print(f"mean R^2 {scores.mean():.2f} with a standard deviation of {scores.std():.2f}")

#### Given names
min doc freq 1000, (1,3) => mean -1e20, std dev 2e20

min doc freq 1200, (1,3) => mean -5e19, std dev 1e20

min doc freq 1500, (1,3) => mean -2e18, std dev 1e19

min doc freq 2000, (1,3) => mean 0.25, std dev 0.01

min doc freq 3000, (1,3) => mean 0.24, std dev 0.01

min doc freq 1200, (1,2) => mean 1e-20, std dev 1e20

min doc freq 1500, (1,2) => mean 0.20, std dev 0.01

min doc freq 2000, (1,2) => mean 0.21, std dev 0.01

min doc freq 3000, (1,2) => mean 0.21, std dev 0.01

## Recursive Feature Elimination

In [None]:
# Elimination state, indexed by column of the original (unfiltered) feature space.
n_features = A.shape[1]
support = [True for _ in range(n_features)]  # is the feature still selected?
ranking = [1 for _ in range(n_features)]     # stays 1 for features that are never eliminated
all_scores = []                              # (nf, fold score) tuples appended by the elimination loop

In [None]:
# Eliminate features in rounds so the pair features can be re-normalized after
# each removal: every round rebuilds X from the surviving columns of A and B,
# then lets RFECV drop features subject to a floor of nf.
min_features = 1
step_size = -10
for nf in tqdm(range(n_features + step_size, min_features, step_size)):
    # filter A and B to have only selected features
    A_filtered = A[:, support]
    B_filtered = B[:, support]
    # compute X = (A * B) / sqrt(sum(A^2)*sum(B^2))
    norm = np.sqrt(np.sum(A_filtered * A_filtered, axis=1)
                   * np.sum(B_filtered * B_filtered, axis=1))
    # once features are removed, a name may have no surviving n-grams; floor the
    # norm so those rows become zeros instead of 0/0 = NaN, which the regression
    # cannot be fit on
    norm = np.maximum(norm, 0.00001)
    X = np.multiply(A_filtered, B_filtered) / norm[..., np.newaxis]

    # let RFECV decide which features to drop, keeping at least nf of them
    clf = LinearRegression()
    selector = RFECV(clf, min_features_to_select=nf, scoring='r2', cv=5)
    selector = selector.fit(X, y)

    # which features were removed, expressed in the original feature space?
    active_ix = np.flatnonzero(support)         # original index of each column of X
    removed_ix = active_ix[~selector.support_]  # columns RFECV did not keep
    # stop early? (RFECV kept every remaining feature)
    if len(removed_ix) == 0:
        break
    for support_ix in removed_ix:
        # rank = number of features that were still present when this one was dropped
        ranking[support_ix] = A_filtered.shape[1]
        support[support_ix] = False

    # calculate CV scores on the features RFECV kept
    clf = LinearRegression()
    scores = cross_val_score(clf, X[:, selector.support_], y, scoring='r2', cv=5)
    print(f"mean={scores.mean()}, std dev={scores.std()}")
    for score in scores:
        all_scores.append((nf, score))

In [None]:
# Scatter each fold's R^2 against the feature floor (nf) of the round that produced it.
if all_scores:
    fig, ax = plt.subplots()
    ax.scatter(*zip(*all_scores))
    ax.set_xlabel("min_features_to_select (nf)")
    ax.set_ylabel("cross-validated R^2 (per fold)")
    ax.set_title("Recursive feature elimination")
    plt.show()
else:
    # the elimination loop stopped before recording any score; scatter() would
    # be called without its required x and y arguments
    print("no elimination scores recorded - nothing to plot")

### Cross-validate results

In [None]:
# Rebuild the pair features from only the n-grams that survived elimination.
A_filtered = A[:, support]
B_filtered = B[:, support]
# compute X = (A * B) / sqrt(sum(A^2)*sum(B^2))
norm = np.sqrt(np.sum(A_filtered * A_filtered, axis=1)
               * np.sum(B_filtered * B_filtered, axis=1))
# a name with no surviving n-grams has a zero vector; floor the norm so its row
# becomes zeros instead of 0/0 = NaN
norm = np.maximum(norm, 0.00001)
X = np.multiply(A_filtered, B_filtered) / norm[..., np.newaxis]

In [None]:
# 5-fold cross-validated R^2 of a fresh linear model on the current X.
clf = LinearRegression()
scores = cross_val_score(estimator=clf, X=X, y=y, cv=5, scoring='r2')
scores

In [None]:
# The cross-validation scores are R^2 values (scoring='r2'), not accuracies,
# so the summary is labelled accordingly.
print(f"mean R^2 {scores.mean():.2f} with a standard deviation of {scores.std():.2f}")

## Review results

In [None]:
# Fit one model on all n-gram features and one on the features kept by the
# elimination, so their predictions can be compared.
# Both norms are floored at 0.00001: a name with an all-zero vector would
# otherwise give 0/0 = NaN rows, which LinearRegression cannot be fit on.
norm = np.sqrt(np.sum(A * A, axis=1) * np.sum(B * B, axis=1))
norm = np.maximum(norm, 0.00001)
X = np.multiply(A, B) / norm[..., np.newaxis]

norm_filtered = np.sqrt(np.sum(A_filtered * A_filtered, axis=1)
                        * np.sum(B_filtered * B_filtered, axis=1))
norm_filtered = np.maximum(norm_filtered, 0.00001)
X_filtered = np.multiply(A_filtered, B_filtered) / norm_filtered[..., np.newaxis]

clf = LinearRegression().fit(X, y)
clf_filtered = LinearRegression().fit(X_filtered, y)

In [None]:
# Print, for the first pairs, the target score next to the prediction of the
# full model and of the feature-filtered model.
for ix, (anchor, name, score) in enumerate(zip(pairs_df['anchor'], pairs_df['name'], pairs_df['score'])):
    if ix > 100:
        break
    # drop the first and last character of each stored name before vectorizing
    anchor = anchor[1:-1]
    name = name[1:-1]
    Xs = tfidf_vectorizer.transform([anchor, name]).todense()
    anchor_X = Xs[0]
    name_X = Xs[1]
    # floor the norm (as in the single-pair prediction cell) so an all-zero
    # vector yields zero features rather than NaN, which predict() rejects
    norm = max(0.00001, math.sqrt(np.square(anchor_X).sum() * np.square(name_X).sum()))
    x = np.multiply(anchor_X, name_X) / norm
    predict = clf.predict(np.asarray(x))[0]

    # same computation restricted to the features kept by the elimination
    Xs_filtered = Xs[:, support]
    anchor_X_filtered = Xs_filtered[0]
    name_X_filtered = Xs_filtered[1]
    norm_filtered = max(
        0.00001,
        math.sqrt(np.square(anchor_X_filtered).sum() * np.square(name_X_filtered).sum()),
    )
    x_filtered = np.multiply(anchor_X_filtered, name_X_filtered) / norm_filtered
    predict_filtered = clf_filtered.predict(np.asarray(x_filtered))[0]

    print(f"{anchor} {name} {score} {predict} {predict_filtered}")

## Save TfidfVectorizer
Don't filter the n-grams before saving — there's no benefit.

In [None]:
# Persist the fitted vectorizer. The handle returned by fopen is closed
# explicitly, even if dump() raises, so the written bytes are not left in an
# unclosed file object.
tfidf_file = fopen(tfidf_path, mode='wb')
try:
    joblib.dump(tfidf_vectorizer, tfidf_file)
finally:
    tfidf_file.close()