# How Does the Optimum Threshold Change with Test Size?
We know that the test dataset is about twice the size of the training dataset.
We also know that a more or less crowded embedding space might affect the value of the optimum threshold used to decide a match. Given that the test dataset is twice the size of the training dataset, I will calculate optimum threshold values for different data sizes to understand the relationship between data size and optimum threshold. From that, we can come up with an estimate of the optimum threshold for the 70k test examples.

Another factor affecting the relationship between test size and threshold may be the number of dimensions of our embeddings. Therefore, I will run these experiments for different embedding sizes too and compare the results.

I ran a couple of experiments beforehand to decide the ranges in which to search for the optimum threshold.

**FOR [RESULTS](#conclusion) PLEASE GO TO THE END OF THE NOTEBOOK**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupKFold
import cupy as cp
from tqdm import tqdm
import plotly.express as px

# Directory that holds the competition CSV files (train.csv is read from here).
data_folder = "../input/shopee-product-matching/"
# Threshold search windows as (low, high) pairs. The experiment loop further
# down walks ``ranges1`` in order: the i-th pair is the window searched when
# folds 0..i are pooled together, so the list has one entry per fold.
# The commented-out ``rangesN`` lists are alternative windows kept for
# reference (presumably from runs with other settings -- TODO confirm).
#ranges0 = [(0.55, 0.65), (0.50, 0.60), (0.45, 0.55), (0.45, 0.55), (0.40, 0.50)]
ranges1 = [(0.50, 0.60), (0.45, 0.55), (0.45, 0.55), (0.50, 0.60), (0.55, 0.65)]
#ranges2 = [(0.50, 0.60), (0.45, 0.55), (0.40, 0.50), (0.40, 0.50), (0.35, 0.45)]
#ranges3 = [(0.45, 0.55), (0.40, 0.50), (0.40, 0.50), (0.30, 0.40), (0.30, 0.40)]
#ranges4 = [(0.40, 0.50), (0.30, 0.40), (0.20, 0.30), (0.20, 0.30), (0.20, 0.30)]
#ranges5 = [(0.20, 0.30), (0.20, 0.30), (0.15, 0.25), (0.10, 0.20), (0.10, 0.20)]

In [None]:
def create_submission_format(df):
    """Build the space-separated match string for every row of ``df``.

    Each row receives the unique ``posting_id`` values of all rows sharing its
    ``label_group``, joined with single spaces.

    Returns a Series aligned with ``df`` holding one string per row.
    """
    ids_per_group = df.groupby("label_group")["posting_id"].unique()
    joined_per_group = {
        group: " ".join(ids) for group, ids in ids_per_group.items()
    }
    return df["label_group"].map(lambda group: joined_per_group[group])

In [None]:
# Load the training posts and attach the ground-truth match string per row.
train = pd.read_csv(data_folder + "train.csv")
train["target"] = create_submission_format(train)

# Assign folds for validation. Splitting is grouped by label_group, so every
# label group lands in exactly one fold; -1 marks "not yet assigned".
train["fold"] = -1
cv_splitter = GroupKFold(n_splits=5)
fold_splits = cv_splitter.split(train, None, train.label_group)
for fold, (train_idx, valid_idx) in enumerate(fold_splits):
    train.loc[valid_idx, "fold"] = fold



In [None]:
# Taken from Gunes Evitan's post here:  https://www.kaggle.com/c/shopee-product-matching/discussion/224782
def matches_to_f1_score(y_true, y_pred, mean=True):
    """Row-wise F1 between two collections of space-separated id strings.

    Parameters
    ----------
    y_true, y_pred : iterable of str
        Each element is a whitespace-separated list of ids; duplicates within
        one element are ignored because every element is turned into a set.
    mean : bool, default True
        If True return the mean F1 over all rows, otherwise the per-row array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Mean F1 (``mean=True``) or one F1 value per row.

    Notes
    -----
    F1 is computed as ``2 * tp / (len(true) + len(pred))``, which is
    algebraically the same as ``2 * p * r / (p + r)`` but does not produce NaN
    when a row has no overlap. Rows with no overlap (or with both sets empty)
    score 0.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]

    tp = np.array(
        [len(t & p) for t, p in zip(true_sets, pred_sets)], dtype=np.float64
    )
    # tp + fp == len(pred) and tp + fn == len(true), so this is 2*tp + fp + fn.
    denom = np.array(
        [len(t) + len(p) for t, p in zip(true_sets, pred_sets)], dtype=np.float64
    )

    # Guarded division: rows where both sets are empty keep the default 0.
    f1 = np.divide(2.0 * tp, denom, out=np.zeros_like(denom), where=denom > 0)

    if mean:
        f1 = f1.mean()

    return f1

def get_best_threshold(method, embeddings, posting_ids, correct_matches, candidates):
    """Score every candidate threshold and return the best one.

    For each value in ``candidates`` this calls
    ``method(embeddings, posting_ids, threshold, create_submission=False)``,
    scores the returned matches against ``correct_matches`` with
    ``matches_to_f1_score`` and prints one progress line.

    Returns
    -------
    tuple
        ``(best_threshold, best_score)``; on ties the earliest candidate wins.
    """
    expected = pd.Series(correct_matches)
    scores = {}

    for threshold in candidates:
        predicted = method(embeddings, posting_ids, threshold, create_submission=False)
        # NOTE: predictions go in the first (y_true) slot; F1 is symmetric in
        # its two arguments, so the score is unaffected.
        f1 = matches_to_f1_score(pd.Series(predicted), expected)
        scores[threshold] = f1
        print(f"Method:{method.__name__},   Threshold:{threshold:.4f},   F1-Score: {f1:.4f}")

    best_threshold, best_score = max(scores.items(), key=lambda item: item[1])

    banner = "*" * 50
    print(banner)
    print(f"Best Threshold:{best_threshold:.4f},  Best F1-Score: {best_score:.4f}")
    print(banner)

    return best_threshold, best_score

# Modified xhulu's Euclidean distance code for cosine distance
def cosine_find_matches_cupy(embeddings, posting_ids, threshold, create_submission=True):
    """Find, for every row, all rows whose cosine distance is below ``threshold``.

    Parameters
    ----------
    embeddings : array-like, 2-D
        One embedding per row; copied to the GPU with ``cp.array``.
    posting_ids : indexable of str
        Ids aligned with the rows of ``embeddings``. It is indexed with an
        integer array, so it is presumably a NumPy array -- TODO confirm.
    threshold : float
        Row ``j`` matches row ``i`` when ``1 - cos(i, j) < threshold``.
    create_submission : bool, default True
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    list of str
        One space-separated string of matching posting ids per row. Rows with
        a zero-norm embedding are matched only with themselves.
    """
    embeddings = cp.array(embeddings)
    norms = cp.linalg.norm(embeddings, axis=1)
    # Detect empty embeddings by their norm rather than by the sum of their
    # components: a non-zero row such as [1, -1] sums to 0 but is a perfectly
    # valid vector, whereas norm == 0 is exactly the set of rows for which the
    # normalisation below divides by zero.
    empty_emb_idx = cp.argwhere(norms == 0).squeeze(-1).get()
    embeddings = embeddings / norms[:, None]

    matches = []
    for i in tqdm(range(embeddings.shape[0])):
        # Rows are unit length, so the dot product is the cosine similarity.
        distances = 1 - cp.dot(embeddings, embeddings[i, :])
        # Comparisons against NaN are False, so zero-norm rows never match here.
        neighbour_ix = cp.argwhere(distances < threshold).squeeze(-1).get()
        matches.append(" ".join(posting_ids[neighbour_ix]))

    # Match zero vector embeddings only with themselves
    for i in empty_emb_idx:
        matches[i] = posting_ids[i]

    return matches

In [None]:
# Create embeddings
dimension = 25000
vectorizer = TfidfVectorizer(stop_words="english", binary=True, max_features=dimension)
titles = pd.read_csv(data_folder + "train.csv").title
text_emb = vectorizer.fit_transform(titles)

# One row per experiment; only the first len(ranges1) rows are filled below.
tracker_columns = ["dimension", "n_label_group", "n_post", "optimum_threshold", "score"]
tracker = pd.DataFrame(np.zeros((10, 5)), columns=tracker_columns)

print("************************ EMBEDDING SIZE: ", dimension, "**************************************************")
for folds_before, (search_from, search_to) in enumerate(ranges1):
    separator = "=" * 50
    print(separator)
    print("All Folds up to Fold:", folds_before)
    print(separator)

    # Pool every fold up to and including the current one, so the evaluated
    # set grows by one fold per iteration.
    in_folds = train.fold <= folds_before
    valid_emb = text_emb[in_folds,].toarray().astype(np.float32)
    valid_df = train.loc[in_folds,]

    n_label_group = valid_df.label_group.nunique()
    n_post = valid_df.shape[0]
    print("Number of Label Groups: ", n_label_group)
    print("Number of Posts: ", n_post)

    threshold_grid = np.arange(search_from, search_to, 0.02)
    best_threshold, best_score = get_best_threshold(
        cosine_find_matches_cupy,
        valid_emb,
        valid_df.posting_id.values,
        valid_df.target.values,
        threshold_grid,
    )
    tracker.iloc[folds_before,] = (dimension, n_label_group, n_post, best_threshold, best_score)

    # Free the dense copy before the next, larger iteration allocates its own.
    del valid_emb
    del valid_df

    

In [None]:
# Show the five tracker rows filled in by the experiment loop.
tracker.iloc[:5]

In [None]:
# Plot the optimum threshold against the number of posts with an OLS
# trendline, one facet per embedding dimension.
TRACKER = tracker.iloc[:5]
fig = px.scatter(
    TRACKER,
    x="n_post",
    y="optimum_threshold",
    facet_col="dimension",
    trendline="ols",
)
fig.show()