# Token and Embeddings Fundamentals

In [None]:
# NOTE(review): `from random import random, randint` below rebinds the name
# `random` to the function random.random, so after this cell the module itself
# is no longer reachable as `random` (e.g. `random.seed(...)` would raise
# AttributeError). The import order therefore matters; do not reorder blindly.
import random
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import warnings
from random import random, randint
from math import floor, log
import networkx as nx
import numpy as np
import matplotlib as mtplt
from matplotlib import pyplot as plt

# Silence every warning category from this point on. The filter is installed
# after the imports above, so warnings emitted while importing are not affected.
warnings.filterwarnings('ignore')

Pretrained Sentence Transformers models: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

In [None]:
# Identifier of the pretrained checkpoint passed to SentenceTransformer.
MODEL_NAME = "all-MiniLM-L6-v2"

model = SentenceTransformer(MODEL_NAME)
model  # bare last expression -> rendered as the cell output

## Tokenization

* Text tokenization is a fundamental process in natural language processing (NLP) and information retrieval. 
* The primary goal of tokenization is to represent text in a manner that's meaningful for machines without losing its context.
* It involves breaking down a given text into smaller, meaningful units called tokens. 
* These tokens can be individual words, phrases, or even sentences, depending on the level of granularity required for analysis or processing.
* By converting text into tokens, algorithms can more easily identify patterns, which is crucial for tasks such as sentiment analysis, machine translation, text classification, and keyword extraction.

In [None]:
# Tokenize a single example sentence; the tokenizer is given a one-element list.
example_sentences = ["The future belongs to those who prepare for it today"]
tokenized_data = model.tokenize(example_sentences)
tokenized_data

In [None]:
# Take the id sequence of the first (and only) tokenized sentence and map the
# ids back to their token strings.
first_sentence_ids = tokenized_data["input_ids"][0]
model.tokenizer.convert_ids_to_tokens(first_sentence_ids)

## K-Nearest Neighbors (k-NN)

* The main purpose of the k-nearest neighbors (k-NN) algorithm is to classify or predict the value of a data point based on the data points that are closest to it in the feature space. 
* In other words, it uses the similarity between data points to make decisions.

In [8]:
RANDOM_SEED = 42  # Fixed seed so "Restart Kernel -> Run All" reproduces the same points
vec_num = 30  # Number of vectors (nodes)
dim = 2  # Dimension of each vector. All graph plots assume dim == 2; comment the plots out if this changes.
m_nearest_neighbor = 2  # M nearest neighbors used in the construction of the Navigable Small World (NSW)

# A dedicated, seeded Generator keeps the sampled positions identical on every
# run without altering NumPy's global random state used elsewhere.
rng = np.random.default_rng(RANDOM_SEED)
vec_pos = rng.uniform(size=(vec_num, dim))  # vec_num rows of dim coordinates drawn from [0, 1)

In [None]:
# Query point used for the nearest-neighbor search.
query_vec = [0.5, 0.5]

# One labelled node "Q" that carries the query coordinates as its "pos" attribute.
nodes = [("Q", {"pos": query_vec})]
print("nodes = ", nodes, flush=True)

# Graph holding only the query node, so it can be drawn as its own layer.
G_query = nx.Graph()
G_query.add_nodes_from(nodes)

# Mapping node -> "pos" attribute, the form the drawing call takes.
pos_query = nx.get_node_attributes(G_query, name='pos')

def nearest_neigbor(vec_pos, query_vec):
    """Brute-force nearest-neighbor search, returned as graphs for plotting.

    Parameters
    ----------
    vec_pos : array-like of shape (n, d)
        Candidate points, one per row. Must contain at least one row.
    query_vec : array-like of shape (d,)
        Query point.

    Returns
    -------
    G_lin : networkx.Graph
        Nodes ``0..n-1``, each with a ``"pos"`` attribute (row ``i`` of
        ``vec_pos``), linked as a ring ``i -- i+1`` and ``n-1 -- 0``.
    G_best : networkx.Graph
        A single node ``"*"`` whose ``"pos"`` is the row of ``vec_pos`` with the
        smallest Euclidean distance to ``query_vec`` (first row wins on ties).

    Raises
    ------
    ValueError
        If ``vec_pos`` is not a non-empty 2-D array, or if no row has a finite
        distance to the query (e.g. all coordinates are NaN/inf).
    """
    points = np.asarray(vec_pos)
    if points.ndim != 2 or points.shape[0] == 0:
        raise ValueError("vec_pos must be a non-empty 2-D array of shape (n, d)")
    query = np.asarray(query_vec)
    n_points = points.shape[0]

    # Every candidate becomes a node; consecutive indices are chained into a ring.
    nodes = [(i, {"pos": points[i, :]}) for i in range(n_points)]
    edges = [(i, (i + 1) % n_points) for i in range(n_points)]

    # Distances of all rows to the query in one vectorized call.
    dists = np.linalg.norm(query - points, axis=1)

    # Rows with NaN/inf distance can never be the nearest neighbor. Refuse to
    # answer when nothing is comparable instead of silently falling back to
    # index -1 (i.e. the last row), which a sentinel-based loop would do.
    finite = np.isfinite(dists)
    if not finite.any():
        raise ValueError("no row of vec_pos has a finite distance to query_vec")
    # argmin returns the first minimum, matching a strict `<` scan from row 0.
    nearest_neighbor_index = int(np.argmin(np.where(finite, dists, np.inf)))

    G_lin = nx.Graph()
    G_lin.add_nodes_from(nodes)
    G_lin.add_edges_from(edges)

    # Separate one-node graph so the best match can be drawn as its own layer.
    G_best = nx.Graph()
    G_best.add_nodes_from([("*", {"pos": points[nearest_neighbor_index, :]})])
    return G_lin, G_best

In [None]:
# Brute-force search: G_lin holds every candidate point, G_best the closest one.
(G_lin, G_best) = nearest_neigbor(vec_pos, query_vec)

pos_lin = nx.get_node_attributes(G_lin, 'pos')
pos_best = nx.get_node_attributes(G_best, 'pos')

fig, axs = plt.subplots()

# Layers are drawn in order on the same axes: candidates first, then the
# query "Q", then the best match "*". G_lin's edges get zero line width.
nx.draw(G_lin, pos_lin, with_labels=True, node_size=150, node_color=[[0.8, 0.8, 1]], width=0.0, font_size=7, ax=axs)
nx.draw(G_query, pos_query, with_labels=True, node_size=200, node_color=[[0.5, 0, 0]], font_color='white', width=0.5, font_size=7, font_weight='bold', ax=axs)
nx.draw(G_best, pos_best, with_labels=True, node_size=200, node_color=[[0.85, 0.7, 0.2]], width=0.5, font_size=7, font_weight='bold', ax=axs)

# Title explains the two special markers so the figure can be read on its own.
axs.set_title("Brute-force nearest neighbor: query (Q) and closest point (*)")