### Tokenizer

In [1]:
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")
import pandas as pd

In [2]:
tokenizer=AutoTokenizer.from_pretrained('bert-base-cased')

In [3]:
sentense= "In Natural Language Processing (NLP), tokenization is a fundamental step where text is divided into smaller components, known as tokens."

In [4]:
tokenizer(sentense)

{'input_ids': [101, 1130, 6240, 6828, 18821, 1158, 113, 21239, 2101, 114, 117, 22559, 2734, 1110, 170, 8148, 2585, 1187, 3087, 1110, 3233, 1154, 2964, 5644, 117, 1227, 1112, 22559, 1116, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

input_ids are the most direct representation of the input text in numerical form. During tokenization, each piece of text (e.g., a word or subword) is mapped to a unique integer based on a predefined vocabulary.

attention_mask is a binary mask (sequence of 1s and 0s) that specifies which tokens should be attended to and which should not. 

token_type_ids (also known as segment_ids) are used to distinguish between different parts of the input, especially in tasks that involve multiple input sequences (like question-answering or text-pair classification tasks).

In [5]:
token=tokenizer.tokenize(sentense)

In [18]:
len(token)

29

In [7]:
ids=tokenizer.convert_tokens_to_ids(token)

In [8]:
#ids

In [9]:
decode_string=tokenizer.decode(ids)

In [10]:
decode_string

'In Natural Language Processing ( NLP ), tokenization is a fundamental step where text is divided into smaller components, known as tokens.'

In [11]:
vocab=tokenizer.vocab

## vocab

In [12]:
vocab_df=pd.DataFrame({"token":vocab.keys(),"token_id":vocab.values()})
vocab_df=vocab_df.sort_values(by='token_id').set_index("token_id")

In [14]:
#vocab_df

In [15]:
token_id=tokenizer.encode(sentense)

In [17]:
#token_id

In [19]:
len(token_id)

31

#### Model

https://huggingface.co/google-bert/bert-base-cased

In [22]:
from transformers import BertModel,AutoTokenizer

In [23]:
model_name="bert-base-cased"

In [25]:
model=BertModel.from_pretrained(model_name)
tokenizer=AutoTokenizer.from_pretrained(model_name)

In [26]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

### Number of encoder

In [28]:
len(model.encoder.layer)

12

In [30]:
model.encoder.layer[0]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

### Use Case 1: similarities 

In [32]:
sent1="Deep learning is a principle within the realm of Artificial Intelligence"
sent2="Enhancing one's abilities is solely achieved through learning"

##### # Tokenizing the sentences

In [33]:
tokens1 = tokenizer.tokenize(sent1)
tokens2 = tokenizer.tokenize(sent2)

In [69]:
tokens1

['Deep',
 'learning',
 'is',
 'a',
 'principle',
 'within',
 'the',
 'realm',
 'of',
 'Art',
 '##ific',
 '##ial',
 'Intelligence']

In [44]:
encoded_inputs = tokenizer(sent1, return_tensors="pt")

#### Defining a function to encode the input text and get model predictions

In [40]:
def predict(text):
    encoded_inputs = tokenizer(text, return_tensors="pt")
    return model(**encoded_inputs)[0]

In [41]:
# Getting model predictions for the sentences
out1 = predict(sent1)
out2 = predict(sent2)

##### Extracting embeddings for the word 'learning' in both sentences

In [70]:
emb1 = out1[0:, tokens1.index("learning"), :].detach()[0]
emb2 = out2[0:, tokens2.index("learning"), :].detach()[0]

In [71]:
from scipy.spatial.distance import cosine

In [72]:
cosine(emb1,emb2)

0.525614857673645

#### Use Case 2: semantic similarity between two text string

In [73]:
from transformers import BertModel, BertTokenizer

In [74]:
tokenizer=BertTokenizer.from_pretrained(model_name)
model=BertModel.from_pretrained(model_name)

In [75]:
### encode text

In [76]:
def bert_encode(sent):
    inputs_ids=tokenizer(sent,return_tensors='pt',padding=True,truncation=True)
    outputs=model(**inputs_ids)
    return outputs.last_hidden_state.mean(1)

In [77]:
sent1="Deep learning is a principle within the realm of Artificial Intelligence"
sent2="Enhancing one's abilities is solely achieved through learning"

In [79]:
vec1 = bert_encode(sent1)
vec2 = bert_encode(sent2)

In [90]:
vec11 = vec1.detach().numpy()
vec22=vec2.detach().numpy()

In [87]:
from sklearn.metrics.pairwise import cosine_similarity

In [91]:
cosine_similarity(vec11,vec22)

array([[0.8455614]], dtype=float32)