In [10]:
import torch
from transformers import AutoModel, AutoTokenizer

In [11]:
import os

# Change the current working directory of the kernel process.
# NOTE(review): this is an absolute, machine-specific Windows path; os.chdir
# raises an OSError if the folder does not exist where the notebook is run.
new_directory = "C:/Users/shaur/Downloads"
os.chdir(new_directory)

# Verify the change by echoing the directory the process is now in.
print("Current working directory:", os.getcwd())


Current working directory: C:\Users\shaur\Downloads


In [12]:
# Checkpoint shared by the model here and the tokenizer loaded afterwards.
model_name = "distilbert-base-uncased"

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained weights, then move the module onto the chosen device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

In [4]:
# Load the tokenizer from the same checkpoint name as the model so that the
# token ids it produces match the vocabulary the model was loaded with.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
# Example sentence containing the two colour words compared further down.
text = "The black cat sat on the couch and the brown dog slept on the rug."
# Tokenize into PyTorch tensors (return_tensors="pt").
# NOTE(review): the name is a misspelling of "tokenized_text"; it is kept
# because the inference cell reads it under this exact name.
tokenzed_text = tokenizer(text, return_tensors="pt")

In [6]:
# Move every tokenizer output tensor onto the same device as the model.
inputs = dict(
    (name, tensor.to(device)) for name, tensor in tokenzed_text.items()
)

# Forward pass only: gradient tracking is switched off for the call.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[-0.1878, -0.0283, -0.1687,  ..., -0.1363,  0.7483,  0.1205],
         [-0.2751,  0.0506, -0.4477,  ...,  0.1321,  1.3671, -0.4097],
         [ 0.0966,  0.1944, -0.0447,  ..., -0.4726,  0.7158, -0.3971],
         ...,
         [ 0.1167,  0.0275, -0.0336,  ..., -0.2981,  0.5688, -0.2564],
         [ 0.6191,  0.1539, -0.6147,  ...,  0.0474, -0.1249, -0.5419],
         [ 0.2106,  0.3435, -0.1342,  ..., -0.1020,  0.3287, -0.4919]]],
       device='cuda:0'), hidden_states=None, attentions=None)


In [7]:
# Pick two token positions out of the final hidden states; the leading batch
# axis is kept, so each result still has a first dimension of size 1.
# NOTE(review): positions 2 and 10 are presumably the tokens "black" and
# "brown" (counting a leading special token as position 0) - confirm with
# tokenizer.convert_ids_to_tokens before relying on these indices.
black = outputs.last_hidden_state[:, 2]
brown = outputs.last_hidden_state[:, 10]

In [8]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    The value is the dot product of ``a`` and ``b`` divided by the product
    of their norms (``numpy.linalg.norm`` with default arguments).

    Args:
        a: First vector (array-like).
        b: Second vector (array-like), same length as ``a``.

    Returns:
        The similarity as a scalar. No guard is applied for zero-length
        vectors, so the division is performed as-is.
    """
    inner_product = dot(a, b)
    magnitude_product = norm(a) * norm(b)
    return inner_product / magnitude_product
# Drop the size-1 batch axis, copy each embedding to host memory as a NumPy
# vector, and compare the two; the value is shown as the cell's output.
cos_sim(
    black.squeeze(0).cpu().numpy(),
    brown.squeeze(0).cpu().numpy(),
)

0.90681314

## Multi-head attention: one attention head computed step by step in NumPy

In [85]:
from scipy.special import softmax
import numpy as np

In [86]:
# Step 1: the toy input - three vectors, each with d_model = 4 components.
print("Step 1: Input : 3 inputs, d_model=4")
x = np.array(
    [
        [1.0, 0.0, 1.0, 0.0],  # Input 1
        [0.0, 2.0, 0.0, 2.0],  # Input 2
        [1.0, 1.0, 1.0, 1.0],  # Input 3
    ]
)
print(x)

Step 1: Input : 3 inputs, d_model=4
[[1. 0. 1. 0.]
 [0. 2. 0. 2.]
 [1. 1. 1. 1.]]


In [87]:
# Step 2: hand-picked projection weights. Each matrix has d_model = 4 rows
# and 3 columns, so it maps a 4-component input to a 3-component vector.
print("Step 2: weights 3 dimensions x d_model=4")
print("w_query")
w_query = np.array(
    [
        [1, 0, 1],
        [1, 0, 0],
        [0, 0, 1],
        [0, 1, 1],
    ]
)
print(w_query)

Step 2: weights 3 dimensions x d_model=4
w_query
[[1 0 1]
 [1 0 0]
 [0 0 1]
 [0 1 1]]


In [88]:
# Key projection weights: 4 rows (d_model) by 3 columns.
print("w_key")
w_key = np.array(
    [
        [0, 0, 1],
        [1, 1, 0],
        [0, 1, 0],
        [1, 1, 0],
    ]
)
print(w_key)

w_key
[[0 0 1]
 [1 1 0]
 [0 1 0]
 [1 1 0]]


In [89]:
# Value projection weights: 4 rows (d_model) by 3 columns.
print("w_value")
w_value = np.array(
    [
        [0, 2, 0],
        [0, 3, 0],
        [1, 0, 3],
        [1, 1, 0],
    ]
)
print(w_value)

w_value
[[0 2 0]
 [0 3 0]
 [1 0 3]
 [1 1 0]]


In [90]:
# Step 3: project the inputs with each weight matrix, starting with queries.
print("Step 3: Matrix multiplication to obtain Q,K,V")
print("Query: x * w_query")
# Row i of Q is the query vector of input i.
Q = x @ w_query
print(Q)

Step 3: Matrix multiplication to obtain Q,K,V
Query: x * w_query
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]


In [91]:
# Row i of K is the key vector of input i.
print("Key: x * w_key")
K = x @ w_key
print(K)

Key: x * w_key
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]


In [92]:
# Row i of V is the value vector of input i.
print("Value: x * w_value")
V = x @ w_value
print(V)

Value: x * w_value
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]


In [93]:
# Step 4: raw attention scores - every query dotted with every key, divided
# by the scaling term.
print("Step 4: Scaled Attention Scores")
# Stand-in for the square root of the key dimension (3): it is rounded down
# to 1 for this example, so the division leaves the dot products unchanged.
k_d = 1
# Entry [i, j] is the score of query i against key j.
attention_scores = np.matmul(Q, K.T) / k_d
print(attention_scores)

Step 4: Scaled Attention Scores
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]


In [94]:
# Step 5: turn each row of scores into weights that sum to 1.
print("Step 5: Scaled softmax attention_scores for each vector")
# The rows are overwritten in place, so attention_scores holds softmax
# weights from here on. Re-running this cell without first re-running step 4
# would apply softmax to already-normalised rows.
attention_scores[:] = [softmax(score_row) for score_row in attention_scores]
# One row per line, in input order.
print(*attention_scores, sep="\n")

Step 5: Scaled softmax attention_scores for each vector
[0.06337894 0.46831053 0.46831053]
[6.03366485e-06 9.82007865e-01 1.79861014e-02]
[2.95387223e-04 8.80536902e-01 1.19167711e-01]


In [100]:
# Step 6: weight each value vector by the softmax scores of input 1.
# attention_scores[0] is the first row of the score matrix, i.e. the weights
# of input 1's query against the keys of inputs 1, 2 and 3. Each product
# below scales one value vector by one of those weights; the three results
# are only printed here, not summed.
print("Step 6: attention value obtained by score1/k_d * V")
print(V[0])
print(V[1])
print(V[2])

print("Attention 1")
# The earlier reshape(-1, 1) assignment to attention1 was overwritten on the
# very next line without being read, so it has been dropped.
attention1 = attention_scores[0][0] * V[0]
print(attention1)

print("Attention 2")
attention2 = attention_scores[0][1] * V[1]
print(attention2)

print("Attention 3")
attention3 = attention_scores[0][2] * V[2]
print(attention3)

Step 6: attention value obtained by score1/k_d * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1
[0.06337894 0.12675788 0.19013681]
Attention 2
[0.93662106 3.74648425 0.        ]
Attention 3
[0.93662106 2.80986319 1.40493159]


In [102]:
# Inspection only: view input 1's softmax weights as a column (one weight per
# row). Nothing is assigned; the array is just shown as the cell's output.
attention_scores[0][:, np.newaxis]

array([[0.06337894],
       [0.46831053],
       [0.46831053]])

In [104]:
# Inspection only: the first weight of input 1 times the first value vector,
# the same product that step 6 stores in attention1.
attention_scores[0, 0] * V[0]

array([0.06337894, 0.12675788, 0.19013681])

In [105]:
from transformers import pipeline
# Build an English-to-French translation pipeline.
# The checkpoint and revision are named explicitly instead of relying on the
# library default: they are the ones the library reported choosing when no
# model was supplied, so the translation model is unchanged while the run no
# longer depends on whatever the default happens to be.
translator = pipeline("translation_en_to_fr", model="t5-base", revision="686f1db")
# Translating is a single call; max_length caps the generated output length.
print(translator("It is easy to translate languages with transformers",
max_length=40))

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading model.safetensors: 100%|██████████| 892M/892M [01:20<00:00, 11.1MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<?, ?B/s] 
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 3.44MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 4.06MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is Tru

[{'translation_text': "Il est facile de traduire des langues à l'aide de transformateurs"}]
