In [1]:
from __future__ import annotations
import scipy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.models.word2vec as w2v

## Embeddings

In [2]:
def build_w2v_model(
        tokens:list[list[str]],
        n_features:int,
        seed:int = 1,
        workers = 1,
        sg:int = 0,
        context_size:int = 5,
        down_sampling:int = 1e-3,
        min_word_count:int = 0) -> w2v:

    return w2v.Word2Vec(
        sentences=tokens,
        sg=sg,
        seed=seed,
        workers = workers,
        vector_size = n_features,
        min_count = min_word_count,
        window = context_size,
        sample = down_sampling
    )

def w2v_embedding(
        tokens: list[list[str]], 
        wv: w2v.wv) -> np.ndarray:
    
    sentence_vectors = []
    for sentence in tokens:
        word_vectors = []
        for token in sentence:
            word_vectors.append(wv.get_vector(token))
        sentence_vectors.append(list(np.mean(word_vectors, axis=0)))

    return np.array(sentence_vectors)

def test_build_w2v_model():
    # Given
    doc = [['this', 'is', 'a', 'test'], ['this', 'is', 'another', 'test']]
    n_features = 10
    n_epochs = 10

    # When
    model = build_w2v_model(doc, n_features, n_epochs)

    # Then
    assert isinstance(model, w2v.Word2Vec)
    assert model.vector_size == n_features

def test_w2v_embedding():
    # Given
    doc = [['this', 'is', 'a', 'test'], ['this', 'is', 'another', 'test']]
    n_features = 10
    n_epochs = 10
    model = build_w2v_model(doc, n_features, n_epochs)

    # When
    embedding = w2v_embedding(doc, model)

    # Then
    assert isinstance(embedding, np.ndarray)
    assert embedding.shape == (2, n_features)

test_build_w2v_model()
test_w2v_embedding()

AttributeError: 'Word2Vec' object has no attribute 'get_vector'

## TF-IDF and Cosine Distance

In [3]:
def tfidf_vectorizer(
        corpus: list[str],
        max_features=1000) -> TfidfVectorizer:
    vectorizer = TfidfVectorizer(
        max_features=max_features)
    return vectorizer.fit(corpus)


def cosine_distance(
        vector1: scipy.sparse.csr.csr_matrix | np.ndarray, 
        vector2: scipy.sparse.csr.csr_matrix | np.ndarray) -> float:
    if isinstance(vector1, scipy.sparse.csr.csr_matrix):
        return np.dot(
            vector1.T.toarray()[0]/np.linalg.norm(vector1.toarray()),
            vector2.T.toarray()[0]/np.linalg.norm(vector2.toarray()))
    else:
        return np.dot(
            vector1/np.linalg.norm(vector1),
            vector2/np.linalg.norm(vector2))
    
def test_tfidf_vectorizer():
    # Given
    doc = ["This is a test", "This is another test"]

    # When
    vectorizer = tfidf_vectorizer(doc)

    # Then
    assert isinstance(vectorizer, TfidfVectorizer)
    assert (vectorizer.get_feature_names_out() == ["another", "is", "test", "this"]).all()

def test_cosine_distance():
    a = np.array([1, 0, 0])
    b = np.array([0, 1, 0])
    assert cosine_distance(a, b) == 0.0

test_tfidf_vectorizer()
test_cosine_distance()

  if isinstance(vector1, scipy.sparse.csr.csr_matrix):


## Bash Script Docker Build

In [4]:
!bash docker-build.sh

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                                         
[?25h[1A[0G[?25l[+] Building 0.2s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 205B                                       0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9              0.1s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 205B                                       0.0s
[0m[34m => [internal] load .dockerignore                           