In [1]:
from __future__ import annotations
import scipy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim.models.word2vec as w2v

## Embeddings

In [2]:
def build_w2v_model(
        tokens: list[list[str]],
        n_features: int,
        seed: int = 1,
        workers: int = 1,
        sg: int = 0,
        context_size: int = 5,
        down_sampling: float = 1e-3,
        min_word_count: int = 0) -> w2v.Word2Vec:
    """Train a gensim Word2Vec model on a tokenised corpus.

    Every argument is forwarded to ``w2v.Word2Vec`` under gensim's own
    keyword name; nothing else is configured here.

    Args:
        tokens: Corpus as a list of sentences, each a list of string tokens
            (passed as ``sentences``).
        n_features: Dimensionality of the word vectors (``vector_size``).
        seed: Random seed (``seed``).
        workers: Number of worker threads (``workers``). Per the gensim
            docs a fully reproducible run needs a single worker, which is
            the default here.
        sg: Training algorithm selector (``sg``); per the gensim docs
            0 selects CBOW and 1 selects skip-gram.
        context_size: Context window size (``window``).
        down_sampling: Down-sampling threshold for frequent words
            (``sample``). This is a float, not an int.
        min_word_count: Minimum frequency for a word to be kept
            (``min_count``).

    Returns:
        The ``Word2Vec`` instance built from ``tokens``.
    """
    word2vec_options = {
        "sg": sg,
        "seed": seed,
        "workers": workers,
        "vector_size": n_features,
        "min_count": min_word_count,
        "window": context_size,
        "sample": down_sampling,
    }
    return w2v.Word2Vec(sentences=tokens, **word2vec_options)

def w2v_embedding(
        tokens: list[list[str]],
        wv: w2v.KeyedVectors) -> np.ndarray:
    """Embed each sentence as the mean of its word vectors.

    Args:
        tokens: List of sentences, each an iterable of string tokens.
        wv: Word-vector lookup exposing ``get_vector(token)`` and
            ``vector_size`` (e.g. the ``.wv`` attribute of a trained
            gensim ``Word2Vec`` model).

    Returns:
        Array with one row per sentence. A sentence without any tokens is
        represented by an all-zero vector of length ``wv.vector_size``.

    Raises:
        Whatever ``wv.get_vector`` raises for a token it does not know
        (``KeyError`` in gensim); unknown tokens are not filtered here.
    """
    sentence_vectors = []
    for sentence in tokens:
        word_vectors = [wv.get_vector(token) for token in sentence]
        if word_vectors:
            sentence_vectors.append(np.mean(word_vectors, axis=0))
        else:
            # np.mean([]) yields a NaN scalar, which cannot form a row of the
            # result. Use zeros instead; float32 never widens gensim's own
            # float32 vectors when the rows are stacked below.
            sentence_vectors.append(
                np.zeros(wv.vector_size, dtype=np.float32))

    return np.array(sentence_vectors)


## TF-IDF and Cosine Distance

In [3]:
def tfidf_vectorizer(
        corpus: list[str],
        max_features=1000) -> TfidfVectorizer:
    """Create a ``TfidfVectorizer`` and fit it on ``corpus``.

    Args:
        corpus: Documents (strings) to fit on.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns:
        The value returned by ``TfidfVectorizer.fit(corpus)``, i.e. the
        fitted vectorizer.
    """
    return TfidfVectorizer(max_features=max_features).fit(corpus)


def _to_flat_dense(vector) -> np.ndarray:
    """Return ``vector`` as a 1-D dense float array, regardless of layout."""
    # Local import: the notebook only does a bare `import scipy`, which does
    # not guarantee that the `scipy.sparse` submodule is loaded.
    from scipy import sparse

    if sparse.issparse(vector):
        vector = vector.toarray()
    # ravel() flattens (n,), (1, n) and (n, 1) inputs to the same n values.
    return np.asarray(vector, dtype=float).ravel()


def cosine_distance(
        vector1: scipy.sparse.spmatrix | np.ndarray,
        vector2: scipy.sparse.spmatrix | np.ndarray) -> float:
    """Cosine of the angle between two vectors.

    NOTE: despite its name this returns the cosine *similarity* (1.0 for
    vectors pointing the same way, 0.0 for orthogonal ones), not
    ``1 - similarity``. The name is kept so existing callers keep working.

    Each argument may independently be a scipy sparse matrix of any format
    or a dense array, shaped as a row ``(1, n)``, a column ``(n, 1)`` or a
    flat ``(n,)`` vector.

    Args:
        vector1: First vector.
        vector2: Second vector, with the same number of elements.

    Returns:
        The dot product of the two L2-normalised vectors, as a Python
        float. If either vector has zero norm the result is 0.0 rather
        than NaN.
    """
    flat1 = _to_flat_dense(vector1)
    flat2 = _to_flat_dense(vector2)

    norm1 = np.linalg.norm(flat1)
    norm2 = np.linalg.norm(flat2)
    if norm1 == 0 or norm2 == 0:
        # A zero vector has no direction; avoid the division by zero.
        return 0.0

    return float(np.dot(flat1 / norm1, flat2 / norm2))
    

## Testing Functions

In [4]:
!python3.9 -m pytest ./nlp_quora/tests.py

platform darwin -- Python 3.9.16, pytest-7.3.1, pluggy-1.0.0
rootdir: /Users/eyuelmelese/Desktop/master/NLP/NLP-Quora
plugins: anyio-3.6.2
collected 4 items                                                              [0m[1m

nlp_quora/tests.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                  [100%][0m



## Bash Script Docker Build

In [5]:
!bash docker-build.sh

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                                         
[?25h[1A[0G[?25l[+] Building 0.2s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 205B                                       0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9              0.1s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 205B                                       0.0s
[0m[34m => [internal] load .dockerignore                           