In [None]:
from __future__ import annotations

import gensim.models.word2vec as w2v
import numpy as np
import scipy
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer

## Embeddings

- `build_w2v_model()` takes several parameters:
    - `tokens`: A list of lists of strings, where each inner list represents the tokens in a single document.
    - `n_features`: An integer that specifies the dimensionality of the word vectors.
    - `seed`: An optional integer that sets the random seed for the Word2Vec model.
    - `workers`: An optional integer that specifies the number of worker threads to use when training the model.
    - `sg`: An optional integer that specifies the training algorithm: 0 for CBOW, 1 for skip-gram.
    - `context_size`: An optional integer that specifies the size of the context window.
    - `down_sampling`: An optional float that specifies the threshold for downsampling high-frequency words.
    - `min_word_count`: An optional integer that specifies the minimum frequency of a word to be included in the vocabulary.
- The function returns a trained Word2Vec model.
- The model is trained using the `Word2Vec` class from the `gensim` library.
- The `sentences` parameter of the `Word2Vec` class is set to the `tokens` parameter of the `build_w2v_model` function.
- The `sg`, `seed`, `workers`, `vector_size`, `min_count`, `window`, and `sample` parameters of the `Word2Vec` class are set to the corresponding parameters of the `build_w2v_model` function.
- The trained Word2Vec model can be used to generate word embeddings for words in the vocabulary.

- `w2v_embedding()` takes two parameters: `tokens`, a list of lists of strings, and `wv`, a word vector object.
    - It initializes an empty list called `sentence_vectors`.
    - It loops through each list of strings in `tokens`.
        - For each string in the list, it retrieves the corresponding word vector from `wv`.
        - It adds the word vectors to a list called `word_vectors`.
        - It calculates the mean of the `word_vectors` list along the first axis (i.e., the mean of all the word vectors in the list).
        - It appends the resulting sentence vector to the `sentence_vectors` list.
- Finally, it returns a numpy array containing all the sentence vectors.

The purpose of the `w2v_embedding` function is to convert a list of lists of words into a matrix of sentence vectors, where each sentence vector represents the average of the word vectors for the words in that sentence.

In [2]:
def build_w2v_model(
        tokens: list[list[str]],
        n_features: int,
        seed: int = 1,
        workers: int = 1,
        sg: int = 0,
        context_size: int = 5,
        down_sampling: float = 1e-3,
        min_word_count: int = 0) -> w2v.Word2Vec:
    """Train a gensim Word2Vec model on a tokenized corpus.

    Parameters
    ----------
    tokens
        One list of string tokens per document; passed as ``sentences``.
    n_features
        Dimensionality of the word vectors; passed as ``vector_size``.
    seed
        Random seed handed to ``Word2Vec``.
    workers
        Number of worker threads handed to ``Word2Vec``.
    sg
        Training algorithm: 0 for CBOW, 1 for skip-gram.
    context_size
        Size of the context window; passed as ``window``.
    down_sampling
        Threshold for down-sampling high-frequency words; passed as
        ``sample``. This is a float (default ``1e-3``).
    min_word_count
        Minimum frequency for a word to enter the vocabulary; passed as
        ``min_count``.

    Returns
    -------
    w2v.Word2Vec
        The model object produced by the ``Word2Vec`` constructor for
        ``tokens``.

    Notes
    -----
    Per the gensim documentation, a fixed ``seed`` alone only gives fully
    reproducible training when a single worker thread is used.
    """
    return w2v.Word2Vec(
        sentences=tokens,
        sg=sg,
        seed=seed,
        workers=workers,
        vector_size=n_features,
        min_count=min_word_count,
        window=context_size,
        sample=down_sampling,
    )

def w2v_embedding(
        tokens: list[list[str]],
        wv: w2v.KeyedVectors) -> np.ndarray:
    """Embed each tokenized sentence as the mean of its word vectors.

    Parameters
    ----------
    tokens
        One list of string tokens per sentence.
    wv
        Word-vector lookup providing ``get_vector(token)``, ``vector_size``
        and ``vectors`` (e.g. the ``.wv`` attribute of a trained model).

    Returns
    -------
    np.ndarray
        One row per sentence, each row being the element-wise mean of that
        sentence's word vectors. A sentence with no tokens yields an
        all-zero row.

    Raises
    ------
    KeyError
        Propagated from ``wv.get_vector`` when a token has no vector.
    """
    sentence_vectors = []
    for sentence in tokens:
        if not sentence:
            # The mean of zero vectors is a scalar NaN rather than a vector,
            # which cannot be stacked with the other rows; use a zero vector
            # of the right width and dtype instead.
            sentence_vectors.append(
                np.zeros(wv.vector_size, dtype=wv.vectors.dtype))
            continue
        word_vectors = [wv.get_vector(token) for token in sentence]
        sentence_vectors.append(np.mean(word_vectors, axis=0))

    return np.array(sentence_vectors)


## TF-IDF and Cosine Distance

- `tfidf_vectorizer()`:
    - This function takes in a list of strings (`corpus`) and a maximum number of features to include in the vectorizer (`max_features`).
    - It returns a `TfidfVectorizer` object that has been fitted to the input corpus.
    - Calling `.transform()` on the fitted vectorizer converts strings into a sparse matrix of TF-IDF features, where each row represents a document and each column represents one of the retained vocabulary terms.
    - The TF-IDF values represent how important each word is to each document in the corpus.
- `cosine_distance()`:
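    - This function takes in two vectors (`vector1` and `vector2`) and calculates the cosine similarity between them. Despite the name, the result is a similarity (1 means the vectors point in the same direction), not a distance.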
    - If the input vectors are sparse matrices, they are first converted to dense arrays.
    - The function calculates the dot product of the two vectors after normalizing them to unit vectors using L2 normalization.
    - The resulting value represents the cosine of the angle between the two vectors, which is a measure of their similarity.

In [3]:
def tfidf_vectorizer(
        corpus: list[str],
        max_features=1000) -> TfidfVectorizer:
    """Fit a TF-IDF vectorizer on ``corpus`` and return it.

    Parameters
    ----------
    corpus
        The documents, one string each, used to fit the vectorizer.
    max_features
        Forwarded to ``TfidfVectorizer(max_features=...)`` to cap the number
        of features.

    Returns
    -------
    TfidfVectorizer
        Whatever ``TfidfVectorizer(...).fit(corpus)`` returns, i.e. the
        fitted vectorizer.
    """
    fitted = TfidfVectorizer(max_features=max_features).fit(corpus)
    return fitted


def cosine_distance(
        vector1: scipy.sparse.spmatrix | np.ndarray,
        vector2: scipy.sparse.spmatrix | np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Despite the function's name, the value returned is the cosine of the
    angle between the vectors (1.0 for identical direction, 0.0 for
    orthogonal vectors), not ``1 - cosine``.

    Parameters
    ----------
    vector1, vector2
        Dense arrays or scipy sparse matrices holding one vector each.
        Row vectors (``1 x n``), column vectors (``n x 1``) and flat arrays
        are all accepted, and the two arguments may be of different kinds.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1|| * ||v2||)``, or ``0.0`` when either vector
        has zero norm (the ratio would otherwise be NaN).

    Raises
    ------
    ValueError
        Propagated from ``np.dot`` when the flattened vectors differ in
        length.
    """
    flattened = []
    for vector in (vector1, vector2):
        # Each argument is converted on its own, so a dense/sparse mix and
        # any sparse format (not only CSR) are handled.
        if scipy.sparse.issparse(vector):
            vector = vector.toarray()
        # ravel() keeps every component for both 1 x n rows and n x 1
        # columns; indexing [0] after a transpose would drop all but the
        # first component of a row vector.
        flattened.append(np.asarray(vector, dtype=float).ravel())
    first, second = flattened

    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    if norm_product == 0:
        return 0.0
    return float(np.dot(first, second) / norm_product)
    

# Testing Functions

In [4]:
!python3.9 -m pytest tests.py

platform darwin -- Python 3.9.16, pytest-7.3.1, pluggy-1.0.0
rootdir: /Users/eyuelmelese/Desktop/master/NLP/NLP-Quora
plugins: anyio-3.6.2
collected 4 items

nlp_quora/tests.py ....                                                  [100%]



## Bash Script Docker Build

In [5]:
!bash docker-build.sh

[1A[1B[0G[?25l[+] Building 0.0s (0/1)                                                         
[?25h[1A[0G[?25l[+] Building 0.2s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 205B                                       0.0s
[0m[34m => [internal] load .dockerignore                                          0.0s
[0m[34m => => transferring context: 2B                                            0.0s
[0m => [internal] load metadata for docker.io/library/python:3.9              0.1s
[?25h[1A[1A[1A[1A[1A[1A[0G[?25l[+] Building 0.3s (2/3)                                                         
[34m => [internal] load build definition from Dockerfile                       0.0s
[0m[34m => => transferring dockerfile: 205B                                       0.0s
[0m[34m => [internal] load .dockerignore                           