In [7]:
# ! pip install -U datasets

In [1]:
from datasets import load_dataset
dataset = load_dataset("wikimedia/wikipedia", "20231101.en")

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/39 [00:00<?, ?it/s]

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6407814
    })
})

In [4]:
train_dataset = dataset["train"]
len(train_dataset)

6407814

In [7]:
import random
num_rows = len(train_dataset)
random_indices = random.sample(range(num_rows), 10000)
random_rows = [train_dataset[idx] for idx in random_indices]

In [9]:
articles = [x["text"] for x in random_rows]

In [11]:
import json
with open("articles", "w") as fp:
    json.dump(articles, fp)

### Load the dataset

In [5]:
import json
with open("articles") as fp:
    articles = json.load(fp)

### Bag of words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [14]:
%%time

X = vectorizer.fit_transform(articles)

CPU times: user 2.02 s, sys: 27.4 ms, total: 2.05 s
Wall time: 2.05 s


In [16]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
X.shape

(10000, 178521)

### TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

In [16]:
%%time
X = vectorizer.fit_transform(articles)

CPU times: user 2.19 s, sys: 24 ms, total: 2.21 s
Wall time: 2.29 s


In [21]:
X.shape

(10000, 178521)

In [22]:
X[0]

<1x178521 sparse matrix of type '<class 'numpy.float64'>'
	with 237 stored elements in Compressed Sparse Row format>

In [24]:
# ! pip install gensim

In [None]:
from gensim.models import Word2Vec
model = Word2Vec(sentences=articles, 

In [None]:
vector_size

In [19]:
=100, window=5, min_count=1, workers=4)

In [20]:
articles[0]

"Sitra ( or , As-Sitra), also known as Sitrah () or Sitra Island (), is an island in Bahrain.\nIt lies  south of the capital, Manama, on Bahrain Island.\n\nHistory\nConflict with Al Khalifa in 1782\nIn 1782, a conflict occurred between locals and a number of Al Khalifa who came from Zubara to buy supplies. The clashes resulted in deaths from both sides.\n\nArab Spring\nDuring the Arab Spring, there was a big protest here. Many protesters were injured or killed. (See Day of Rage (Bahrain)).\n\nGeography\nThe Island is located just east of Bahrain Island in Persian Gulf. It lies south of Manama and Nabih Saleh. The island's western coast forms the boundary of Tubli Bay.\nThe island used to be covered in date palm groves and farms, watered by several freshwater springs. Mangroves used to line the western coast, however they have almost disappeared due to development.\n\nDemography\n\nMost of the inhabitants of the island live in nine historic villages: \nWadyan\nAl Kharijiya\nMarquban\nAl

In [21]:
import numpy as np
def article_to_vector(article, model):
    vector = np.zeros(model.vector_size)
    num_words = 0
    for word in article:
        if word in model.wv:
            vector += model.wv[word]
            num_words += 1
    if num_words != 0:
        vector /= num_words
    return vector

In [22]:
article_vectors = [article_to_vector(article, model) for article in articles]

In [23]:
article_vectors[0]

array([ 0.34021224, -0.21922995, -0.2356602 ,  0.19753146, -0.91467903,
        0.36657454, -0.09976109,  0.23634404, -0.96922715,  0.16623633,
        0.17313602, -0.06611474,  0.15381039,  0.15492317,  0.32039204,
       -0.23883983,  0.36863259,  0.38314734, -0.17077742,  0.17209184,
        0.24701716, -0.21354776,  0.12740255,  0.46317985, -0.038999  ,
       -0.60685119,  0.22168862, -0.26773318, -0.517857  , -0.36590138,
        0.35977748, -0.23360687,  0.29289608,  0.00871249, -0.0891808 ,
        0.19812398, -0.29531354,  0.24406662, -0.09329762,  0.37858076,
        0.4332346 ,  0.62099814,  0.5017787 , -0.19824505,  0.35266646,
        0.16335402, -0.25312676,  0.09579886, -0.74115586, -0.00105779,
        0.26995877,  0.23250769,  0.30332283,  0.41007989, -0.72894871,
       -0.1468927 ,  0.09623729, -0.73183683,  0.01961846,  0.07429802,
       -0.10556534,  0.08230671,  0.19105565, -0.36490342,  0.33447916,
        0.24876654,  0.07114003,  0.39434186, -0.07526083, -0.27

### Sentence Transformers

In [42]:
# ! pip install -U transformers nltk

In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-mpnet-base-v2")


In [6]:
from tqdm import tqdm

result = []
for article in tqdm(articles):
    result.append(model.encode(article))


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [34:11<00:00,  4.87it/s]


In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings

In [8]:
embeddings = HuggingFaceEmbeddings()

In [9]:
text = "This is a test document."

In [10]:
query_result = embeddings.embed_query(text)

In [14]:
query_result

[-0.04895174130797386,
 -0.03986193984746933,
 -0.021562762558460236,
 0.009908498264849186,
 -0.03810398653149605,
 0.012684423476457596,
 0.043494582176208496,
 0.07183392345905304,
 0.009748516604304314,
 -0.006986990571022034,
 0.06352811306715012,
 -0.03032265603542328,
 0.013839470222592354,
 0.02580595389008522,
 -0.0011362829245626926,
 -0.01456361822783947,
 0.041640304028987885,
 0.03622829169034958,
 -0.02680082619190216,
 0.025120679289102554,
 -0.024978652596473694,
 -0.004533246159553528,
 -0.026667216792702675,
 0.004100698512047529,
 -0.05204800143837929,
 -0.009930416941642761,
 -0.052065230906009674,
 0.008992036804556847,
 -0.0383005253970623,
 -0.044058434665203094,
 -0.004204359371215105,
 0.07047969847917557,
 0.0051339310593903065,
 -0.07161542028188705,
 1.6975318430922925e-06,
 -0.00604771263897419,
 -0.011076350696384907,
 0.017513390630483627,
 -0.022299880161881447,
 0.040954988449811935,
 0.03379013016819954,
 0.05665036290884018,
 -0.07114938646554947,
 0.

### Text Embedding inference

In [None]:
model=BAAI/bge-large-en-v1.5
revision=refs/pr/5
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model --revision $revision

In [None]:
curl 127.0.0.1:8080/embed \
    -X POST \
    -d '{"inputs":"What is Deep Learning?"}' \
    -H 'Content-Type: application/json'