In [28]:
%%capture
!pip install semchunk tiktoken sentence_transformers==2.7.0 elasticsearch

In [36]:
import time, json
import requests
from tqdm.auto import tqdm
import semchunk
import tiktoken
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

In [18]:
# Read the whole story into memory. The encoding is stated explicitly so the
# result does not depend on the platform's default text encoding.
with open('data/The_Adventure_of_the_Speckled_Band.txt', 'r', encoding='utf-8') as file:
    content = file.read()

# Number of characters read (shown as the cell output).
len(content)

52991

In [22]:
# Chunk size handed to semchunk together with the tiktoken encoding for
# 'gpt-4o' (presumably the size is measured in tokens of that encoding —
# confirm against the semchunk documentation).
chunk_size = 300
chunker = semchunk.chunkerify(tiktoken.encoding_for_model('gpt-4o'), chunk_size)

In [23]:
# Run the chunker over the story text; `chunk` holds the resulting collection
# of pieces, and its length is shown as the cell output.
chunk = chunker(content)
len(chunk)

51

In [31]:
# Load the sentence-transformers model, by name, that produces the embeddings
# used in the rest of the notebook.
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)



In [32]:
# Encode a sample question. `v` is reused later both to size the index's
# vector field (len(v)) and as the query vector for the kNN search.
# NOTE(review): this question is about joining a course, while the indexed
# text is a Sherlock Holmes story — confirm the query is intentional.
user_question = "I just discovered the course. Can I still join it?"
v = embedding_model.encode(user_question)

0.07822265

In [33]:
# Pair each chunk's text with its embedding; tqdm reports progress while the
# chunks are encoded one at a time.
docs = [
    {'text': piece, 'vector': embedding_model.encode(piece)}
    for piece in tqdm(chunk)
]
len(docs)

51

```shell
# Start a throwaway (--rm) single-node Elasticsearch 8.9.0 container with
# xpack security disabled, published on port 9200 and limited to 2g of memory.
docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    --memory="2g" \
    --security-opt seccomp=unconfined \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.9.0
```

In [37]:
url_elasticsearch = 'http://localhost:9200'

def create_elasticsearch_client(poll_interval=10, max_wait=None, request_timeout=5):
    """Wait until Elasticsearch answers over HTTP, then return a client for it.

    The server at ``url_elasticsearch`` is probed with plain HTTP GET requests
    until one of them receives a response. A client is then created, the
    server info is printed as indented JSON, and the client is returned.

    Args:
        poll_interval: Seconds to sleep after a failed probe before retrying.
        max_wait: Upper bound, in seconds, on the total time spent probing.
            ``None`` (the default) keeps retrying indefinitely.
        request_timeout: Timeout in seconds for each probe request, so that a
            server which accepts the connection but never answers cannot
            block the loop forever.

    Returns:
        An ``Elasticsearch`` client connected to ``url_elasticsearch``.

    Raises:
        TimeoutError: If ``max_wait`` is set and the server has not answered
            within that time.
    """
    deadline = None if max_wait is None else time.monotonic() + max_wait
    while True:
        try:
            # Only reachability matters here; the response itself is not used.
            requests.get(url_elasticsearch, timeout=request_timeout)
        except (requests.ConnectionError, requests.Timeout) as exc:
            if deadline is not None and time.monotonic() >= deadline:
                raise TimeoutError(
                    f"Elasticsearch at {url_elasticsearch} did not respond "
                    f"within {max_wait} seconds"
                ) from exc
            time.sleep(poll_interval)
        else:
            break
    client = Elasticsearch(url_elasticsearch)
    print(json.dumps(client.info().raw, indent=4))
    return client

In [None]:
# Blocks until the Elasticsearch server is reachable, then keeps the client
# for the indexing and search cells below.
es_client = create_elasticsearch_client()

In [None]:
# Name of the index that stores the story chunks.
index_name = "story_chunks"

# Index definition: one shard, no replicas, a full-text field for the chunk
# text and an indexed dense-vector field sized to the embedding `v`, compared
# by cosine similarity.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "vector": {
                "type": "dense_vector",
                "dims": len(v),
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}

# Drop any existing index of the same name (no error if it is absent), then
# create it afresh so the cell can be re-run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# Send every chunk document (text + embedding) to the index, one request each.
for doc in tqdm(docs):
    es_client.index(index=index_name, document=doc)

# Newly indexed documents only become searchable after a refresh, which by
# default happens periodically. Force one now so a search issued right after
# this cell (e.g. under "Run All") already sees every chunk. The trailing
# semicolon keeps the cell output empty.
es_client.indices.refresh(index=index_name);

In [None]:
def elastic_search_knn(field, vector, k=5, num_candidates=10000):
    """Run a kNN search against ``index_name`` and return the matching sources.

    Args:
        field: Name of the vector field to search.
        vector: Query vector to compare against ``field``.
        k: Value sent as the ``k`` of the kNN clause (defaults to 5).
        num_candidates: Value sent as ``num_candidates`` of the kNN clause
            (defaults to 10000).

    Returns:
        A list with the ``_source`` of every hit, in the order returned by
        Elasticsearch. Only the ``text`` field is requested in ``_source``.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        "_source": ["text"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [None]:
# Search the 'vector' field with the embedded sample question `v` and display
# the sources of the returned hits.
results = elastic_search_knn('vector', v)
results

```shell
# Start a detached, throwaway (--rm) Ollama container with the named volume
# "ollama" mounted at /root/.ollama and port 11434 published.
docker run --rm -d \
    -v ollama:/root/.ollama \
    -p 11434:11434 \
    --security-opt seccomp=unconfined \
    --name ollama \
    ollama/ollama # runs "ollama serve" inside the container

docker exec -it ollama ollama pull phi3 # pull has no interactive prompt; the model only runs on the first request
```