In [1]:
%%capture
!pip install chromadb ollama
!pip install sentence-transformers

In [2]:
from chromadb import Documents, EmbeddingFunction, Embeddings, PersistentClient
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

class MyEmbeddingFunction(EmbeddingFunction):
    def __call__(self, input: Documents) -> Embeddings:
        batch_embeddings = embedding_model.encode(input)
        return batch_embeddings.tolist()

embed_fn = MyEmbeddingFunction()

client = PersistentClient(path="./chromadb")

collection = client.get_or_create_collection(
    name=f"openml-translations"
)

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
import json, uuid
with open('data/data.jsonl') as f:
    for line in f:
        pair=json.loads(line)
        inp=pair['input'] 
        output=pair['output']
        embedding = embedding_model.encode(inp)

        # upsert to chromadb
        collection.upsert(
            ids=[str(uuid.uuid1())],
            metadatas=[dict(translation=output)],
            documents=[inp],
            embeddings=embedding.tolist(),
        )




In [5]:
 collection.query(
    query_texts=["What is the height of the objects in the dataset?"],
    n_results=6,
)

{'ids': [['dc68fa84-0f6d-11ef-b06b-b0dcefa5c64d',
   'd63743f6-0ed6-11ef-9795-b0dcefa5c64d',
   'd664404a-0ed6-11ef-9795-b0dcefa5c64d',
   'd6c9b362-0ed6-11ef-9795-b0dcefa5c64d',
   'd6b2d28c-0ed6-11ef-9795-b0dcefa5c64d',
   'd708c9a8-0ed6-11ef-9795-b0dcefa5c64d']],
 'distances': [[0.1591210514307022,
   0.15912123024463654,
   0.35951027274131775,
   0.35951027274131775,
   0.35951027274131775,
   0.35951027274131775]],
 'metadatas': [[{'translation': 'load dataset, calculate height using data'},
   {'translation': 'load dataset, calculate height using data'},
   {'translation': 'load dataset, calculate height using data and volume columns'},
   {'translation': 'load dataset, calculate height using data and volume columns'},
   {'translation': 'load dataset, calculate height using data and volume columns'},
   {'translation': 'load dataset, calculate height using data and volume columns'}]],
 'embeddings': None,
 'documents': [['What is the height of the objects in the loaded dataset?

In [7]:
def question(q):
    return """
    [INST] 
    <<SYS>> 
    You are used by a software package as an API. So you should only respond with a direct answer.
    
    You should translate English sentences into sentences of a reduced English vocabulary, such that these result sentences can serve as input for an AutoML system.
The reduced vocabulary consists of the following tokens: load, dataset, calculate, volume, cluster, clustering, regression, data, count, clusters, a, and, perform, using, id, surface, height, mass, density, columns.

The sentences are queries or instructions regarding a dataset consisting of four columns: id (integer), surface (in square meters, float), height (in meters, float), mass (in kilograms, float). The 'id', 'surface', 'height' and 'mass' tokens of the reduced vocabulary correspond to these column names.
These columns describe the id, surface, height and mass of each object. The volume of the object is not a column but can be calculated as the product of the surface and height. Density of the object can be calculated by dividing the mass by the volume of the object.

You shouldn't do any calculations, only translate sentences to the reduced vocabulary while considering the relations between the colums of the dataset, as detailed out in the previous 2 sentences.
From the input sentence you should deduce the type of machine learning algorithm. It should be either regression, clustering or none at all. 

As an example, a sentence such as `Given the objects dataset, how many types of objects can be found, according to their volume?` would be translated into `load dataset, calculate volume using the surface and height columns, cluster and count clusters`. In this case the required ML algorithm is clustering as there is no column indicating the object type. Instead, the object type is deduced by performing clustering, a type of unsupervised learning.
Another example:  "Train a model that can estimate density based on surface and mass given the objects dataset." is translated into "load dataset, calculate density using surface and mass columns, and perform regression". In this case the required ML algorithm is regression as density is an object property that is calculated by other columns and can thus be learned by regression, a supervised learning algorithm. 
    <</SYS>>
    

Can you translate the sentence "%s" and reply only with the translation?
[/INST]
""" %(q)


In [9]:
import ollama

ollama.chat(model='llama2', messages=[{'role': 'user', 'content': question("Calculate the density of the objects")}])

{'model': 'llama2',
 'created_at': '2024-05-11T12:43:30.500005365Z',
 'message': {'role': 'assistant',
  'content': ' Calculate: load dataset, calculate density using surface and mass columns, perform regression.'},
 'done': True,
 'total_duration': 55940056860,
 'load_duration': 4191761435,
 'prompt_eval_count': 563,
 'prompt_eval_duration': 49942460000,
 'eval_count': 18,
 'eval_duration': 1803028000}

In [None]:

PersistentClient

collection.upsert(
        ids=batch_ids,
        metadatas=batch_metadata,
        documents=batch_titles,
        embeddings=batch_embeddings.tolist(),
    )