From 2c8ca194405b014a7a16775814a3014026359128 Mon Sep 17 00:00:00 2001 From: thucpn Date: Wed, 21 Feb 2024 15:14:12 +0700 Subject: [PATCH 1/7] refactor: use nodeToMetadata --- .../core/src/storage/vectorStore/PineconeVectorStore.ts | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/packages/core/src/storage/vectorStore/PineconeVectorStore.ts b/packages/core/src/storage/vectorStore/PineconeVectorStore.ts index 86af6f8b4b..c7e7099eec 100644 --- a/packages/core/src/storage/vectorStore/PineconeVectorStore.ts +++ b/packages/core/src/storage/vectorStore/PineconeVectorStore.ts @@ -6,7 +6,7 @@ import { VectorStoreQueryResult, } from "./types"; -import { BaseNode, Document, Metadata, MetadataMode } from "../../Node"; +import { BaseNode, Document, Metadata } from "../../Node"; import { GenericFileSystem } from "../FileSystem"; import { @@ -15,6 +15,7 @@ import { Pinecone, ScoredPineconeRecord, } from "@pinecone-database/pinecone"; +import { nodeToMetadata } from "./utils"; type PineconeParams = { indexName?: string; @@ -207,14 +208,10 @@ export class PineconeVectorStore implements VectorStore { nodeToRecord(node: BaseNode) { let id: any = node.id_.length ?
node.id_ : null; - let meta: any = node.metadata || {}; - meta.create_date = new Date(); - meta.text = node.getContent(MetadataMode.EMBED); - return { id: id, values: node.getEmbedding(), - metadata: meta, + metadata: nodeToMetadata(node), }; } } From b8f6e9c451677cc2b252f4d096a25a3230966f07 Mon Sep 17 00:00:00 2001 From: thucpn Date: Wed, 21 Feb 2024 15:14:50 +0700 Subject: [PATCH 2/7] feat: support pinecone vectordb for typescript templates --- packages/create-llama/helpers/types.ts | 2 +- packages/create-llama/questions.ts | 1 + .../typescript/pinecone/generate.mjs | 35 +++++++++++++++++++ .../vectordbs/typescript/pinecone/index.ts | 29 +++++++++++++++ .../vectordbs/typescript/pinecone/shared.mjs | 22 ++++++++++++ 5 files changed, 88 insertions(+), 1 deletion(-) create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pinecone/generate.mjs create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pinecone/index.ts create mode 100644 packages/create-llama/templates/components/vectordbs/typescript/pinecone/shared.mjs diff --git a/packages/create-llama/helpers/types.ts b/packages/create-llama/helpers/types.ts index 5e4a9f6efc..19253cc152 100644 --- a/packages/create-llama/helpers/types.ts +++ b/packages/create-llama/helpers/types.ts @@ -4,7 +4,7 @@ export type TemplateType = "simple" | "streaming" | "community" | "llamapack"; export type TemplateFramework = "nextjs" | "express" | "fastapi"; export type TemplateEngine = "simple" | "context"; export type TemplateUI = "html" | "shadcn"; -export type TemplateVectorDB = "none" | "mongo" | "pg"; +export type TemplateVectorDB = "none" | "mongo" | "pg" | "pinecone"; export type TemplatePostInstallAction = "none" | "dependencies" | "runApp"; export type TemplateDataSource = { type: TemplateDataSourceType; diff --git a/packages/create-llama/questions.ts b/packages/create-llama/questions.ts index 62fdb7eca0..b0cca28cf0 100644 --- a/packages/create-llama/questions.ts +++ 
b/packages/create-llama/questions.ts @@ -89,6 +89,7 @@ const getVectorDbChoices = (framework: TemplateFramework) => { }, { title: "MongoDB", value: "mongo" }, { title: "PostgreSQL", value: "pg" }, + { title: "Pinecone", value: "pinecone" }, ]; const vectordbLang = framework === "fastapi" ? "python" : "typescript"; diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pinecone/generate.mjs b/packages/create-llama/templates/components/vectordbs/typescript/pinecone/generate.mjs new file mode 100644 index 0000000000..b371a639a8 --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/typescript/pinecone/generate.mjs @@ -0,0 +1,35 @@ +/* eslint-disable turbo/no-undeclared-env-vars */ +import * as dotenv from "dotenv"; +import { + PineconeVectorStore, + SimpleDirectoryReader, + VectorStoreIndex, + storageContextFromDefaults, +} from "llamaindex"; +import { STORAGE_DIR, checkRequiredEnvVars } from "./shared.mjs"; + +dotenv.config(); + +async function loadAndIndex() { + // load objects from storage and convert them into LlamaIndex Document objects + const documents = await new SimpleDirectoryReader().loadData({ + directoryPath: STORAGE_DIR, + }); + + // create vector store + const vectorStore = new PineconeVectorStore(); + + // create index from all the Documents and store them in Pinecone + console.log("Start creating embeddings..."); + const storageContext = await storageContextFromDefaults({ vectorStore }); + await VectorStoreIndex.fromDocuments(documents, { storageContext }); + console.log( + "Successfully created embeddings and save to your Pinecone index.", + ); +} + +(async () => { + checkRequiredEnvVars(); + await loadAndIndex(); + console.log("Finished generating storage."); +})(); diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pinecone/index.ts b/packages/create-llama/templates/components/vectordbs/typescript/pinecone/index.ts new file mode 100644 index 0000000000..be18486c4e --- /dev/null +++
b/packages/create-llama/templates/components/vectordbs/typescript/pinecone/index.ts @@ -0,0 +1,29 @@ +/* eslint-disable turbo/no-undeclared-env-vars */ +import { + ContextChatEngine, + LLM, + PineconeVectorStore, + VectorStoreIndex, + serviceContextFromDefaults, +} from "llamaindex"; +import { CHUNK_OVERLAP, CHUNK_SIZE, checkRequiredEnvVars } from "./shared.mjs"; + +async function getDataSource(llm: LLM) { + checkRequiredEnvVars(); + const serviceContext = serviceContextFromDefaults({ + llm, + chunkSize: CHUNK_SIZE, + chunkOverlap: CHUNK_OVERLAP, + }); + const store = new PineconeVectorStore(); + return await VectorStoreIndex.fromVectorStore(store, serviceContext); +} + +export async function createChatEngine(llm: LLM) { + const index = await getDataSource(llm); + const retriever = index.asRetriever({ similarityTopK: 5 }); + return new ContextChatEngine({ + chatModel: llm, + retriever, + }); +} diff --git a/packages/create-llama/templates/components/vectordbs/typescript/pinecone/shared.mjs b/packages/create-llama/templates/components/vectordbs/typescript/pinecone/shared.mjs new file mode 100644 index 0000000000..f9140261c3 --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/typescript/pinecone/shared.mjs @@ -0,0 +1,22 @@ +export const STORAGE_DIR = "./data"; +export const CHUNK_SIZE = 512; +export const CHUNK_OVERLAP = 20; + +const REQUIRED_ENV_VARS = ["PINECONE_ENVIRONMENT", "PINECONE_API_KEY"]; + +export function checkRequiredEnvVars() { + const missingEnvVars = REQUIRED_ENV_VARS.filter((envVar) => { + return !process.env[envVar]; + }); + + if (missingEnvVars.length > 0) { + console.log( + `The following environment variables are required but missing: ${missingEnvVars.join( + ", ", + )}`, + ); + throw new Error( + `Missing environment variables: ${missingEnvVars.join(", ")}`, + ); + } +} From c9fa927828e4e7cbd6d4227de6ad208cd71b6ea6 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Wed, 21 Feb 2024 
16:43:18 +0700 Subject: [PATCH 3/7] Create pink-tools-look.md --- .changeset/pink-tools-look.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changeset/pink-tools-look.md diff --git a/.changeset/pink-tools-look.md b/.changeset/pink-tools-look.md new file mode 100644 index 0000000000..b83a0232ba --- /dev/null +++ b/.changeset/pink-tools-look.md @@ -0,0 +1,6 @@ +--- +"llamaindex": patch +"create-llama": patch +--- + +feat: add pinecone support to create llama From 071fe5e75cc80762b8fff3aa4e3dda154ed30b7a Mon Sep 17 00:00:00 2001 From: thucpn Date: Thu, 22 Feb 2024 04:28:01 +0700 Subject: [PATCH 4/7] refactor: detach changeset --- .changeset/happy-monkeys-cross.md | 5 +++++ .changeset/pink-tools-look.md | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 .changeset/happy-monkeys-cross.md diff --git a/.changeset/happy-monkeys-cross.md b/.changeset/happy-monkeys-cross.md new file mode 100644 index 0000000000..2af52edafe --- /dev/null +++ b/.changeset/happy-monkeys-cross.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +fix: update pinecone vector store diff --git a/.changeset/pink-tools-look.md b/.changeset/pink-tools-look.md index b83a0232ba..909195e576 100644 --- a/.changeset/pink-tools-look.md +++ b/.changeset/pink-tools-look.md @@ -1,5 +1,4 @@ --- -"llamaindex": patch "create-llama": patch --- From 809b9122afbecd95669c63fcebfa071649939f34 Mon Sep 17 00:00:00 2001 From: thucpn Date: Thu, 22 Feb 2024 04:42:12 +0700 Subject: [PATCH 5/7] fix: using metadataDictToNode for pinecone vector store --- .../vectorStore/PineconeVectorStore.ts | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/packages/core/src/storage/vectorStore/PineconeVectorStore.ts b/packages/core/src/storage/vectorStore/PineconeVectorStore.ts index c7e7099eec..08f6384f3d 100644 --- a/packages/core/src/storage/vectorStore/PineconeVectorStore.ts +++ b/packages/core/src/storage/vectorStore/PineconeVectorStore.ts @@ -6,7 +6,7 @@ 
import { VectorStoreQueryResult, } from "./types"; -import { BaseNode, Document, Metadata } from "../../Node"; +import { BaseNode, Metadata } from "../../Node"; import { GenericFileSystem } from "../FileSystem"; import { @@ -15,7 +15,7 @@ import { Pinecone, ScoredPineconeRecord, } from "@pinecone-database/pinecone"; -import { nodeToMetadata } from "./utils"; +import { metadataDictToNode, nodeToMetadata } from "./utils"; type PineconeParams = { indexName?: string; @@ -156,12 +156,18 @@ export class PineconeVectorStore implements VectorStore { const rows = Object.values(records.records); const nodes = rows.map((row) => { - return new Document({ - id_: row.id, - text: this.textFromResultRow(row), - metadata: this.metaWithoutText(row.metadata), - embedding: row.values, + const metadata = this.metaWithoutText(row.metadata); + const text = this.textFromResultRow(row); + const node = metadataDictToNode(metadata, { + fallback: { + id: row.id, + text, + metadata, + embedding: row.values, + }, }); + node.setContent(text); + return node; }); const ret = { From 4802a1ba8721028e1fcf1cc76e558cb9b587eb90 Mon Sep 17 00:00:00 2001 From: thucpn Date: Thu, 22 Feb 2024 07:32:38 +0700 Subject: [PATCH 6/7] feat: add pinecone support for python template --- .../vectordbs/python/pinecone/__init__.py | 0 .../vectordbs/python/pinecone/constants.py | 3 ++ .../vectordbs/python/pinecone/context.py | 14 ++++++ .../vectordbs/python/pinecone/generate.py | 45 +++++++++++++++++++ .../vectordbs/python/pinecone/index.py | 23 ++++++++++ 5 files changed, 85 insertions(+) create mode 100644 packages/create-llama/templates/components/vectordbs/python/pinecone/__init__.py create mode 100644 packages/create-llama/templates/components/vectordbs/python/pinecone/constants.py create mode 100644 packages/create-llama/templates/components/vectordbs/python/pinecone/context.py create mode 100644 packages/create-llama/templates/components/vectordbs/python/pinecone/generate.py create mode 100644 
packages/create-llama/templates/components/vectordbs/python/pinecone/index.py diff --git a/packages/create-llama/templates/components/vectordbs/python/pinecone/__init__.py b/packages/create-llama/templates/components/vectordbs/python/pinecone/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/packages/create-llama/templates/components/vectordbs/python/pinecone/constants.py b/packages/create-llama/templates/components/vectordbs/python/pinecone/constants.py new file mode 100644 index 0000000000..0dd46619b2 --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/python/pinecone/constants.py @@ -0,0 +1,3 @@ +DATA_DIR = "data" # directory containing the documents to index +CHUNK_SIZE = 512 +CHUNK_OVERLAP = 20 diff --git a/packages/create-llama/templates/components/vectordbs/python/pinecone/context.py b/packages/create-llama/templates/components/vectordbs/python/pinecone/context.py new file mode 100644 index 0000000000..ceb8a50ae0 --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/python/pinecone/context.py @@ -0,0 +1,14 @@ +from llama_index import ServiceContext + +from app.context import create_base_context +from app.engine.constants import CHUNK_SIZE, CHUNK_OVERLAP + + +def create_service_context(): + base = create_base_context() + return ServiceContext.from_defaults( + llm=base.llm, + embed_model=base.embed_model, + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + ) diff --git a/packages/create-llama/templates/components/vectordbs/python/pinecone/generate.py b/packages/create-llama/templates/components/vectordbs/python/pinecone/generate.py new file mode 100644 index 0000000000..8c0e1c0b42 --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/python/pinecone/generate.py @@ -0,0 +1,45 @@ +from dotenv import load_dotenv + +load_dotenv() +import os +import logging +from llama_index.vector_stores import PineconeVectorStore + +from app.engine.constants import DATA_DIR +from 
app.engine.context import create_service_context +from app.engine.loader import get_documents + + +from llama_index import ( + SimpleDirectoryReader, + VectorStoreIndex, + StorageContext, +) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger() + + +def generate_datasource(service_context): + logger.info("Creating new index") + # load the documents and create the index + documents = get_documents() + store = PineconeVectorStore( + api_key=os.environ["PINECONE_API_KEY"], + index_name=os.environ["PINECONE_INDEX_NAME"], + environment=os.environ["PINECONE_ENVIRONMENT"], + ) + storage_context = StorageContext.from_defaults(vector_store=store) + VectorStoreIndex.from_documents( + documents, + service_context=service_context, + storage_context=storage_context, + show_progress=True, # this will show you a progress bar as the embeddings are created + ) + logger.info( + f"Successfully created embeddings and save to your Pinecone index {os.environ['PINECONE_INDEX_NAME']}" + ) + + +if __name__ == "__main__": + generate_datasource(create_service_context()) diff --git a/packages/create-llama/templates/components/vectordbs/python/pinecone/index.py b/packages/create-llama/templates/components/vectordbs/python/pinecone/index.py new file mode 100644 index 0000000000..6e9b88102f --- /dev/null +++ b/packages/create-llama/templates/components/vectordbs/python/pinecone/index.py @@ -0,0 +1,23 @@ +import logging +import os + +from llama_index import ( + VectorStoreIndex, +) +from llama_index.vector_stores import PineconeVectorStore + +from app.engine.context import create_service_context + + +def get_index(): + service_context = create_service_context() + logger = logging.getLogger("uvicorn") + logger.info("Connecting to index from Pinecone...") + store = PineconeVectorStore( + api_key=os.environ["PINECONE_API_KEY"], + index_name=os.environ["PINECONE_INDEX_NAME"], + environment=os.environ["PINECONE_ENVIRONMENT"], + ) + index = VectorStoreIndex.from_vector_store(store, 
service_context) + logger.info("Finished connecting to index from Pinecone.") + return index From 5e39d3ca0c4f5ec0eb3a5e1f2675aae3ea359219 Mon Sep 17 00:00:00 2001 From: thucpn Date: Thu, 22 Feb 2024 08:46:48 +0700 Subject: [PATCH 7/7] feat: update pinecone to latest version --- packages/core/package.json | 2 +- pnpm-lock.yaml | 65 ++++++++------------------------------ 2 files changed, 15 insertions(+), 52 deletions(-) diff --git a/packages/core/package.json b/packages/core/package.json index 7f6b471759..3bccb075fe 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -8,7 +8,7 @@ "@datastax/astra-db-ts": "^0.1.4", "@mistralai/mistralai": "^0.0.10", "@notionhq/client": "^2.2.14", - "@pinecone-database/pinecone": "^1.1.3", + "@pinecone-database/pinecone": "^2.0.1", "@qdrant/js-client-rest": "^1.7.0", "@xenova/transformers": "^2.15.0", "assemblyai": "^4.2.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 44a4750e51..fdb2d31a4d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -178,8 +178,8 @@ importers: specifier: ^2.2.14 version: 2.2.14 '@pinecone-database/pinecone': - specifier: ^1.1.3 - version: 1.1.3 + specifier: ^2.0.1 + version: 2.0.1 '@qdrant/js-client-rest': specifier: ^1.7.0 version: 1.7.0(typescript@5.3.3) @@ -3447,6 +3447,16 @@ packages: encoding: 0.1.13 dev: false + /@pinecone-database/pinecone@2.0.1: + resolution: {integrity: sha512-a1ejzrqdSQ2yW+9QUi2TVlKwYUbrvGH+QH6POJhITyaOz9ANE+EhXqToC9af93Ctzq9n87+bOUvBvewLeW++Mw==} + engines: {node: '>=14.0.0'} + dependencies: + '@sinclair/typebox': 0.29.6 + ajv: 8.12.0 + cross-fetch: 3.1.8(encoding@0.1.13) + encoding: 0.1.13 + dev: false + /@pkgjs/parseargs@0.11.0: resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} @@ -10213,55 +10223,6 @@ packages: wrap-ansi: 9.0.0 dev: true - /llamaindex@0.1.9(typescript@5.3.3): - resolution: {integrity: 
sha512-MAMGV5MXXcJ4rSV2kqCZENf7B+q1zTwPnHpnHJgEiEzP5+djNdLmbz/zaCmxpB8wgNNLUem9iJt53iwDBJ4ZBA==} - engines: {node: '>=18.0.0'} - dependencies: - '@anthropic-ai/sdk': 0.12.4 - '@datastax/astra-db-ts': 0.1.4 - '@mistralai/mistralai': 0.0.10 - '@notionhq/client': 2.2.14 - '@pinecone-database/pinecone': 1.1.3 - '@qdrant/js-client-rest': 1.7.0(typescript@5.3.3) - '@xenova/transformers': 2.14.1 - assemblyai: 4.2.1 - chromadb: 1.7.3(openai@4.26.0) - file-type: 18.7.0 - js-tiktoken: 1.0.8 - lodash: 4.17.21 - mammoth: 1.6.0 - md-utils-ts: 2.0.0 - mongodb: 6.3.0 - notion-md-crawler: 0.0.2 - openai: 4.26.0 - papaparse: 5.4.1 - pathe: 1.1.2 - pdf2json: 3.0.5 - pg: 8.11.3 - pgvector: 0.1.7 - portkey-ai: 0.1.16 - rake-modified: 1.0.8 - replicate: 0.25.2 - string-strip-html: 13.4.5 - wink-nlp: 1.14.3 - transitivePeerDependencies: - - '@aws-sdk/credential-providers' - - '@google/generative-ai' - - '@mongodb-js/zstd' - - bufferutil - - cohere-ai - - debug - - encoding - - gcp-metadata - - kerberos - - mongodb-client-encryption - - pg-native - - snappy - - socks - - typescript - - utf-8-validate - dev: false - /load-yaml-file@0.2.0: resolution: {integrity: sha512-OfCBkGEw4nN6JLtgRidPX6QxjBQGQf72q3si2uvqyFEMbycSFFHwAZeXx6cJgFM9wmLrf9zBwCP3Ivqa+LLZPw==} engines: {node: '>=6'} @@ -12062,6 +12023,8 @@ packages: resolution: {integrity: sha512-Un1yLbSlk/zfwrltgguskExIioXZlFSFwsyXU0cnBorLywbTbcdzmJJEebh+U2cFCtR7y8nDs5lPHAe7ldxjZg==} engines: {node: '>=18.12.1', npm: '>=8.19.2'} hasBin: true + dependencies: + '@xmldom/xmldom': 0.8.10 dev: false bundledDependencies: - '@xmldom/xmldom'