# Lesson 3: Vectorstores and embeddings

# Vectorstore ingestion

In [1]:
import "dotenv/config";

[Module: null prototype] { default: {} }

In [2]:
import { OpenAIEmbeddings } from "@langchain/openai";

// Create the OpenAI embeddings client with its default options.
// NOTE(review): presumably picks up the API key from the environment loaded
// by the "dotenv/config" import in the first cell — confirm.
const embeddings = new OpenAIEmbeddings();

// Embed a single string; the notebook prints the resulting array of floats.
await embeddings.embedQuery("This is some sample text");

[
   -0.010416139,   0.0024119338, -0.00073926017,  -0.010882434,  -0.0114866495,
    0.022907624,   -0.014645643,    0.001761746,   -0.01757477,     -0.0192692,
    0.005201502,    0.034151275,   -0.012268188,  0.0019308605,    0.004745057,
    0.013174511,    0.024588916,   0.0017256244,  0.0045020576,  -0.0062358915,
  -0.0050931373,   -0.000690414,   -0.008183171,   0.014002022,   -0.009010682,
  -0.0040620314, -0.00077127694,   -0.019728929,   0.003966802,  -0.0016689793,
    0.016051099,   -0.021935627, -0.00076758274,  -0.022316543,   0.0063278372,
    0.007046984,   -0.011184542,   -0.013502888,   0.

In [3]:
import { similarity } from "ml-distance";

// Reference embedding: a question about vectors in machine learning.
// The following cells compare other embeddings against this one.
const vector1 = await embeddings.embedQuery(
    "What are vectors useful for in machine learning?"
);
// Embedding of a sentence on an unrelated topic (parrots), used as the
// dissimilar example in the comparison below.
const unrelatedVector = await embeddings.embedQuery(
    "A group of parrots is called a pandemonium."
);

In [4]:
// Cosine similarity between the reference question and the unrelated sentence.
similarity.cosine(vector1, unrelatedVector);

0.6959591747618556

In [5]:
// Embedding of a sentence that is topically close to the reference question.
const similarVector = await embeddings.embedQuery(
    "Vectors are representations of information."
);

// Cosine similarity against the same reference vector; the printed score is
// higher than the one for the unrelated sentence in the previous cell.
similarity.cosine(vector1, similarVector);

0.8586349152599215

In [6]:
// Peer dependency
import * as parse from "pdf-parse";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
import { 
    RecursiveCharacterTextSplitter
} from "langchain/text_splitter";

// Loader for the lecture PDF stored under ./data.
const loader = new PDFLoader("./data/MachineLearning-Lecture01.pdf");

// Load the PDF into raw documents.
const rawCS229Docs = await loader.load();

// Splitter configured for small chunks with no overlap between neighbours.
// NOTE(review): chunkSize is presumably measured in characters — confirm
// against the text splitter documentation.
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 128,
  chunkOverlap: 0,
});

// Split the loaded documents into chunks for ingestion into the vector store.
const splitDocs = await splitter.splitDocuments(rawCS229Docs);

In [7]:
import { MemoryVectorStore } from "langchain/vectorstores/memory";

// In-memory vector store backed by the embeddings client created earlier.
const vectorstore = new MemoryVectorStore(embeddings);

In [8]:
// Ingest the split PDF chunks into the vector store.
await vectorstore.addDocuments(splitDocs);

In [9]:
// Query the store for the chunks most similar to the question; the second
// argument (4) matches the number of results shown in the output below.
const retrievedDocs = await vectorstore.similaritySearch(
    "What is deep learning?", 
    4
);

// Keep only the text of each retrieved document, dropping its metadata.
const pageContents = retrievedDocs.map(doc => doc.pageContent);

// Bare expression so the notebook displays the array.
pageContents

[
  "piece of research in machine learning, okay?",
  "are using a learning algorithm, perhaps without even being aware of it.",
  "some of my own excitement about machine learning to you.",
  "of the class, and then we'll start to talk a bit about machine learning."
]

# Retrievers

In [10]:
// Wrap the vector store in a retriever so it can be queried via invoke().
const retriever = vectorstore.asRetriever();

In [11]:
// Run the same question through the retriever; the output is an array of
// Document objects carrying both pageContent and metadata.
await retriever.invoke("What is deep learning?")

[
  Document {
    pageContent: "piece of research in machine learning, okay?",
    metadata: {
      source: "./data/MachineLearning-Lecture01.pdf",
      pdf: {
        version: "1.10.100",
        info: {
          PDFFormatVersion: "1.4",
          IsAcroFormPresent: false,
          IsXFAPresent: false,
          Title: "",
          Author: "",
          Creator: "PScript5.dll Version 5.2.2",
          Producer: "Acrobat Distiller 8.1.0 (Windows)",
          CreationDate: "D:20080711112523-07'00'",
          ModDate: "D:20080711112523-07'00'"
        },
        metadata: Metadata { _metadata: [Object: null prototype] },
        totalPages: 22
      },
      loc: { pageNumber: 8, lines: { from: 2, to: 2 } }
    }
  },
  Document {
    pageContent: [32m"are using a learning algorithm, perhaps without even being aw