# Lesson 2: Loading and preparing data

# Loading

In [1]:
import "dotenv/config";

[Module: null prototype] { default: {} }

In [2]:
import { GithubRepoLoader } from "langchain/document_loaders/web/github";
// Peer dependency, used to support .gitignore syntax
import ignore from "ignore";

In [3]:
// Set up a loader that targets the langchainjs GitHub repository.
// Will not include anything under "ignorePaths"
// (here: every Markdown file and yarn.lock; the patterns use .gitignore
// syntax, which is what the "ignore" peer dependency is imported for).
// NOTE(review): `recursive: false` presumably restricts loading to the
// repository's top-level entries — confirm against the loader docs.
const loader = new GithubRepoLoader(
  "https://github.com/langchain-ai/langchainjs",
  { recursive: false, ignorePaths: ["*.md", "yarn.lock"] }
);

In [4]:
// Run the GitHub loader. The result is an array of Document objects, each
// carrying `pageContent` and `metadata` (source, repository, branch), as the
// printed output shows.
const docs = await loader.load();

// Preview the first three loaded documents.
console.log(docs.slice(0, 3));

[
  Document {
    pageContent: "coverage:\n" +
      "  status:\n" +
      "    project:\n" +
      "      default:\n" +
      "        informational: true\n" +
      "    patch:\n" +
      "      default:\n" +
      "        informational: true\n" +
      "\n" +
      "\n" +
      "# When modifying this file, please validate using\n" +
      "# curl -X POST --data-binary @codecov.yml https://codecov.io/validate",
    metadata: {
      source: ".codecov.yml",
      repository: "https://github.com/langchain-ai/langchainjs",
      branch: "main"
    }
  },
  Document {
    pageContent: "# top-most EditorConfig file\n" +
      "root = true\n" +
      "\n" +
      "# Unix-style newlines with a newline ending every file\n" +
      "[*]\n" +
      "end_of_line = lf",
    metadata: {
      source: ".editorconfig",
      repository: "https://github.com/langchain-ai/langchainjs",
      branch: "main"
    }
  },
  Document {
    pageContent: "* text=auto eol=lf",
    metadata: {
      source: "

In [5]:
// Peer dependency
import * as parse from "pdf-parse";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";

In [6]:
// Point a PDF loader at the local CS229 lecture file.
// This redeclares `loader`, replacing the GitHub loader created earlier.
const loader = new PDFLoader("./data/MachineLearning-Lecture01.pdf");

In [7]:
// Load the PDF into an array of Document objects; these unsplit documents
// are the input to the splitting step later in the lesson.
const rawCS229Docs = await loader.load();

// Preview the first five loaded documents.
console.log(rawCS229Docs.slice(0, 5));

[
  Document {
    pageContent: "MachineLearning-Lecture01  \n" +
      "Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine \n" +
      "learning class. So what I wanna do today is just spend a little time going over the logistics \n" +
      "of the class, and then we'll start to talk a bit about machine learning.  \n" +
      "By way of introduction, my name's Andrew Ng and I'll be instructor for this class. And so \n" +
      "I personally work in machine learning, and I've worked on it for about 15 years now, and \n" +
      "I actually think that machine learning is the most exciting field of all the computer \n" +
      "sciences. So I'm actually always excited about teaching this class. Sometimes I actually \n" +
      "think that machine learning is not only the most exciting thing in computer science, but \n" +
      "the most exciting thing in all of human endeavor, so maybe a little bias there.  \n" +
      "I also want to introduce the TAs, who are al

# Splitting

In [8]:
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

In [9]:
// Build a recursive splitter from the "js" language preset, with small
// chunks and no overlap between neighbouring chunks.
// NOTE(review): chunkSize/chunkOverlap are presumably counted in characters
// (the sample output chunks are all under 32 characters) — confirm in the
// text splitter docs.
const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
  chunkSize: 32,
  chunkOverlap: 0,
});

In [10]:
// A small JavaScript snippet used as the sample text for the splitters.
const code = `function helloWorld() {
console.log("Hello, World!");
}
// Call the function
helloWorld();`;

// Split the snippet with the current `splitter`. This is the cell's final
// expression, so the resulting array of chunks is shown as the cell output.
await splitter.splitText(code);

[
  "function helloWorld() {",
  'console.log("Hello, World!");\n}',
  "// Call the function",
  "helloWorld();"
]

In [11]:
import { CharacterTextSplitter } from "langchain/text_splitter";

// For comparison: a plain character splitter that uses a single space as
// its separator, with the same chunkSize/chunkOverlap as the JS-preset
// splitter. This redeclares `splitter`.
const splitter = new CharacterTextSplitter({
  chunkSize: 32,
  chunkOverlap: 0,
  separator: " "
});

// Split the same `code` sample. As the output shows, the chunks break at
// spaces and cut through the middle of statements.
await splitter.splitText(code);

[
  "function helloWorld()",
  '{\nconsole.log("Hello,',
  'World!");\n}\n// Call the',
  "function\nhelloWorld();"
]

In [12]:
// Back to the "js" language preset, now with larger chunks and a non-zero
// overlap. This redeclares `splitter`.
const splitter = RecursiveCharacterTextSplitter.fromLanguage("js", {
  chunkSize: 64,
  chunkOverlap: 32,
});

// Split the same `code` sample. In the output, neighbouring chunks repeat
// some of each other's text because of the overlap.
await splitter.splitText(code);

[
  'function helloWorld() {\nconsole.log("Hello, World!");\n}',
  'console.log("Hello, World!");\n}\n// Call the function',
  "}\n// Call the function\nhelloWorld();"
]

In [13]:
// A recursive splitter built with the plain constructor (no language
// preset), using larger chunks with some overlap. It is applied to the
// CS229 lecture documents in the next cell. This redeclares `splitter`.
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 512,
  chunkOverlap: 64,
});

In [14]:
// Split the loaded lecture documents into chunks. As the output shows, each
// chunk is again a Document whose metadata includes the source path and
// PDF details.
const splitDocs = await splitter.splitDocuments(rawCS229Docs);

// Preview the first five chunks.
console.log(splitDocs.slice(0, 5));

[
  Document {
    pageContent: "MachineLearning-Lecture01  \n" +
      "Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine \n" +
      "learning class. So what I wanna do today is just spend a little time going over the logistics \n" +
      "of the class, and then we'll start to talk a bit about machine learning.  \n" +
      "By way of introduction, my name's Andrew Ng and I'll be instructor for this class. And so \n" +
      "I personally work in machine learning, and I've worked on it for about 15 years now, and",
    metadata: {
      source: "./data/MachineLearning-Lecture01.pdf",
      pdf: {
        version: "1.10.100",
        info: {
          PDFFormatVersion: "1.4",
          IsAcroFormPresent: false,
          IsXFAPresent: false,
          Title: "",
          Author: "",
          Creator: "PScript5.dll Version 5.2.2",
          Producer: "Acrobat Distiller 8.1.0 (Windows)",
          CreationDate: "D:20080711112523-07'00'",
          ModDate: "D: