# Document loaders and Embeddings

In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key for the
# embeddings client comes from - confirm the .env file defines it.
load_dotenv()

True

In [15]:
# Embedding client bound to the "text-embedding-3-small" model.
# NOTE(review): this cell's execution count (15) is out of sequence with the
# cells around it; run Restart Kernel -> Run All to confirm the notebook still
# works top to bottom.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [3]:
# Splitter configuration: newline as the separator, a chunk_size of 200 and
# no overlap between consecutive chunks.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=200, chunk_overlap=0)

In [4]:
# Read facts.txt and cut it into chunks with `text_splitter`.
# The load is followed by an explicit split_documents() call instead of
# loader.load_and_split(): langchain-core documents load_and_split as a method
# that "should be considered to be deprecated", and with a splitter supplied
# it performs exactly these two steps, so the resulting `docs` are the same.
loader = TextLoader("facts.txt")
docs = text_splitter.split_documents(loader.load())

In [5]:
# Dump the raw list of Document objects to inspect each chunk's metadata and
# page_content in one go.
print(docs)

[Document(metadata={'source': 'facts.txt'}, page_content='1. "Dreamt" is the only English word that ends with the letters "mt."\n2. An ostrich\'s eye is bigger than its brain.\n3. Honey is the only natural food that is made without destroying any kind of life.'), Document(metadata={'source': 'facts.txt'}, page_content="4. A snail can sleep for three years.\n5. The longest word in the English language is 'pneumonoultramicroscopicsilicovolcanoconiosis.'\n6. The elephant is the only mammal that can't jump."), Document(metadata={'source': 'facts.txt'}, page_content="7. The letter 'Q' is the only letter not appearing in any U.S. state name.\n8. The heart of a shrimp is located in its head.\n9. Australia is the only continent covered by a single country."), Document(metadata={'source': 'facts.txt'}, page_content="10. The Great Wall of China is approximately 13,171 miles long.\n11. Bananas are berries, but strawberries aren't.\n12. The Sphinx of Giza has the body of a lion and the head of a h

In [12]:
# Show every chunk's text, each followed by a "----" rule so the chunk
# boundaries are easy to see.
for doc in docs:
    print(doc.page_content, "----", sep="\n")

1. "Dreamt" is the only English word that ends with the letters "mt."
2. An ostrich's eye is bigger than its brain.
3. Honey is the only natural food that is made without destroying any kind of life.
----
4. A snail can sleep for three years.
5. The longest word in the English language is 'pneumonoultramicroscopicsilicovolcanoconiosis.'
6. The elephant is the only mammal that can't jump.
----
7. The letter 'Q' is the only letter not appearing in any U.S. state name.
8. The heart of a shrimp is located in its head.
9. Australia is the only continent covered by a single country.
----
10. The Great Wall of China is approximately 13,171 miles long.
11. Bananas are berries, but strawberries aren't.
12. The Sphinx of Giza has the body of a lion and the head of a human.
----
13. The first computer bug was an actual bug trapped in a computer.
14. Neil Armstrong was the first man to walk on the moon.
----
15. The Eiffel Tower in Paris leans slightly in the sun due to thermal expansion.
16. Quee

In [16]:
# Embed the text of the first chunk; the cell output is the resulting vector
# as a list of floats.
# NOTE(review): this displays the entire vector - consider showing only its
# length and a short slice when sharing the notebook.
embeddings.embed_query(docs[0].page_content)

[0.07003479450941086,
 0.023903530091047287,
 0.009431370534002781,
 0.009143133647739887,
 -0.009491698816418648,
 -0.008063922636210918,
 0.007936562411487103,
 0.04174738749861717,
 -0.015926748514175415,
 0.026182610541582108,
 0.05115864798426628,
 -0.03158537298440933,
 -0.018701864406466484,
 0.03718922659754753,
 -0.05737919732928276,
 0.0038945460692048073,
 -0.02482856810092926,
 0.00476931082084775,
 -0.004444206599146128,
 -0.00913643091917038,
 0.04796793684363365,
 0.013701294548809528,
 -0.03252381458878517,
 0.02882366254925728,
 0.05518055707216263,
 0.02099435217678547,
 0.02438616007566452,
 0.011127274483442307,
 -0.00020371374557726085,
 0.005731216631829739,
 0.02327343262732029,
 -0.036947913467884064,
 -0.016918819397687912,
 -0.025311198085546494,
 0.006421643774956465,
 0.003036539303138852,
 0.025016257539391518,
 -0.008492926135659218,
 -0.03936105594038963,
 -0.007574590388685465,
 0.022174110636115074,
 0.009786639362573624,
 0.0004214622895233333,
 -0.001

# Older method

The older method is the same this time. The only difference is that it used this import:

```python
from langchain.text_splitter import CharacterTextSplitter
```

instead of the current:

```python
from langchain_text_splitters import CharacterTextSplitter
```