Skip to content

Commit

Permalink
add .doc and .ppt support (#86)
Browse files Browse the repository at this point in the history
* add .doc and .ppt support

* update lock
  • Loading branch information
hippalectryon-0 committed May 17, 2023
1 parent 8b1453e commit 7bab669
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 108 deletions.
6 changes: 4 additions & 2 deletions casalioy/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
from langchain.docstore.document import Document
from langchain.document_loaders import (
CSVLoader,
- Docx2txtLoader,
OutlookMessageLoader,
PDFMinerLoader,
TextLoader,
UnstructuredEmailLoader,
UnstructuredEPubLoader,
UnstructuredHTMLLoader,
UnstructuredPowerPointLoader,
+ UnstructuredWordDocumentLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from load_env import chunk_overlap, chunk_size, documents_directory, get_embedding_model, ingest_n_threads, persist_directory
Expand All @@ -37,8 +37,10 @@ class Ingester:
"csv": CSVLoader,
"epub": UnstructuredEPubLoader,
"html": UnstructuredHTMLLoader,
- "docx": Docx2txtLoader,
+ "docx": UnstructuredWordDocumentLoader,
+ "doc": UnstructuredWordDocumentLoader,
"pptx": UnstructuredPowerPointLoader,
+ "ppt": UnstructuredPowerPointLoader,
"eml": UnstructuredEmailLoader,
"msg": OutlookMessageLoader,
}
Expand Down
137 changes: 31 additions & 106 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ qdrant-client = "^1.1.7"
unstructured = "^0.6.6" # Handle ingestion file formats
pypandoc-binary = "^1.11" # doc conversion
docx2txt = "^0.8" # Handle docx ingestion file formats
+ tabulate = "^0.9.0" # Also required for docx
extract-msg = "^0.41.1" # Handle email file formats
llama-cpp-python = "^0.1.50" # 0.1.50 raises an AssertionError / NameError on <5 vic models
sentence_transformers = "^2.2.2" # doesn't install torch properly with poetry, but should be better in later versions
Expand Down

0 comments on commit 7bab669

Please sign in to comment.