## CSV EXCEL INGESTION

In [2]:
import pandas as pd
import os

In [3]:
# create sample data and save as CSV

data = {
    'Product': ['Laptop', 'Chair', 'T-Shirt'],
    'Category': ['Electronics', 'Furniture', 'Clothing'],
    'Price': [100, 150, 200],
    'Stock': [30, 20, 50],
    'Discription': [
        'High-quality electronic product',
        'Comfortable and stylish furniture',
        'Trendy and affordable clothing'
    ]
}

df = pd.DataFrame(data)
csv_path = '../data/structured_files/products.csv'
df.to_csv(csv_path, index=False)

In [4]:
# create sample data with sheet with xlsx

excel_path = '../data/structured_files/products_data.xlsx'
    

# CSV Proccessing

In [5]:
from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import UnstructuredCSVLoader

In [6]:
# Load CSV using CSVLoader

print('CSV Loader: row based document loader')
csv_loader = CSVLoader(file_path=csv_path, encoding='utf-8', csv_args={'delimiter': ',','quotechar': '"'})
csv_documents = csv_loader.load()

print(f"Loaded {len(csv_documents)} documents using CSVLoader.")
print('first document:')
print(csv_documents[0])
print('meta data:',csv_documents[0].metadata)

CSV Loader: row based document loader
Loaded 3 documents using CSVLoader.
first document:
page_content='Product: Laptop
Category: Electronics
Price: 100
Stock: 30
Discription: High-quality electronic product' metadata={'source': '../data/structured_files/products.csv', 'row': 0}
meta data: {'source': '../data/structured_files/products.csv', 'row': 0}


In [7]:
from typing import List
from langchain_core.documents import Document
import pandas as pd

def process_csv_with_custom_metadat(filePath: str) -> List[Document]:
    documents = []
    df = pd.read_csv(filePath)
    
    for index, row in df.iterrows():
        page_content = f"""Product Information:
                Product: {row['Product']}
                Category: {row['Category']}
                Price: {row['Price']}
                Stock: {row['Stock']}
                Description: {row['Discription']}
                """
        custom_document = Document(
                page_content = page_content,
                metadata={
                    'source': os.path.basename(filePath),
                    'row_index': index,
                    'Product': row['Product'],
                    'Category': row['Category'],
                    'Price': row['Price'],
                    'Stock': row['Stock'],
                    'data_type': 'product_info'
            }
        )
        documents.extend(custom_document)
    
    return documents

In [8]:
process_csv_with_custom_metadat(csv_path)

[('id', None),
 ('metadata',
  {'source': 'products.csv',
   'row_index': 0,
   'Product': 'Laptop',
   'Category': 'Electronics',
   'Price': 100,
   'Stock': 30,
   'data_type': 'product_info'}),
 ('page_content',
  'Product Information:\n                Product: Laptop\n                Category: Electronics\n                Price: 100\n                Stock: 30\n                Description: High-quality electronic product\n                '),
 ('type', 'Document'),
 ('id', None),
 ('metadata',
  {'source': 'products.csv',
   'row_index': 1,
   'Product': 'Chair',
   'Category': 'Furniture',
   'Price': 150,
   'Stock': 20,
   'data_type': 'product_info'}),
 ('page_content',
  'Product Information:\n                Product: Chair\n                Category: Furniture\n                Price: 150\n                Stock: 20\n                Description: Comfortable and stylish furniture\n                '),
 ('type', 'Document'),
 ('id', None),
 ('metadata',
  {'source': 'products.csv',


In [9]:
print('Comparison between CSVLoader and custom metadata loader:')
print('Normal csv loader loses table content structure')
print('Custom metadata loader retains table structure in content and with rich metadata')

Comparison between CSVLoader and custom metadata loader:
Normal csv loader loses table content structure
Custom metadata loader retains table structure in content and with rich metadata


### Excel Processing

In [10]:
## Excel Processing with multiple sheets with pandas

def process_excel_with_custom_metadata(filePath: str) -> List[Document]:
    documents = []
    xls = pd.ExcelFile(filePath)
    print(f"Found sheets: {xls.sheet_names}")
    
    for sheet_name in xls.sheet_names:
        df = pd.read_excel(xls, sheet_name=sheet_name)
        
        for index, row in df.iterrows():
            page_content = f"""
                        Sheet: {sheet_name}
                        rows: {len(df)}
                        columns: {', '.join(df.columns)}
                    """
            custom_document = Document(
                    page_content = page_content,
                    metadata={
                        'source': os.path.basename(filePath),
                        'sheet_name': sheet_name,
                        'row_index': index,
                        'Category': row['Category'],
                        'Price': row['Price'],
                        'data_type': 'product_info'
                }
            )
            documents.extend(custom_document)
    
    return documents

In [11]:
process_excel_with_custom_metadata(excel_path)

Found sheets: ['Sheet1', 'Sheet2']


[('id', None),
 ('metadata',
  {'source': 'products_data.xlsx',
   'sheet_name': 'Sheet1',
   'row_index': 0,
   'Category': 'Electronics',
   'Price': 100,
   'data_type': 'product_info'}),
 ('page_content',
  '\n                        Sheet: Sheet1\n                        rows: 5\n                        columns: Product, Category, Price, Stock, Description\n                    '),
 ('type', 'Document'),
 ('id', None),
 ('metadata',
  {'source': 'products_data.xlsx',
   'sheet_name': 'Sheet1',
   'row_index': 1,
   'Category': 'Furniture',
   'Price': 150,
   'data_type': 'product_info'}),
 ('page_content',
  '\n                        Sheet: Sheet1\n                        rows: 5\n                        columns: Product, Category, Price, Stock, Description\n                    '),
 ('type', 'Document'),
 ('id', None),
 ('metadata',
  {'source': 'products_data.xlsx',
   'sheet_name': 'Sheet1',
   'row_index': 2,
   'Category': 'Clothing',
   'Price': 200,
   'data_type': 'product

In [15]:
import pandas as pd
from langchain_community.document_loaders import UnstructuredCSVLoader

xls = pd.ExcelFile(excel_path)
all_documents = []

for sheet_name in xls.sheet_names:
    df = pd.read_excel(xls, sheet_name=sheet_name)
    csv_path = f'../data/structured_files/{sheet_name}.csv'
    df.to_csv(csv_path, index=False, encoding="utf-8")
    loader = UnstructuredCSVLoader(file_path=csv_path, mode="elements")
    docs = loader.load()
    all_documents.extend(docs)

# all_documents now contains documents from all sheets

In [17]:
all_documents

[Document(metadata={'source': '../data/structured_files/Sheet1.csv', 'file_directory': '../data/structured_files', 'filename': 'Sheet1.csv', 'last_modified': '2025-10-26T09:37:37', 'text_as_html': '<table><tr><td>Product</td><td>Category</td><td>Price</td><td>Stock</td><td>Description</td></tr><tr><td>Laptop</td><td>Electronics</td><td>100</td><td>30</td><td>High-quality electronic product</td></tr><tr><td>Chair</td><td>Furniture</td><td>150</td><td>20</td><td>Comfortable and stylish furniture</td></tr><tr><td>T-Shirt</td><td>Clothing</td><td>200</td><td>50</td><td>Trendy and affordable clothing</td></tr><tr><td>Book</td><td>Books</td><td>50</td><td>100</td><td>Informative and engaging reading material</td></tr><tr><td>Phone</td><td>Electronics</td><td>80</td><td>40</td><td>Latest model with advanced features</td></tr></table>', 'languages': ['eng'], 'filetype': 'text/csv', 'category': 'Table', 'element_id': '7ea38f0d5f50a9d63cba509d5ba1a94e'}, page_content='Product Category Price Stoc