#### CSV and EXCEL Parsing ->STRUCTURED DATA

In [1]:
import pandas as pd
import os

In [2]:
os.makedirs("data/structured files",exist_ok=True)

In [3]:
# Create sample data
data = {
    'Product': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'Category': ['Electronics', 'Accessories', 'Accessories', 'Electronics', 'Electronics'],
    'Price': [999.99, 29.99, 79.99, 299.99, 89.99],
    'Stock': [50, 200, 150, 75, 100],
    'Description': [
        'High-performance laptop with 16GB RAM and 512GB SSD',
        'Wireless optical mouse with ergonomic design',
        'Mechanical keyboard with RGB backlighting',
        '27-inch 4K monitor with HDR support',
        '1080p webcam with noise cancellation'
    ]
}

# Save as CSV
df=pd.DataFrame(data)
df.to_csv("data/structured files/product.csv",index=False)

In [6]:
# Save as Excel with multiple sheets
with pd.ExcelWriter('data/structured files/inventory.xlsx') as writer:
    df.to_excel(writer, sheet_name='Products', index=False)
    
    # Add another sheet
    summary_data = {
        'Category': ['Electronics', 'Accessories'],
        'Total_Items': [3, 2],
        'Total_Value': [1389.97, 109.98]
    }
    pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False)

## CSV Processing

In [7]:
from langchain.document_loaders import CSVLoader
from langchain_community.document_loaders import UnstructuredCSVLoader

In [9]:
# Method 1:CSVLoader where each row will become a document object
loader=CSVLoader(
    file_path="data/structured files/product.csv",
    encoding="utf-8",
    csv_args={
        "delimiter":",",
        "quotechar":'"'
    }
)

docs=loader.load()
print(f"Created {len(docs)} documents")
print("\n\n")
print(docs)
print("\n\n")
print(f"First docs content {docs[0].page_content}")
print(f"FFirst document metadaata: {docs[0].metadata}")

Created 5 documents



[Document(metadata={'source': 'data/structured files/product.csv', 'row': 0}, page_content='Product: Laptop\nCategory: Electronics\nPrice: 999.99\nStock: 50\nDescription: High-performance laptop with 16GB RAM and 512GB SSD'), Document(metadata={'source': 'data/structured files/product.csv', 'row': 1}, page_content='Product: Mouse\nCategory: Accessories\nPrice: 29.99\nStock: 200\nDescription: Wireless optical mouse with ergonomic design'), Document(metadata={'source': 'data/structured files/product.csv', 'row': 2}, page_content='Product: Keyboard\nCategory: Accessories\nPrice: 79.99\nStock: 150\nDescription: Mechanical keyboard with RGB backlighting'), Document(metadata={'source': 'data/structured files/product.csv', 'row': 3}, page_content='Product: Monitor\nCategory: Electronics\nPrice: 299.99\nStock: 75\nDescription: 27-inch 4K monitor with HDR support'), Document(metadata={'source': 'data/structured files/product.csv', 'row': 4}, page_content='Product: Webcam\

### Custom CSV processing

In [13]:
from typing import List
from langchain_core.documents import Document

# Method 1: Custom CSV processing
print("\n Custom CSV Processing \n")
def custom_csv_processing(filepath: str)->List[Document]:
    "Customized CSV Procesing"
    df=pd.read_csv(filepath)
    documents=[]
    # Structure 1:One document per row with structured content
    for idx,row in df.iterrows():
        # Create structure content
        content=f""""Product information:
        Name: {row['Product']}
        category: {row['Category']}
        price: ${row['Price']}
        stock: {row['Stock']}
        description: {row['Description']}
        """

        # Create document with rich metadata
        doc=Document(
            page_content=content,
            metadata={
                "source":filepath,
                "row indx":idx,
                "category":row['Category'],
                "price": row['Price'],
                "data_type": "product_info"
            }
        )

        documents.append(doc)
    return documents


 Custom CSV Processing 



In [14]:
custom_csv_processing("data/structured files/product.csv")

[Document(metadata={'source': 'data/structured files/product.csv', 'row indx': 0, 'category': 'Electronics', 'price': 999.99, 'data_type': 'product_info'}, page_content='"Product information:\n        Name: Laptop\n        category: Electronics\n        price: $999.99\n        stock: 50\n        description: High-performance laptop with 16GB RAM and 512GB SSD\n        '),
 Document(metadata={'source': 'data/structured files/product.csv', 'row indx': 1, 'category': 'Accessories', 'price': 29.99, 'data_type': 'product_info'}, page_content='"Product information:\n        Name: Mouse\n        category: Accessories\n        price: $29.99\n        stock: 200\n        description: Wireless optical mouse with ergonomic design\n        '),
 Document(metadata={'source': 'data/structured files/product.csv', 'row indx': 2, 'category': 'Accessories', 'price': 79.99, 'data_type': 'product_info'}, page_content='"Product information:\n        Name: Keyboard\n        category: Accessories\n        pric

## Excel File Processing

In [19]:
from langchain_community.document_loaders import UnstructuredExcelLoader
excel_loader=UnstructuredExcelLoader(
    r"data\structured files\inventory.xlsx",
    mode='elements'
)

docs=excel_loader.load()
print(f"Created {len(docs)} documents")
print(docs)

Created 2 documents
[Document(metadata={'source': 'data\\structured files\\inventory.xlsx', 'file_directory': 'data\\structured files', 'filename': 'inventory.xlsx', 'last_modified': '2025-10-22T14:01:39', 'page_name': 'Products', 'page_number': 1, 'text_as_html': '<table><tr><td>Product</td><td>Category</td><td>Price</td><td>Stock</td><td>Description</td></tr><tr><td>Laptop</td><td>Electronics</td><td>999.99</td><td>50</td><td>High-performance laptop with 16GB RAM and 512GB SSD</td></tr><tr><td>Mouse</td><td>Accessories</td><td>29.99</td><td>200</td><td>Wireless optical mouse with ergonomic design</td></tr><tr><td>Keyboard</td><td>Accessories</td><td>79.99</td><td>150</td><td>Mechanical keyboard with RGB backlighting</td></tr><tr><td>Monitor</td><td>Electronics</td><td>299.99</td><td>75</td><td>27-inch 4K monitor with HDR support</td></tr><tr><td>Webcam</td><td>Electronics</td><td>89.99</td><td>100</td><td>1080p webcam with noise cancellation</td></tr></table>', 'languages': ['eng'], 