In [16]:
import hashlib
import logging
import os
import pickle
import re

import bs4
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, WebBaseLoader
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_huggingface.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import (CharacterTextSplitter,
                                      RecursiveCharacterTextSplitter)
from pinecone import Pinecone, ServerlessSpec
from pinecone.data.index import Index
from tqdm import tqdm
from typing_extensions import List, TypedDict

In [2]:
# from rag_pipeline import RetrieverModel
DEVICE = ('cuda' if torch.cuda.is_available() else
          'mps' if torch.backends.mps.is_available() else 'cpu')
DEVICE

'cuda'

In [3]:
embeddings = HuggingFaceEmbeddings(model_name='all-mpnet-base-v2',
                                   model_kwargs={'device': DEVICE})
# dimension = len(embeddings.embed_documents(['test'])[0])
# dimension

In [4]:
pc = Pinecone()
pc_index = pc.Index('chunking-test')
vector_store = PineconeVectorStore(pc_index, embeddings)

In [5]:
loader = DirectoryLoader('raw_data', glob='**/*.txt',
                         show_progress=True, use_multithreading=False)
docs = loader.load()

100%|██████████| 136/136 [01:41<00:00,  1.34it/s]


In [8]:
[doc.metadata['source'] for doc in docs[:10]]

['raw_data/firesale.txt',
 'raw_data/sports_pittsburgh_wiki.txt',
 'raw_data/pittsburghvisit_conbined.txt',
 'raw_data/citypittsburgh_conbined.txt',
 'raw_data/pirates/pirates_nonroster.txt',
 'raw_data/pirates/pirates_roster.txt',
 'raw_data/pirates/pirate_coach_info.txt',
 'raw_data/pirates/pirates_schedule_2025.txt',
 'raw_data/pirates/pirate_transaction_info.txt',
 'raw_data/tax_info/tax_website.txt']

In [9]:
text_splitter = SemanticChunker(embeddings)
chunks = [chunk for doc in tqdm(docs)
          for chunk in text_splitter.split_documents([doc])]

100%|██████████| 136/136 [05:43<00:00,  2.53s/it]


In [12]:
with open('semantic_chunks.pkl', 'wb') as f:
    pickle.dump(chunks, f)

In [13]:
len(chunks)

8156

In [15]:
lengths = [len(chunk.page_content) for chunk in chunks]

In [17]:
lengths = pd.Series(lengths)
lengths.describe()

count      8156.000000
mean       3356.156081
std       12992.914470
min           0.000000
25%         405.000000
50%        1453.500000
75%        3428.250000
max      375970.000000
dtype: float64

In [28]:
lengths[lengths <= 10000].describe()

count     7727.000000
mean      2032.916009
std       2169.957951
min          0.000000
25%        368.500000
50%       1261.000000
75%       2991.000000
max      10000.000000
dtype: float64

In [25]:
for i in lengths.index:
    if lengths[i] > 10000:
        print(chunks[i].page_content)
        break

URL: https://en.wikipedia.org/wiki/Sports_in_Pittsburgh

TITLE: Sports_in_Pittsburgh

CONTENT:

See also: American football in Western Pennsylvania Main article: Pittsburgh As the home of the MLB 's Pittsburgh Pirates baseball team, PNC Park is located in North Shore , in front of the Allegheny River , Roberto Clemente Bridge , and the Pittsburgh city skyline . Sports in Pittsburgh have been played dating back to the American Civil War . Baseball , hockey , and the first professional American football game had been played in the city by 1892. Pittsburgh was first known as the "City of Champions" when the Pittsburgh Pirates , Pittsburgh Panthers football team , and Pittsburgh Steelers won multiple championships in the 1970s. [ 1 ] Today, the city has three major professional sports franchises, the Pirates, Steelers, and Penguins ; while the University of Pittsburgh Panthers compete in a Division I Power Five conference , the highest level of collegiate athletics in the United States, in

In [None]:
vector_store.add_documents(chunks)

API call failed after 3 attempts: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 13 Mar 2025 16:47:28 GMT', 'Content-Type': 'application/json', 'Content-Length': '115', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '10', 'x-pinecone-request-id': '2389069407928395462', 'x-envoy-upstream-service-time': '2', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata size is 59901 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}



PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 13 Mar 2025 16:47:28 GMT', 'Content-Type': 'application/json', 'Content-Length': '115', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '10', 'x-pinecone-request-id': '2389069407928395462', 'x-envoy-upstream-service-time': '2', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata size is 59901 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}


API call failed after 3 attempts: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 13 Mar 2025 16:47:31 GMT', 'Content-Type': 'application/json', 'Content-Length': '115', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '12', 'x-pinecone-request-id': '1269923821580933383', 'x-envoy-upstream-service-time': '2', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata size is 73711 bytes, which exceeds the limit of 40960 bytes per vector","details":[]}

API call failed after 3 attempts: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 13 Mar 2025 16:47:35 GMT', 'Content-Type': 'application/json', 'Content-Length': '115', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '10', 'x-pinecone-request-id': '2799722801074898343', 'x-envoy-upstream-service-time': '1', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Metadata size is 76768 bytes, which exceeds the limit of 40960 bytes per v

In [None]:
max_length = 10000

re_chunks = []