In [1]:
# ✅ 1. Install required libraries
# Use the %pip magic rather than !pip so the packages are installed into the
# environment of the kernel that is actually running this notebook.
%pip install --quiet langchain langchain-openai faiss-cpu python-dotenv pymupdf openai

In [2]:
# ✅ 2. Import necessary packages
import os
import re
from dotenv import load_dotenv
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI


In [4]:
# Load variables from a local .env file (if present) into the process environment and
# read the OpenAI key from it. load_dotenv is already imported in the imports cell.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Fail fast with an actionable message rather than a less obvious error raised
# later by the client constructors or the first API call.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to your environment or to a .env file."
    )

# Shared clients reused by the cells below.
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)


  llm = OpenAI(openai_api_key=api_key)


In [5]:
#pdf loader

In [6]:
# Path of the PDF to index, relative to the notebook's working directory.
pdf_path = "./pdfs/textclustering.pdf"
# Construct the PyMuPDF-based loader for that file; the pages are loaded in a later cell via loader.load().
loader = PyMuPDFLoader(pdf_path)

In [7]:
# Load the PDF through the loader; the result is indexable and each item exposes `page_content`.
documents = loader.load()

In [8]:
# Sanity check: show the first 100 characters of the first loaded document.
print(documents[0].page_content[:100])

The peer-reviewed version of this paper is published in the International Journal of Cognitive Compu


In [9]:
#Text Splitting

In [11]:
# Configure the splitter that breaks the documents into chunks.
# chunk_size: maximum number of characters per chunk
# chunk_overlap: number of characters shared between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

In [12]:
# Split the loaded documents into smaller chunks with the splitter configured earlier.
# split_documents is given the list of Document objects returned by loader.load().
texts = text_splitter.split_documents(documents)

In [13]:
# Report how many chunks the splitter produced.
print(f"Number of text chunks: {len(texts)}")

Number of text chunks: 71


In [14]:
#Embeddings

In [15]:
#RAG Chain creation

In [16]:
# Embedding client used to vectorize the chunks, authenticated with the key read from OPENAI_API_KEY.
# NOTE(review): an `embeddings` object is already created with the same argument in the setup cell,
# so this re-creation looks redundant.
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

In [17]:
# Embed the chunks in `texts` and build a FAISS vector store from them.
vectorstore = FAISS.from_documents(texts, embeddings)

In [18]:
# Build the RetrievalQA chain from the FAISS retriever and the OpenAI LLM.
# Reuse the `llm` client created in the setup cell instead of constructing a second,
# identically configured OpenAI instance (consistent with the later chain cells).
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)

In [19]:
# Ask the chain a question about the indexed PDF and print the answer.
query = "What are the main contributions of this paper?"
# `Chain.run` is deprecated in LangChain; `invoke` takes the inputs as a dict and
# returns a dict whose "result" entry holds the answer text, so `result` stays a string.
result = qa_chain.invoke({"query": query})["result"]
print(result)

  result = qa_chain.run(query)


 The main contributions of this paper are testing and identifying optimal combinations of embeddings and clustering algorithms for text clustering tasks, comparing the performance of embeddings derived from large language models (LLMs) with traditional embedding techniques, and evaluating the impact of model size and dimensionality reduction on clustering efficiency. 


In [20]:
#Text cleaning

In [22]:
#clean_text : cleans input text by removing page numbers, collapsing extra blank lines, and stripping unwanted line characters
def clean_text(text):
    """Normalize raw page text extracted from a PDF.

    Three regex substitutions are applied in order, then the result is trimmed:

    1. drop "Page N of M" markers,
    2. collapse each run of blank (or whitespace-only) lines into one newline,
    3. delete every character that is not a word character, whitespace,
       or one of ``. , ? !``.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    substitutions = (
        (r"Page \d+ of \d+", ""),   # page-number markers
        (r"\n\s*\n", "\n"),         # blank-line runs -> single newline
        (r"[^\w\s.,?!]", ""),       # anything outside the allowed character set
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [23]:
# Reload the PDF so the cleaning step below starts from freshly loaded documents.
# NOTE(review): this rebinds `documents`, replacing the list loaded earlier in the notebook.
documents = loader.load()

In [24]:
# Clean every document in place by passing its page_content through clean_text.
for doc in documents:
    # Overwrite the text with its cleaned version (page markers, extra blank lines
    # and disallowed characters removed).
    doc.page_content = clean_text(doc.page_content)

In [25]:
# Re-create the splitter with the same settings as before (chunks of about 1000 characters,
# each overlapping the previous chunk by 100 characters) and split the cleaned documents.
# This rebinds `texts` to the chunks of the cleaned text.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)  # Split the cleaned documents into smaller chunks

In [26]:
#FAISS VECTORSTORE

In [27]:
# Re-embed the cleaned chunks so the index written to disk reflects the cleaned text.
# Without this step `vectorstore` is still the index built earlier from the uncleaned
# chunks, and the cleaning above would have no effect on retrieval.
vectorstore = FAISS.from_documents(texts, embeddings)
# Save the FAISS vector store to local disk
vectorstore.save_local("faiss_index")

In [28]:
# Reload the vector store saved under "faiss_index", passing the `embeddings` client to it.
# Set allow_dangerous_deserialization=True ONLY IF you trust the data source.
# NOTE(review): presumably the saved store is unpickled on load, so an untrusted
# index directory could execute arbitrary code — confirm before loading foreign files.
vectorstore = FAISS.load_local(
    "faiss_index",
    embeddings,
    allow_dangerous_deserialization=True
)

In [29]:
# Recreate embedding object with your OpenAI API key
# NOTE(review): the preceding cell already used `embeddings` to load the index, so this
# re-creation appears redundant.
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

In [30]:
# Initialize OpenAI LLM with your API key
# NOTE(review): `llm` is already created with the same argument in the setup cell.
llm = OpenAI(openai_api_key=api_key)

In [31]:
# Wire the LLM to the reloaded FAISS index: the vector store is exposed as a
# retriever and handed to RetrievalQA together with `llm`.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectorstore.as_retriever())


In [32]:
# Create the RetrievalQA chain with the vectorstore retriever
# NOTE(review): this cell repeats the chain construction from the previous cell and simply
# rebinds `qa_chain` to an equivalent chain; it looks like a leftover duplicate that can be removed.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever()
)


In [33]:
# Run a query against your PDF documents
query = "What are the main contributions of this paper?"
# `Chain.run` is deprecated in LangChain; `invoke` takes the inputs as a dict and
# returns a dict whose "result" entry holds the answer text, so `result` stays a string.
result = qa_chain.invoke({"query": query})["result"]

In [34]:
# Display the answer produced for the query in the previous cell.
print(result)

 The main contributions of this paper are: 
1. Testing and identifying optimal combinations of embeddings and clustering algorithms for text clustering tasks 
2. Evaluating the performance of LLM embeddings compared to traditional techniques 
3. Examining the impact of model size and dimensionality reduction on clustering performance 
4. Highlighting the need to balance detailed text representation with computational feasibility in text clustering tasks.


In [35]:
#FAISS Index optimization => Create IVF(Inverted File index)

In [36]:
# Embedding client for the IVF experiment below.
# NOTE(review): same construction as the earlier `embeddings` cells; likely redundant.
embeddings = OpenAIEmbeddings(openai_api_key=api_key)

In [37]:
# Rebuild the vector store from the current `texts` (the cleaned chunks at this point in the notebook).
vectorstore = FAISS.from_documents(texts, embeddings)

In [38]:
import faiss
import numpy as np

In [39]:
# Length of the embedding vector, measured by embedding a throwaway query string.
dimension = len(embeddings.embed_query("test"))

In [40]:
# Show the embedding dimensionality that the IVF index will be created with.
print(dimension)

1536


In [41]:
#IVF(Inverted File index)
"""
What is IVF: an index structure that speeds up vector similarity search by partitioning the vector space into multiple clusters (each represented by a centroid) and assigning each vector to its nearest cluster.
Why use IVF: Searching through all vectors in a large dataset is computationally expensive and slow. IVF helps by limiting the search to only the most relevant clusters, dramatically improving search speed.
How does it work?
The vector space is first partitioned into n clusters using k-means clustering. This step requires calling index.train(vectors).
Each vector is assigned to its closest centroid (cluster center).
During search, instead of scanning all clusters, only the few clusters closest to the query are searched, making the process much faster.
"""

'\nWhat is IVF : used to speed up vector similarity search by partitioning the entire vector space into multiple clusters (also called centroids), and assigning each vector to its nearest cluster.\nwhy use IVF : Searching through all vectors in a large dataset is computationally expensive and slow. IVF helps by limiting the search to only the most relevant clusters, dramatically improving search speed.\nHow it work? \nThe vector space is first partitioned into n clusters using k-means clustering. This step requires calling index.train(vectors).\nEach vector is assigned to its closest centroid (cluster center).\nDuring search, instead of scanning all clusters, only a few relevant clusters are searched, making the process much faster.\n'

In [42]:
# IVF index setup
nlist = 10  # number of clusters (coarse centroids) for the IVF index
quantizer = faiss.IndexFlatL2(dimension)  # flat index using L2 distance, used as the coarse quantizer


In [43]:
# Create an IVF flat index (coarse quantizer + flat index within each cluster).
# It is trained and populated in a later cell.
# - quantizer: used to assign vectors to clusters
# - dimension: dimensionality of the embedding vectors
# - nlist: number of clusters (coarse centroids)
# - faiss.METRIC_L2: use L2 (Euclidean) distance for similarity search
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

In [44]:
# Retrieve vectors from the existing FAISS index inside the vectorstore
# reconstruct_n(start, count) returns 'count' vectors starting from index 'start';
# here start=0 and count=ntotal, i.e. every vector stored in the index.
vectors = vectorstore.index.reconstruct_n(0, vectorstore.index.ntotal)  # (ntotal, dimension) ndarray

In [45]:
# Debugging aid: confirm the container type and shape of the extracted vectors.
print(type(vectors), vectors.shape)

<class 'numpy.ndarray'> (70, 1536)


In [47]:
 # Normalize `vectors` to the layout FAISS expects: a C-contiguous float32 ndarray.
# The conversion is applied unconditionally because an ndarray of another dtype
# (e.g. float64) would pass an isinstance check yet still be unsuitable for FAISS;
# when the array already has this layout, ascontiguousarray returns it without copying.
vectors = np.ascontiguousarray(vectors, dtype='float32')

In [48]:
# Train the IVF index if it has not been trained yet, then add the vectors.
if not index.is_trained:
    print("Index is not trained. Training now...")
    index.train(vectors)   # runs the clustering step that the IVF index needs before vectors can be added
# Only add when the index is still empty, so re-running this cell does not insert
# the same vectors a second time.
if index.ntotal == 0:
    index.add(vectors)
    print("Vectors added successfully.")
else:
    print(f"Index already holds {index.ntotal} vectors; skipping add.")

Index is not trained. Training now...
Vectors added successfully.


