In [None]:
!pip install openai

# Database options
!pip install chromadb # if you use chromadb as your vector database

# Others
!pip install langchain-community # if you use langchain for orchastration
!pip install transformers #if you use huggingface for vector embedding

In [None]:
# Enable the GPU if one is available. A GPU speeds up vector embedding when the
# vectors are computed locally (i.e. not through an API).

import torch

# Fall back to the CPU when CUDA is not available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print(f"Using device: {device}")

In [None]:

import os
import json
import chromadb
import openai
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

# Set OpenAI API Key
# Only fill in the placeholder when no key is configured yet, so a real key that was
# supplied through the environment is not overwritten with an empty string.
# Prefer providing the key via the environment rather than pasting it into the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = ""


In [None]:
# Mount Google Drive at /content/drive/ using the google.colab helper.
# The dataset paths used in the following cells are located under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Google Drive path of the dataset
folder_path = "/content/drive/Shared drives/Datathon/Data/hackathon_data/"

# os.listdir returns entries in arbitrary order; sort them so that previews of the
# listing (e.g. the first 10 files) are the same on every run.
files_in_folder = sorted(os.listdir(folder_path))

# Number of entries found in the dataset folder (displayed as the cell output)
len(files_in_folder)

In [None]:
print("total file number:", len(files_in_folder))
# Preview only the first 10 file names to keep the output short
for f in files_in_folder[:10]:
    print(f)

In [None]:
import json

# Location of the single company file inspected in this cell
json_path = "/content/drive/Shared drives/Datathon/Data/hackathon_data/1-act.com.json"

# Read the file as UTF-8 and parse its JSON content
with open(json_path, mode='r', encoding='utf-8') as json_file:
    data = json.loads(json_file.read())

# Keep the contents stored under 'text_by_page_url' (empty dict when the key is absent)
page_texts = data.get("text_by_page_url", {})


In [None]:
from transformers import pipeline

# `text` is only assigned in a later cell, so on a fresh top-to-bottom run this cell has
# to pick its own input: use the home-page entry of `page_texts` loaded in the cell above
# (the same page the keyword-extraction cell below works on).
text = page_texts["http://1-act.com/"]

# aggregation_strategy="simple" groups word pieces into whole entities; it is the
# replacement for the deprecated grouped_entities=True flag.
ner = pipeline("ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
entities = ner(text[:1000])  # only the first 1000 characters, to limit inference cost

# Print one detected entity per line
for e in entities:
    print(e)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import json

# Read and parse the company JSON file
json_path = "/content/drive/Shared drives/Datathon/Data/hackathon_data/1-act.com.json"
with open(json_path, mode='r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Home-page text stored under 'text_by_page_url'
text = data["text_by_page_url"]["http://1-act.com/"]

# Fit TF-IDF on this one page (the corpus passed in is a single document)
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform([text])

# Rank the vocabulary by score, highest first, and keep the 20 best terms
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = X.toarray()[0]
top_indices = np.flip(np.argsort(tfidf_scores))[:20]

print("\n🔑 Top Keywords:")
for term, score in zip(feature_names[top_indices], tfidf_scores[top_indices]):
    print(f"{term}: {score:.4f}")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# 1. Break the text into entries (one per line), dropping lines of 3 characters or fewer
items = [line.strip() for line in text.split("\n") if len(line.strip()) > 3]

# 2. TF-IDF vectorization of the entries
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(items)

# 3. Clustering (ex: divide into 5 clusters)
# KMeans raises ValueError when asked for more clusters than there are samples,
# so cap the cluster count at the number of entries for short pages.
n_clusters = min(5, len(items))
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(X)

# 4. Print each entry with the cluster it was assigned to
for label, item in zip(labels, items):
    print(f"[Cluster {label}] {item}")


In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain



In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

# Download the model locally and run it in-process.
# flan-t5 is an encoder-decoder (seq2seq) model, so it must be served through the
# "text2text-generation" task (as the final RAG cell does); the "text-generation"
# task is meant for causal language models and does not support it.
pipe = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=128)

# Wrap the Hugging Face pipeline so LangChain chains can use it as an LLM
llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split the page text into short, slightly overlapping chunks
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)

# create_documents wraps every chunk in a Document itself, so the Document class
# (which this notebook never imports) does not have to be referenced here.
documents = text_splitter.create_documents([text])

# Raw chunk strings, kept for inspection
chunks = [doc.page_content for doc in documents]


In [None]:
# The retriever is created in the next cell, right after `vector_db` is built there.
# Building it in this cell ran before `vector_db` existed and stopped a fresh
# "Restart & Run All" with a NameError.


In [None]:
import tempfile

from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter

persist_directory = tempfile.mkdtemp()  # creates a temp folder for chroma to work in

# Split into short chunks.
# create_documents wraps every chunk in a Document itself, so the Document class
# (which this notebook never imports) does not have to be referenced here.
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)
documents = text_splitter.create_documents([text])
chunks = [doc.page_content for doc in documents]  # raw chunk strings, kept for inspection

# Embed the chunks and build the vector DB
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vector_db = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    persist_directory=persist_directory
)

# HuggingFace model, run locally and wrapped for LangChain
pipe = pipeline("text2text-generation", model="google/flan-t5-base", max_new_tokens=256)
llm = HuggingFacePipeline(pipeline=pipe)

# Build retriever with limited context: only the 2 most relevant chunks are returned
retriever = vector_db.as_retriever(search_kwargs={"k": 2})

# Final RAG chain
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Ask question
query = "Where is it located?"
response = qa.run(query)
print(response)
