In [3]:
!pip install langchain streamlit faiss-cpu pytesseract pillow openai whisper


Collecting streamlit
  Downloading streamlit-1.40.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m179.3 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m774.0 kB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.40.2-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━

In [4]:
from google.colab import files
import zipfile
import os

# Upload the zip file
uploaded = files.upload()

# Extract the uploaded zip file
for filename in uploaded.keys():
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall("/content/uploaded_data")
print("Files extracted to /content/uploaded_data")
print("Contents:", os.listdir("/content/uploaded_data"))


Saving sample_data.zip to sample_data.zip
Files extracted to /content/uploaded_data
Contents: ['FAQs.txt', 'UserGuide.pdf', 'ErrorLogs.log', 'ProcessFlow.png']


In [5]:
%%writefile app.py
import os
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from PIL import Image
import pytesseract
import streamlit as st

# Data preprocessing functions
def preprocess_text_files(directory):
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    documents = []
    for file in os.listdir(directory):
        if file.endswith('.txt'):
            loader = TextLoader(os.path.join(directory, file))
        elif file.endswith('.pdf'):
            loader = PyPDFLoader(os.path.join(directory, file))
        else:
            continue
        docs = loader.load_and_split(splitter)
        documents.extend(docs)
    return documents

def process_images(directory):
    text_data = []
    for file in os.listdir(directory):
        if file.endswith(('.png', '.jpg', '.jpeg')):
            image_path = os.path.join(directory, file)
            text = pytesseract.image_to_string(Image.open(image_path))
            text_data.append(text)
    return text_data

def process_audio_video(directory):
    import whisper
    model = whisper.load_model("base")
    transcripts = []
    for file in os.listdir(directory):
        if file.endswith(('.mp4', '.mp3')):
            result = model.transcribe(os.path.join(directory, file))
            transcripts.append(result["text"])
    return transcripts

# Create vector store
def create_vector_store(documents):
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_documents(documents, embeddings)
    return vector_store

# RAG workflow
def rag_workflow(query, vector_store):
    retriever = vector_store.as_retriever()
    qa_chain = RetrievalQA.from_chain_type(llm=OpenAI(), retriever=retriever)
    response = qa_chain.run(query)
    return response

# Streamlit app
st.title("RAG System")
st.header("Ask Questions from the Knowledge Base")

uploaded_files_dir = "/content/uploaded_data"  # Colab-specific directory

if "vector_store" not in st.session_state:
    st.write("Preprocessing data and building vector store...")
    text_docs = preprocess_text_files(uploaded_files_dir)
    image_texts = process_images(uploaded_files_dir)
    audio_transcripts = process_audio_video(uploaded_files_dir)

    # Combine all data
    documents = text_docs + [{"text": text} for text in image_texts + audio_transcripts]
    st.session_state["vector_store"] = create_vector_store(documents)
    st.success("Vector store created!")

query = st.text_input("Enter your query:")
if query:
    response = rag_workflow(query, st.session_state["vector_store"])
    st.write("Response:", response)


Writing app.py


In [6]:
!pip install streamlit




In [7]:
!nohup streamlit run app.py &


nohup: appending output to 'nohup.out'


In [11]:
!python3 -m http.server 8503 &


Serving HTTP on 0.0.0.0 port 8503 (http://0.0.0.0:8503/) ...


KeyboardInterrupt: 

In [13]:
!npm install -g localtunnel


[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K
added 22 packages in 2s
[1G[0K⠙[1G[0K
[1G[0K⠙[1G[0K3 packages are looking for funding
[1G[0K⠙[1G[0K  run `npm fund` for details
[1G[0K⠙[1G[0K

In [14]:
!curl ifconfig.me


34.19.91.72

In [15]:
!lt --port 8501


your url is: https://rich-socks-check.loca.lt
^C
