In [1]:
# Install required packages (run once)
!pip install langchain langchain-community langchain-huggingface faiss-cpu sentence-transformers huggingface-hub accelerate transformers streamlit --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m110.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [79]:
!pip install transformers torch





In [80]:
import pandas as pd
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

class RAGApp:
    """Retrieval-augmented question answering over a CSV of stop records.

    Construction reads the CSV, renders every row into a one-sentence
    context, samples a subset of those sentences, indexes them in FAISS and
    connects the index to a local text-generation model through a
    ``RetrievalQA`` chain.

    Args:
        csv_path: Path of the CSV file to load.
        sample_size: Maximum number of rows turned into documents.
        random_state: Seed used when sampling rows.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
        embedding_model: Model name passed to ``HuggingFaceEmbeddings``.
        llm_model: Model name passed to the ``text-generation`` pipeline.
        max_new_tokens: ``max_new_tokens`` passed to the pipeline.

    Raises:
        ValueError: If no row of the CSV yields a usable context.
    """

    # Sentinel returned by build_context for rows that cannot be rendered.
    UNAVAILABLE = "Context unavailable."

    def __init__(self, csv_path, sample_size=2000, random_state=42,
                 chunk_size=500, chunk_overlap=50,
                 embedding_model="sentence-transformers/all-MiniLM-L6-v2",
                 llm_model="gpt2", max_new_tokens=100):
        raw_df = pd.read_csv(csv_path, low_memory=False)
        print("Total rows before sampling:", len(raw_df))

        # Build a context per row. The unusable rows are counted BEFORE they
        # are dropped, so the printed number reflects what was discarded.
        self.df, unavailable_count = self._rows_with_context(raw_df)
        print("Empty contexts:", unavailable_count)
        if self.df.empty:
            raise ValueError(
                f"No usable rows in {csv_path!r}: every context was unavailable."
            )

        # Sample for performance
        sampled_df = self.df.sample(n=min(sample_size, len(self.df)),
                                    random_state=random_state)

        # Create documents (only the rendered sentence is needed per row)
        self.docs = [Document(page_content=text) for text in sampled_df['context']]
        print("Total documents:", len(self.docs))

        # Split into chunks
        self.splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap)
        self.split_docs = self.splitter.split_documents(self.docs)
        print("Total chunks after splitting:", len(self.split_docs))

        # Use local embedding model
        embed_model = HuggingFaceEmbeddings(model_name=embedding_model)
        self.vectorstore = FAISS.from_documents(self.split_docs, embedding=embed_model)
        self.retriever = self.vectorstore.as_retriever()

        # Load local LLM.
        # NOTE(review): the recorded output echoes the whole prompt before the
        # answer; `return_full_text=False` on the pipeline may be wanted here —
        # confirm against the installed langchain/transformers versions.
        generator = pipeline("text-generation", model=llm_model,
                             max_new_tokens=max_new_tokens)
        self.llm = HuggingFacePipeline(pipeline=generator)

        # RetrievalQA chain
        self.qa_chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever)

    @staticmethod
    def _rows_with_context(df):
        """Attach a ``context`` column and drop rows whose context is unavailable.

        Args:
            df: Frame whose rows are passed to ``build_context``.

        Returns:
            A ``(usable_rows, unavailable_count)`` tuple: a copy of ``df`` with
            a ``context`` column, restricted to rows with a usable context,
            and the number of rows that were dropped.
        """
        contexts = [RAGApp.build_context(row) for _, row in df.iterrows()]
        with_context = df.assign(context=contexts)
        unavailable = with_context['context'] == RAGApp.UNAVAILABLE
        return with_context[~unavailable], int(unavailable.sum())

    @staticmethod
    def build_context(row):
        """Render one record as a single English sentence.

        Args:
            row: Mapping-like record (for example a pandas Series or a dict).
                ``DATE_OF_STOP`` and ``TIME_OF_STOP`` are required; every
                other field falls back to a default when absent.

        Returns:
            The sentence describing the stop, or ``"Context unavailable."``
            when the record cannot be rendered.
        """
        try:
            date = row['DATE_OF_STOP']
            time = row['TIME_OF_STOP']
            city = row.get('CLOSEST_CITY', 'Unknown')

            ticket_count = row.get('ROS_CITATION_CDS', 0)
            warning_count = row.get('ROS_WARNING_CDS', 0)

            gender_cols = [
                'G_MALE', 'G_FEMALE', 'G_TRANSGENDER_MAN',
                'G_TRANSGENDER_WOMAN', 'G_GENDER_NONCONFORMING', 'G_MULTIGENDER'
            ]
            # The first flag column set to 1 wins; its name becomes the label.
            gender = next(
                (col.replace('G_', '').replace('_', ' ').title()
                 for col in gender_cols
                 if col in row and row[col] == 1),
                'Unknown',
            )

            person_type = 'Student' if row.get('STOP_STUDENT', 0) == 1 else 'Non-student'

            return (f"On {date} at {time}, a {person_type} {gender} was stopped in {city}. "
                    f"Tickets issued: {ticket_count}, Warnings issued: {warning_count}.")
        except Exception:
            # Deliberate best-effort: a record that cannot be rendered is
            # marked with the sentinel; the constructor counts and reports them.
            return RAGApp.UNAVAILABLE

    def query(self, question):
        """Run ``question`` through the RetrievalQA chain and return its result.

        Args:
            question: The question text, sent to the chain under the ``"query"`` key.

        Returns:
            Whatever ``qa_chain.invoke`` returns; the notebook reads the
            answer from its ``"result"`` key.
        """
        return self.qa_chain.invoke({"query": question})


In [81]:
# Build the whole pipeline (CSV load, indexing, model load) for the merged stops file.
rag = RAGApp(csv_path="RIPA_2023_Merged_Cities.csv")

Total rows before sampling: 70211
Empty contexts: 0
Total documents: 2000
Total chunks after splitting: 2000


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
  self.llm = HuggingFacePipeline(pipeline=generator)


In [82]:
# Ask one question and show only the generated text.
# NOTE(review): the index holds a sample of at most 2000 rows and the chain
# sees only the retrieved chunks, so counting questions like this one are
# unlikely to be answered correctly — confirm the intended use.
response = rag.query(
    "How many stops were there on 25-05-2023"
)
print(response["result"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.





Question: How many stops were there on 25-05-2023
Helpful Answer:



On 25-05-2023, a Non-student Female was stopped in BLUE LAKE. Tickets
