In [1]:
!pip install pandas torch sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-m

# **1. IMPORT LIBRARIES AND DEFINE THE RAG PIPELINE**

# **1.1. Data Loading and Corpus Generation**

In [12]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import time
import os

# --- Step 1: Data Loading and Corpus Generation ---
def _format_age(value) -> str:
    """Render an AGE cell for the document text; missing values become 'Unknown'."""
    if pd.isna(value):
        return 'Unknown'
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Non-numeric content (e.g. the 'Unknown' default) is passed through as text.
        return str(value)
    # A column that contains any NaN is read as float, so 40 arrives as 40.0;
    # print whole ages without the trailing ".0".
    return str(int(number)) if number.is_integer() else str(value)


def _whole_minutes(value, default: int = 0) -> int:
    """Convert a STOP_DURATION cell to an int, falling back to `default` when it is missing or not numeric."""
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # NaN -> ValueError, inf -> OverflowError, None/NA -> TypeError.
        return default


def load_and_prepare_corpus(filepath: str):
    """
    Loads the FULL RIPA dataset and generates a descriptive text document
    for each stop, tolerating missing (NaN) cells in the fields it reads.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The DataFrame read from `filepath` with an added 'document' column
        holding one sentence-style description per row, or None (after
        printing an error) when `filepath` does not exist.

    Missing values are described as follows: age -> 'Unknown',
    reason -> 'unknown reason', duration -> 0 (the same values used when the
    corresponding column is absent altogether).
    """
    print("Loading and preparing data...")
    if not os.path.exists(filepath):
        print(f"❌ Error: The file '{filepath}' was not found.")
        print("Please make sure the CSV file is uploaded to your Colab session.")
        return None

    # We will use the full dataset now to maximize our chances.
    df = pd.read_csv(filepath, low_memory=False)

    # Every ADS_* column is treated as a 0/1 flag for one action taken during the stop.
    action_cols = [col for col in df.columns if col.startswith('ADS_')]
    race_cols = {
        'RAE_ASIAN': 'Asian', 'RAE_BLACK_AFRICAN_AMERICAN': 'Black/African American',
        'RAE_HISPANIC_LATINO': 'Hispanic/Latino', 'RAE_MIDDLE_EASTERN_SOUTH_ASIAN': 'Middle Eastern/South Asian',
        'RAE_NATIVE_AMERICAN': 'Native American', 'RAE_PACIFIC_ISLANDER': 'Pacific Islander', 'RAE_WHITE': 'White'
    }

    documents = []
    for _, row in df.iterrows():
        age = _format_age(row.get('AGE', 'Unknown'))

        # Only the first flagged race column (in race_cols order) is mentioned.
        races = [race_name for col, race_name in race_cols.items() if row.get(col) == 1]
        race_desc = races[0] if races else "Unknown Race"

        # A recorded traffic-violation type takes precedence over the generic reason.
        violation_type = row.get('RFS_TRAFFIC_VIOLATION_TYPE')
        if pd.notna(violation_type):
            reason = f"{str(violation_type).lower()} traffic violation"
        else:
            raw_reason = row.get('REASON_FOR_STOP', 'Unknown Reason')
            # Without this check a NaN cell would be rendered literally as "nan".
            reason = str('Unknown Reason' if pd.isna(raw_reason) else raw_reason).lower()

        actions_taken = [col.replace('ADS_', '').replace('_', ' ').lower() for col in action_cols if row.get(col) == 1]
        actions_desc = f"Actions taken included: {', '.join(actions_taken)}." if actions_taken else "No specific actions were recorded."

        # int(NaN) raises ValueError, so convert defensively.
        duration = _whole_minutes(row.get('STOP_DURATION', 0))

        doc = (
            f"A stop involving a {age}-year-old {race_desc} individual for a {reason}. "
            f"{actions_desc} "
            f"The total duration was {duration} minutes."
        )
        documents.append(doc)

    df['document'] = documents
    print(f"✅ Successfully created {len(df)} documents for embedding.")
    return df

# **2. Embedding the Corpus and Creating a Vector Store**

In [13]:
# --- Step 2: Embedding the Corpus and Creating a Vector Store ---
def create_vector_store(df: pd.DataFrame, model_name: str = 'all-MiniLM-L6-v2'):
    """
    Encode every entry of df['document'] with a SentenceTransformer and load
    the resulting vectors into a FAISS index keyed by the DataFrame's index values.

    Args:
        df: DataFrame with a 'document' column of text to embed.
        model_name: Name passed to SentenceTransformer to select the encoder.

    Returns:
        A tuple (index, model): the populated FAISS index and the encoder used.
    """
    print("\nEmbedding corpus... (This will take a few minutes for the full dataset)")
    started_at = time.time()

    encoder = SentenceTransformer(model_name)
    texts = df['document'].tolist()
    vectors = encoder.encode(texts, show_progress_bar=True)

    # Flat L2 index wrapped in an ID map, so vectors are stored under the
    # DataFrame's own index values instead of positional ids.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    id_index = faiss.IndexIDMap(flat_index)
    id_index.add_with_ids(np.array(vectors, dtype='float32'), df.index.values)

    elapsed = time.time() - started_at
    print(f"✅ Embedding and indexing complete in {elapsed:.2f} seconds.")
    return id_index, encoder


# **3. The RAG Pipeline Logic (with Flexible Filtering)**

In [17]:
def run_rag_pipeline(query: dict, index: "faiss.Index", model: "SentenceTransformer",
                     corpus_df: pd.DataFrame, k: int = 50, inspect: "bool | None" = None):
    """
    Simulates the RAG pipeline: retrieves the stops most similar to `query`,
    optionally shows them, then keeps only those that were shorter than the
    current stop and skipped the proposed action(s), and builds a suggestion.

    Args:
        query: Dict with keys 'age', 'race', 'violation', 'proposed_actions'
            (free text) and 'current_duration' (compared against STOP_DURATION).
        index: Object whose `search(vectors, k)` returns (distances, ids).
        model: Object whose `encode(list_of_str)` returns the query embedding.
        corpus_df: DataFrame indexed by the ids stored in `index`; must contain
            'STOP_DURATION' and the action columns referenced in `action_map`.
        k: Number of neighbours to request from the index.
        inspect: True/False to show/skip the retrieved stops without prompting;
            None (default) asks the user interactively via input().

    Returns:
        A suggestion string, or an explanatory message when no faster
        alternative is found or no known action phrase is recognised.

    The type annotations for `index` and `model` are quoted so they are not
    evaluated when the function is defined.
    """
    print("\n" + "="*50)
    print("🚀 Running RAG Pipeline for New Query")
    print("="*50)

    query_doc = (
        f"A stop involving a {query['age']}-year-old {query['race']} individual "
        f"for a {query['violation']}. Actions being considered are {query['proposed_actions']}."
    )
    print(f"📝 Formatted Query: {query_doc}")
    query_embedding = model.encode([query_doc])

    _, ids = index.search(np.array(query_embedding, dtype='float32'), k)
    # FAISS pads the result with id -1 when fewer than k neighbours exist;
    # looking those up with .loc would raise KeyError, so drop them first.
    neighbour_ids = ids[0]
    retrieved_docs = corpus_df.loc[neighbour_ids[neighbour_ids >= 0]]
    print(f"\n🔍 Retrieved {len(retrieved_docs)} potentially similar past stops.")

    # --- INSPECTION STEP ---
    # Ask the user only when the caller did not decide via `inspect`.
    if inspect is None:
        answer = input("Would you like to see the details of these retrieved stops? (yes/no): ")
        inspect = answer.strip().lower() == 'yes'

    if inspect:
        print(f"\n--- Top {len(retrieved_docs)} Retrieved Documents (Before Filtering) ---")
        # Columns that are most useful to see; any that are absent are skipped.
        display_cols = [
            'document',
            'STOP_DURATION',
            'ADS_VEHICLE_IMPOUND',
            'ADS_SEARCH_PROPERTY',
            'ROS_CITATION',
            'ROS_WARNING'
        ]
        shown_cols = [col for col in display_cols if col in retrieved_docs.columns]
        # option_context restores the caller's previous setting even if printing fails.
        with pd.option_context('display.max_colwidth', None):
            print(retrieved_docs[shown_cols])
        print("--------------------------------------------------\n")

    # --- DYNAMIC FILTERING LOGIC ---
    # phrase in the proposed actions -> (column, value meaning the action was skipped)
    action_map = {
        "vehicle impound": ("ADS_VEHICLE_IMPOUND", 0),
        "vehicle search": ("ADS_SEARCH_PROPERTY", 0),
        "issuing a full citation": ("ROS_CITATION", 0)
    }

    proposed = query['proposed_actions'].lower()
    mask = (retrieved_docs['STOP_DURATION'] < query['current_duration'])
    actions_to_check = []

    for phrase, (column, skip_value) in action_map.items():
        if phrase in proposed:
            actions_to_check.append(phrase)
            mask = mask & (retrieved_docs[column] == skip_value)

    efficient_stops = retrieved_docs[mask]

    if efficient_stops.empty:
        print("\n❌ No faster, alternative cases found that match the specified skipped actions.")
        return "Could not find any similar, faster past stops to generate a suggestion."

    # Checked before the success message so we never claim that actions were
    # skipped when none of the known phrases appeared in the query.
    if not actions_to_check:
        return "Logic Error: Found efficient stops but couldn't identify which actions were skipped."

    print(f"✅ Found {len(efficient_stops)} similar stops that were faster and skipped the proposed action(s).")

    actions_text = " and ".join(actions_to_check)
    # The best case is simply the shortest remaining stop.
    best_case_duration = int(efficient_stops['STOP_DURATION'].min())

    generated_suggestion = (
        f"Prior cases skipped {actions_text} and ended in {best_case_duration} mins "
        f"instead of {query['current_duration']}."
    )

    print("\n💡 --- Final Generated Suggestion --- 💡")
    return generated_suggestion

## **4. MAIN EXECUTION BLOCK**

In [18]:
if __name__ == "__main__":
    # --- 1. ONE-TIME SETUP ---
    # Loading the CSV and embedding the corpus happen once, before the query loop.
    print("--- Starting One-Time Setup ---")
    FILEPATH = 'RIPA_2023_Biggest_Three_Cities_Exceeding_10_Minutes.csv'
    corpus_df = load_and_prepare_corpus(filepath=FILEPATH)

    if corpus_df is not None:
        # This is the time-consuming step that we now only do once.
        index, model = create_vector_store(df=corpus_df)
        print("\n--- Setup Complete. Entering Interactive Query Mode. ---")
        print("Type 'exit' at any prompt to quit.")

        # (query field, prompt text) pairs, asked in this order for every query.
        query_prompts = (
            ("age", "Enter Age (e.g., 25): "),
            ("race", "Enter Race (e.g., White, Hispanic/Latino): "),
            ("violation", "Enter Violation (e.g., speeding): "),
            ("proposed_actions", "Enter Proposed Actions (e.g., vehicle search): "),
            ("current_duration", "Enter Current Duration in minutes (e.g., 6): "),
        )

        # --- 2. INTERACTIVE QUERY LOOP ---
        # Each iteration collects one query and runs it against the prepared index.
        while True:
            print("\n--- Enter New Query Details ---")

            try:
                answers = {}
                for field, prompt in query_prompts:
                    reply = input(prompt)
                    if reply.lower() == 'exit':
                        break
                    answers[field] = reply
                # Fewer answers than prompts means the user typed 'exit'.
                if len(answers) < len(query_prompts):
                    break

                # Only the numeric parsing is treated as "invalid input"; a
                # ValueError raised later inside the pipeline must not be
                # reported as a typing mistake.
                try:
                    age_value = int(answers["age"])
                    duration_value = int(answers["current_duration"])
                except ValueError:
                    print("\n❌ Invalid input. Please enter numbers for age and duration. Let's try again.")
                    continue

                # Construct the query dictionary
                interactive_query = {
                    "age": age_value,
                    "race": answers["race"],
                    "violation": answers["violation"],
                    "proposed_actions": answers["proposed_actions"],
                    "current_duration": duration_value
                }

                # Run the pipeline with the user's query
                suggestion = run_rag_pipeline(
                    query=interactive_query,
                    index=index,
                    model=model,
                    corpus_df=corpus_df
                )
                print(f"\n{suggestion}\n")

            except Exception as e:
                # Any other failure ends the interactive session after reporting it.
                print(f"An error occurred: {e}")
                break

        print("--- Exiting Interactive Mode. ---")

--- Starting One-Time Setup ---
Loading and preparing data...
✅ Successfully created 82435 documents for embedding.

Embedding corpus... (This will take a few minutes for the full dataset)


Batches:   0%|          | 0/2577 [00:00<?, ?it/s]

✅ Embedding and indexing complete in 1270.85 seconds.

--- Setup Complete. Entering Interactive Query Mode. ---
Type 'exit' at any prompt to quit.

--- Enter New Query Details ---
Enter Age (e.g., 25): 40
Enter Race (e.g., White, Hispanic/Latino): white
Enter Violation (e.g., speeding): speeding
Enter Proposed Actions (e.g., vehicle search): vehicle search
Enter Current Duration in minutes (e.g., 6): 22

🚀 Running RAG Pipeline for New Query
📝 Formatted Query: A stop involving a 40-year-old white individual for a speeding. Actions being considered are vehicle search.

🔍 Retrieved 50 potentially similar past stops.
Would you like to see the details of these retrieved stops? (yes/no): yes

--- Top 50 Retrieved Documents (Before Filtering) ---
                                                                                                                                                                                                                                                          