# All-in-One RAG Pipeline Notebook
This notebook walks through document preprocessing, embedding, and querying with GPT-4 for transportation engineering.

In [1]:
!pip install -q sentence-transformers openai scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import re
import glob
import numpy as np
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import openai

In [3]:
# Mount Google Drive if needed
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Set your project folder path in Drive
BASE_PATH = '/content/drive/MyDrive/transpo_slm_assistant'
RAW_DIR = os.path.join(BASE_PATH, 'data/raw')
CLEAN_DIR = os.path.join(BASE_PATH, 'data/cleaned')
EMBED_DIR = os.path.join(BASE_PATH, 'data/embeddings')
os.makedirs(CLEAN_DIR, exist_ok=True)
os.makedirs(EMBED_DIR, exist_ok=True)

## Step 1: Preprocess Text Files

In [7]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    return text.strip()

# Clean and split into chunks
CHUNK_SIZE = 1000
OVERLAP = 200

def split_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    return [text[i:i+size] for i in range(0, len(text), size - overlap)]

for file_path in glob.glob(f'{RAW_DIR}/*.txt'):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        raw = f.read()
    cleaned = clean_text(raw)
    chunks = split_text(cleaned)
    for i, chunk in enumerate(chunks):
        fname = Path(file_path).stem + f'_chunk_{i}.txt'
        with open(os.path.join(CLEAN_DIR, fname), 'w') as out:
            out.write(chunk)
    print(f'Processed: {file_path}')

## Step 2: Embed Text Chunks

In [9]:
model = SentenceTransformer('all-MiniLM-L6-v2')

for file_path in glob.glob(f'{CLEAN_DIR}/*.txt'):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    emb = model.encode(text)
    fname = Path(file_path).stem + '.npy'
    np.save(os.path.join(EMBED_DIR, fname), emb)
    print(f'Embedded: {fname}')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Step 3: Query Your Assistant

In [10]:
# Load embeddings and texts
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = []
texts = []

for file_path in sorted(glob.glob(f'{EMBED_DIR}/*.npy')):
    emb = np.load(file_path)
    embeddings.append(emb)
    text_file = file_path.replace('embeddings', 'cleaned').replace('.npy', '.txt')
    with open(text_file, 'r', encoding='utf-8') as f:
        texts.append(f.read())

embeddings = np.array(embeddings)

def retrieve_context(query, top_k=3):
    query_vec = embed_model.encode([query])
    scores = cosine_similarity(query_vec, embeddings)[0]
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return [texts[i] for i in top_indices]

In [11]:
# Set your OpenAI API key
openai.api_key = 'your-api-key-here'  # Replace with os.getenv('OPENAI_API_KEY')

In [12]:
def generate_answer(query):
    context = retrieve_context(query)
    context_block = '\n---\n'.join(context)
    prompt = f"Answer the question based on the following documents:\n{context_block}\n\nQuestion: {query}\nAnswer:"
    response = openai.ChatCompletion.create(
        model='gpt-4',
        messages=[{'role': 'user', 'content': prompt}],
        temperature=0.2
    )
    return response['choices'][0]['message']['content']

In [13]:
# Ask a question
query = input("Ask a transportation engineering question: ")
print("\nAnswer:\n")
print(generate_answer(query))

Ask a transportation engineering question: what color is  stop sign

Answer:



ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.