In [3]:
pip install requests beautifulsoup4 pdfplumber python-pptx python-docx sentence-transformers faiss-cpu tqdm

Collecting requests
  Using cached requests-2.32.4-py3-none-any.whl (64 kB)
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[K     |████████████████████████████████| 187 kB 2.5 MB/s eta 0:00:01
[?25hCollecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 4.6 MB/s eta 0:00:01
[?25hCollecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[K     |████████████████████████████████| 472 kB 15.7 MB/s eta 0:00:01
[?25hCollecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[K     |████████████████████████████████| 252 kB 26.2 MB/s eta 0:00:01
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
[K     |████████████████████████████████| 345 kB 47.6 MB/s eta 0:00:01
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp39-cp39-macosx_14_0_arm64.whl (3.3 MB)
[K     |█████

In [20]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os

import pdfplumber
from pptx import Presentation
from docx import Document
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from tqdm import tqdm


import json



In [21]:


other_links = [
    "https://www.kennesaw.edu/ccse/first-year-experience/cse-1300.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/cse-1321-python.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/cse-1321-lab-python.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/cse-1322.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/cse-1322-lab.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/policies.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/faculty-staff.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/office-hours.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/old_syllabus_schedule.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/registration.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/fye-feedback.php",
    "https://www.kennesaw.edu/ccse/first-year-experience/policies.php"
]

files_found = []
for link in other_links:
    resp = requests.get(link)
    soup = BeautifulSoup(resp.content, "html.parser")
    for a in soup.find_all("a", href=True):
        href = a["href"]
        full_link = urljoin(link, href)
        if any(full_link.lower().endswith(ext) for ext in [".pdf", ".pptx", ".docx"]):
            files_found.append(full_link)

files_found = list(set(files_found))

# ✅ Print results
if files_found:
    print(f"Final files found across pages ({len(files_found)}):\n")
    for f in files_found:
        print(f)
else:
    print("No downloadable files found across links.")


✅ Final files found across pages (157):

https://www.kennesaw.edu/ccse/first-year-experience/cse1322l/assignments/cse-1322l-spring-2025-assignment-2-v1.1.pdf
https://www.kennesaw.edu/ccse/first-year-experience/cse1322/slides/m2.1_methods.pptx
https://www.kennesaw.edu/ccse/first-year-experience/cse1300/slides/cse1300-software_engg.pptx
https://www.kennesaw.edu/ccse/first-year-experience/cse1321_python/book/python_quick_tour.pdf
https://www.kennesaw.edu/ccse/first-year-experience/cse1321l_python/labs/m5-lab-9-sequence-type-part-2-v1.2.pdf
https://www.kennesaw.edu/ccse/first-year-experience/cse1322/schedule/cse1322_schedule_summer_v2.1.pdf
https://www.kennesaw.edu/ccse/first-year-experience/cse1322l/assignments/cse-1322l-spring-2025-assignment-1.pdf
https://www.kennesaw.edu/ccse/first-year-experience/cse1322/books/fundamentals-of-computer-programming-with-csharp-nakov-ebook-v2013.pdf
https://www.kennesaw.edu/ccse/first-year-experience/cse1300/slides/cse1300-m2.4-abstraction-v1.1.pptx
http

In [None]:

files_dir = "files"

# ---- Extract text ----
def extract_text(filepath):
    text = ""
    if filepath.endswith(".pdf"):
        with pdfplumber.open(filepath) as pdf:
            for page in pdf.pages:
                text += page.extract_text() or ""
    elif filepath.endswith(".pptx"):
        prs = Presentation(filepath)
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text += shape.text + " "
    elif filepath.endswith(".docx"):
        doc = Document(filepath)
        for para in doc.paragraphs:
            text += para.text + " "
    return text

# ---- Split into chunks (~300 words) ----
def chunk_text(text, chunk_size=300):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield " ".join(words[i : i + chunk_size])

# ---- Main ----
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embedding_dim = embedding_model.get_sentence_embedding_dimension()

documents = []
embeddings = []
files = [os.path.join(files_dir, f) for f in os.listdir(files_dir)]

for filepath in tqdm(files, desc="Processing files"):
    text = extract_text(filepath)
    if not text:
        continue
    for chunk in chunk_text(text, chunk_size=500):
        documents.append({"file": os.path.basename(filepath), "text": chunk})
        embeddings.append(embedding_model.encode(chunk))

# ---- Build FAISS Index ----
embeddings = np.vstack(embeddings).astype("float32")
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# ---- Save Index and Metadata ----
faiss.write_index(index, "fye_files.index")
import json
with open("fye_files_metadata.json", "w") as f:
    json.dump(documents, f)

print(f"Done! Created FAISS index with {len(embeddings)} embeddings.")


  from .autonotebook import tqdm as notebook_tqdm
Processing files:  57%|█████▋    | 94/166 [02:44<01:12,  1.01s/it]Cannot set gray non-stroke color because /'P2321' is an invalid float value
Cannot set gray non-stroke color because /'P2323' is an invalid float value
Cannot set gray non-stroke color because /'P2324' is an invalid float value
Cannot set gray non-stroke color because /'P2325' is an invalid float value
Cannot set gray non-stroke color because /'P2326' is an invalid float value
Cannot set gray non-stroke color because /'P2327' is an invalid float value
Cannot set gray non-stroke color because /'P2328' is an invalid float value
Cannot set gray non-stroke color because /'P2329' is an invalid float value
Cannot set gray non-stroke color because /'P2330' is an invalid float value
Cannot set gray non-stroke color because /'P2331' is an invalid float value
Cannot set gray non-stroke color because /'P2332' is an invalid float value
Cannot set gray non-stroke color because /'P2333

✅ Done! Created FAISS index with 2654 embeddings.


In [22]:

# Paths
index_file = "fye_files.index"
metadata_file = "fye_files_metadata.json"

# Load Index
index = faiss.read_index(index_file)

# Load Metadata
with open(metadata_file, "r") as f:
    documents = json.load(f)

# Load Model
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def search_query(query, top_k=5):
    """Search the index for a given query"""
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.asarray(query_embedding, dtype=np.float32), top_k)

    results = []
    for idx, dist in zip(indices[0], distances[0]): 
        results.append({
            "file": documents[idx]["file"],
            "text": documents[idx]["text"],
            "distance": float(dist),
        })
    return results

# ⚡️ TRY IT:
results = search_query("The student demonstrates a solid understanding of arrays and lists, as evidenced by their performance in two labs. However, they struggle with loops, indicating a need for improvement in iterating through data structures effectively. Additionally, the feedback suggests that the student may benefit from refining their error handling and code structure to enhance overall clarity and functionality.", top_k=5)

# DISPLAY RESULTS
for r in results:
    print(f"File: {r['file']} | Distance: {r['distance']:.4f}")
    print(f"Snippet: {r['text'][:300]}...")
    print("="*80)



File: cse1321_syllabus_spring24.docx | Distance: 0.6762
Snippet: CSE 1321/01 – Programming and Problem Solving I Spring 2024 SYLLABUS Course Description, Credit Hours, and Prerequisites CSE 1321: Programming and Problem Solving I 3 Class Hours 0 Laboratory Hours 3 Credit Hours (Concurrent Prerequisites: CSE 1321L and (MATH 1112 or MATH 1113 or MATH 1190 or CSE 13...
File: jjj-os-20170625.pdf | Distance: 0.7088
Snippet: on the sheet // Bound test do a problem // Loop body cross it off the assignment sheet // Updater (cid:10) (cid:9) It is possible that the assignment sheet contains no homework problems tobeginwith. Inthatcase,there’snoworkforthebodyofthelooptodo anditshouldbeskipped. SELF-STUDYEXERCISES EXERCISE 6....
File: m2-lab-5-flow-control-part-2-v1.4.pdf | Distance: 0.7491
Snippet: CSE 1321L: Programming and Problem Solving I Lab Lab 5 Flow Control (Part 2) What students will learn: o Using WHILE loops. o Using FOR loops. o Using Nested FOR loops. Content o Overview o Lab5A: Larg