In [13]:
import requests
import pandas as pd
import time

def _course_record(item):
    """Flatten one raw course element from the API into a dataset row.

    Missing or null ``domainTypes`` / ``certificates`` fall back to the
    defaults "General" and "Standard Course Certificate".
    """
    domains = item.get("domainTypes") or []
    if domains:
        # Prefer the more specific sub-domain, then the domain itself.
        category = domains[0].get("subdomainId") or domains[0].get("domainId") or "General"
    else:
        category = "General"

    certs = item.get("certificates") or []
    cert_str = ", ".join(map(str, certs)) if certs else "Standard Course Certificate"

    return {
        "id": item.get("id"),
        "title": item.get("name"),
        "description": item.get("description"),
        "level": item.get("level", "Not Specified"),
        "duration": item.get("workload", "Self-paced"),
        "category": category,
        "certificate_type": cert_str,
        "url": f"https://www.coursera.org/learn/{item.get('slug')}"
    }


def fetch_courses(limit_per_page=100, max_pages=5, timeout=10):
    """Download English-language courses from the Coursera courses endpoint.

    Pages are requested one after another until ``max_pages`` pages have been
    read, the API returns no elements, or the response carries no
    ``paging.next`` cursor.

    Args:
        limit_per_page: Value sent as the ``limit`` query parameter.
        max_pages: Maximum number of pages to request.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``description``,
        ``level``, ``duration``, ``category``, ``certificate_type`` and
        ``url``. On a network error, a non-200 status or an undecodable
        response, the courses collected so far are returned.
    """
    base_url = "https://api.coursera.org/api/courses.v1"
    fields = "name,description,slug,level,primaryLanguages,workload,domainTypes,certificates"

    all_courses = []
    start = 0
    page_count = 0

    print("Starting fetch process...")

    while page_count < max_pages:
        params = {
            "start": start,
            "limit": limit_per_page,
            "fields": fields
        }

        try:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(base_url, params=params, timeout=timeout)
            if response.status_code != 200:
                print(f"Error: {response.status_code}")
                break
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(f"Exception: {e}")
            break

        elements = data.get('elements', [])
        if not elements:
            print("No more data available.")
            break

        # ``or []`` keeps a null language list from aborting the whole fetch.
        kept = [
            _course_record(item)
            for item in elements
            if 'en' in (item.get("primaryLanguages") or [])
        ]
        all_courses.extend(kept)
        print(f"Page {page_count + 1}: Kept {len(kept)} courses.")

        next_start = (data.get('paging') or {}).get('next')
        if next_start is None:
            break
        try:
            start = int(next_start)
        except (TypeError, ValueError) as e:
            print(f"Exception: {e}")
            break
        page_count += 1

        # Pause between requests only when another request will follow.
        if page_count < max_pages:
            time.sleep(1)

    return all_courses

def save_to_csv(courses, filename="coursera_dataset.csv"):
    """Persist the fetched course rows as a CSV file.

    Rows without a description are dropped, and the ``category`` column
    (when present) has its hyphens replaced by spaces and is title-cased.
    Nothing is written when ``courses`` is empty.

    Args:
        courses: List of course dicts.
        filename: Destination path of the CSV file.
    """
    if courses:
        table = pd.DataFrame(courses).dropna(subset=['description'])

        if 'category' in table.columns:
            readable = table['category'].str.replace('-', ' ').str.title()
            table = table.assign(category=readable)

        # utf-8-sig writes a byte-order mark at the start of the file.
        table.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Saved {len(table)} courses to {filename}")
    else:
        print("No data to save.")

if __name__ == "__main__":
    # Fetch at most 5 pages of English-language courses, then write them to
    # the default CSV file (coursera_dataset.csv).
    courses = fetch_courses(max_pages=5)
    save_to_csv(courses)

Starting fetch process...
Page 1: Kept 79 courses.
Page 2: Kept 76 courses.
Page 3: Kept 70 courses.
Page 4: Kept 78 courses.
Page 5: Kept 86 courses.
Saved 389 courses to coursera_dataset.csv


In [None]:
import pandas as pd

# 1. Read the CSV file
df = pd.read_csv("coursera_dataset.csv")

pd.set_option('display.max_colwidth', 50)  # Show at most 50 characters per column so long text does not overflow the screen
df

In [None]:
pip install langchain-chroma langchain-huggingface chromadb

In [None]:
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
import os

def build_database(csv_path="coursera_dataset.csv", db_path="./vector_store"):
    """Embed every course in the CSV and persist the vectors with Chroma.

    Each document is stored under its course id, so running the build again
    against the same ``db_path`` updates the existing entries instead of
    appending a second copy of every course.

    Args:
        csv_path: CSV file with the columns ``id``, ``title``, ``category``,
            ``level``, ``description``, ``url``, ``duration`` and
            ``certificate_type``.
        db_path: Directory in which the vector store is persisted.

    Returns:
        None. Prints a message and returns early when ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        print(f"File not found: {csv_path}")
        return

    # Stable ids must be unique, so keep a single row per course id.
    df = pd.read_csv(csv_path).drop_duplicates(subset="id")

    # Free HuggingFace model: runs on CPU and needs no API key.
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    documents = []
    ids = []
    print(f"Processing {len(df)} courses...")

    for _, row in df.iterrows():
        # Text that gets embedded and is matched against search queries.
        content = f"""
        Title: {row['title']}
        Category: {row['category']}
        Level: {row['level']}
        Description: {row['description']}
        """

        # Stored alongside the vector for display; not part of the embedded text.
        metadata = {
            "id": str(row['id']),
            "title": row['title'],
            "url": row['url'],
            "duration": str(row['duration']),
            "certificate": str(row['certificate_type']),
            "level": str(row['level'])
        }

        documents.append(Document(page_content=content.strip(), metadata=metadata))
        ids.append(metadata["id"])

    print("Creating Vector Database...")

    # Create the store and persist it to the folder; explicit ids make
    # repeated builds idempotent.
    Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        ids=ids,
        persist_directory=db_path
    )

    print(f"Database saved to {db_path}")

def test_search(query, db_path="./vector_store"):
    print(f"\nTesting Search: '{query}'")
    
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = Chroma(persist_directory=db_path, embedding_function=embedding_model)
    
    # ค้นหา 3 อันดับแรกที่ใกล้เคียงที่สุด
    results = db.similarity_search(query, k=3)
    
    for i, doc in enumerate(results):
        print(f"{i+1}. {doc.metadata['title']} ({doc.metadata['level']})")
        print(f"   Link: {doc.metadata['url']}")
        print("-" * 20)

if __name__ == "__main__":
    # 1. Build the vector database from the default CSV
    build_database()
    
    # 2. Try a sample search against it
    test_search("I want to learn about Python for Data Science")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Processing 389 courses...
Creating Vector Database...
Database saved to ./vector_store

Testing Search: 'I want to learn about Python for Data Science'
1. Python Programming Fundamentals (BEGINNER)
   Link: https://www.coursera.org/learn/microsoft-python-programming-fundamentals
--------------------
2. Intermediate Python – Libraries, Tools & Practical Projects (INTERMEDIATE)
   Link: https://www.coursera.org/learn/packt-intermediate-python-libraries-tools-and-practical-projects-0d9as
--------------------
3. Data Visualization and Modeling in Python (INTERMEDIATE)
   Link: https://www.coursera.org/learn/python-data-modeling
--------------------


In [4]:
# Narrow-topic query: test_search asks for k=3 hits, so lower-ranked
# results may be only loosely related to the query.
test_search("i want to learn TailwindCSS")


Testing Search: 'i want to learn TailwindCSS'
1. Tailwind CSS Practice Project: Build a Product Card (INTERMEDIATE)
   Link: https://www.coursera.org/learn/build-a-product-card-with-tailwind-css
--------------------
2. Create a Dark Moody Atmospheric 2D Game with Unity and C# (BEGINNER)
   Link: https://www.coursera.org/learn/packt-create-a-dark-moody-atmospheric-2d-game-with-unity-and-c-2qqm9
--------------------
3. Unix System Overview and Command (INTERMEDIATE)
   Link: https://www.coursera.org/learn/unix-system-overview-and-command
--------------------


In [None]:
import os


def find_project_root(start_dir, marker='data'):
    """Walk upward from ``start_dir`` to the first directory containing ``marker``.

    Args:
        start_dir: Directory at which the search starts.
        marker: Name of the sub-directory that identifies the project root.

    Returns:
        The absolute path of the first directory, starting at ``start_dir``
        and moving towards the filesystem root, that has a ``marker``
        sub-directory, or ``None`` when no such directory exists.
    """
    candidate = os.path.abspath(start_dir)
    while True:
        if os.path.isdir(os.path.join(candidate, marker)):
            return candidate

        parent_dir = os.path.dirname(candidate)
        if parent_dir == candidate:
            # Reached the filesystem root: stop instead of looping forever.
            return None
        candidate = parent_dir


try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # __file__ is undefined inside a notebook; fall back to the working directory.
    current_path = os.getcwd()

project_root = find_project_root(current_path)

if project_root is None:
    print("❌ Error: Could not find 'data' folder.")
    print("Please check if 'data' folder exists inside your project.")
    # Leave the paths unset so later code cannot silently use a wrong location.
    csv_path = None
    db_path = None
else:
    print(f"✅ Found project root at: {project_root}")
    # Build the dataset and vector-store paths from the discovered root.
    csv_path = os.path.join(project_root, 'data', 'coursera_dataset.csv')
    db_path = os.path.join(project_root, 'vector_store')

Project Root: c:\SUPERPROJECT\modules
Looking for CSV at: c:\SUPERPROJECT\modules\data\coursera_dataset.csv
Error: File not found at c:\SUPERPROJECT\modules\data\coursera_dataset.csv
