<a href="https://colab.research.google.com/github/sign2speak/mainRepository/blob/main/webScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install playwright beautifulsoup4 requests
!playwright install chromium



In [None]:
# System shared libraries installed via apt — presumably the ones Playwright's
# headless Chromium needs on a Colab VM (confirm against Playwright's docs).
!apt-get update -qq
!apt-get install -y \
  libatk-bridge2.0-0 \
  libatk1.0-0 \
  libcups2 \
  libdrm2 \
  libxkbcommon0 \
  libxcomposite1 \
  libxdamage1 \
  libxrandr2 \
  libgbm1 \
  libasound2

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libasound2 is already the newest version (1.2.6.1-1ubuntu1).
libasound2 set to manually installed.
libxdamage1 is already the newest version (1:1.1.5-2build2).
libxdamage1 set to manually installed.
libxkbcommon0 is already the newest version (1.4.0-1).
libxkbcommon0 set to manually installed.
libxrandr2 is already the newest version (2:1.5.2-1build1).
libxrandr2 set to manually installed.
libcups2 is already the newest version (2.4.1op1-1ubuntu4.16).
libcups2 set to manually installed.
libdrm2 is already the newest version (2.4.113-2~ubuntu0.22.04.1).
libdrm2 set to manually installed.
libgbm1 is already the newest version (23.2.1-1ubuntu3.1~22.04.3).
libgbm1 set to manually installed.
The following addi

In [None]:
# NOTE(review): repeats the `playwright install chromium` from the first cell;
# presumably re-run after the system libraries were installed — confirm it is still needed.
!playwright install chromium


In [None]:
import os
import re
import time
import asyncio
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

# =========================
# CONFIG
# =========================
# Site root; prepended to relative hrefs collected from the category page.
BASE_URL = "https://psl.org.pk"
# Listing page whose entries are scraped.
CATEGORY_URL = "https://psl.org.pk/dictionary/65-sentences"
# Local folder that receives the downloaded .mp4 files.
# NOTE(review): the later "organize" cell reads from
# /content/drive/MyDrive/PSL_VIDEOS, not from this folder — confirm how the
# videos are expected to get from here to Drive.
SAVE_DIR = "/content/PSL_VIDEOS"

# HTTP headers sent with every video download request.
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Create the download folder up front (no error if it already exists).
os.makedirs(SAVE_DIR, exist_ok=True)

# =========================
# UTILS
# =========================
def clean_filename(text: str) -> str:
    """Turn free text into a lowercase, underscore-separated file stem.

    Characters that are neither word characters nor whitespace are dropped,
    every run of whitespace becomes a single underscore, and leading/trailing
    underscores are trimmed.
    """
    without_punct = re.sub(r"[^\w\s]", "", text.lower())
    return re.sub(r"\s+", "_", without_punct).strip("_")

def filename_from_detail_url(detail_url: str) -> str:
    """
    Derive the .mp4 file name for a dictionary entry from its detail-page URL.

    https://psl.org.pk/dictionary/65-sentences/4265-are-you-deaf?_rsc=abc
    -> are_you_deaf.mp4
    """
    # Work on the URL path only, so any query string is ignored.
    url_path = urlparse(detail_url).path
    slug = url_path.rstrip("/").rsplit("/", 1)[-1]   # 4265-are-you-deaf
    slug = re.sub(r"^\d+-", "", slug)                # drop the leading numeric id
    # Hyphens must become underscores first: clean_filename would otherwise
    # strip them as punctuation and glue the words together.
    stem = clean_filename(slug.replace("-", "_"))
    return f"{stem}.mp4"

# =========================
# MAIN ASYNC PIPELINE
# =========================
def _download_video(video_url: str, save_path: str, timeout: float = 60.0) -> None:
    """Stream ``video_url`` to ``save_path`` without leaving a partial file behind.

    The response body is written to ``save_path + ".part"`` and renamed only
    once the transfer has completed, so an interrupted download can never be
    mistaken for a finished video by the "already exists" check in ``main``.

    Args:
        video_url: Absolute URL of the video file.
        save_path: Final destination path of the .mp4 file.
        timeout: Connect/read timeout in seconds passed to ``requests.get``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an HTTP
            error status (via ``raise_for_status``).
        OSError: if the file cannot be written or renamed.
    """
    part_path = save_path + ".part"
    try:
        with requests.get(video_url, stream=True, headers=HEADERS, timeout=timeout) as r:
            r.raise_for_status()  # never store an HTML error page as an .mp4
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    if chunk:
                        f.write(chunk)
        os.replace(part_path, save_path)
    except BaseException:
        # Clean up the incomplete file, then let the caller see the error.
        if os.path.exists(part_path):
            os.remove(part_path)
        raise


async def main():
    """Scrape every entry linked from CATEGORY_URL and download its video to SAVE_DIR.

    Step 1 loads the category page in headless Chromium and collects all
    anchors whose href contains ``/dictionary/65-sentences/``.
    Step 2 visits each detail page, reads the first ``<source src=...>``
    element from the rendered HTML and downloads that file.

    Entries whose target file already exists are skipped before any page is
    loaded. Per-entry errors are printed and counted; they do not abort the run.
    """
    # Local import: only `urlparse` is imported at file level.
    from urllib.parse import urljoin

    print("üöÄ Launching Playwright (async)")

    saved = skipped = failed = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            # -------------------------
            # STEP 1: COLLECT DETAIL LINKS
            # -------------------------
            print("üìÑ Loading category page‚Ä¶")
            await page.goto(CATEGORY_URL, timeout=60000)
            await page.wait_for_timeout(8000)  # wait for JS cards

            links = set()
            for a in await page.query_selector_all("a"):
                href = await a.get_attribute("href")
                if href and "/dictionary/65-sentences/" in href:
                    if href.startswith("/"):
                        href = BASE_URL + href
                    links.add(href)

            print(f"‚úÖ Found {len(links)} dictionary entries")

            # -------------------------
            # STEP 2: PROCESS EACH DETAIL PAGE
            # -------------------------
            for idx, detail_url in enumerate(sorted(links), start=1):
                print(f"\n[{idx}/{len(links)}] Processing: {detail_url}")

                try:
                    filename = filename_from_detail_url(detail_url)
                    save_path = os.path.join(SAVE_DIR, filename)

                    # Check before navigating: re-runs skip finished entries
                    # without paying for a page load.
                    if os.path.exists(save_path):
                        print("  ‚è≠Ô∏è Already exists:", filename)
                        skipped += 1
                        continue

                    await page.goto(detail_url, timeout=60000)
                    await page.wait_for_timeout(4000)  # wait for video hydration

                    soup = BeautifulSoup(await page.content(), "html.parser")

                    # Extract JS-injected video source
                    source = soup.find("source", src=True)
                    if not source:
                        print("  ‚ùå No video source found")
                        failed += 1
                        continue

                    # Resolve relative src values against the page URL;
                    # absolute URLs pass through unchanged.
                    video_url = urljoin(detail_url, source["src"])
                    print("  ‚¨áÔ∏è Downloading:", filename)

                    # requests is blocking; run it in a worker thread so the
                    # event loop (and Playwright's connection) stays responsive.
                    await asyncio.get_running_loop().run_in_executor(
                        None, _download_video, video_url, save_path
                    )

                    print("  ‚úÖ Saved:", filename)
                    saved += 1
                    await asyncio.sleep(0.4)  # be polite to the server

                except Exception as e:
                    failed += 1
                    print("  ‚ùå Error:", e)
        finally:
            # Close the browser even if the category page failed to load.
            await browser.close()

    if failed:
        print(f"\nFinished with {failed} failure(s): {saved} saved, {skipped} skipped")
    else:
        print("\nüéâ ALL VIDEOS DOWNLOADED SUCCESSFULLY")

# =========================
# RUN (Colab-safe)
# =========================
# Top-level `await` relies on the notebook (IPython/Colab) already running an
# event loop for the cell; in a plain .py script this line is a syntax error
# and `asyncio.run(main())` would be used instead.
await main()


üöÄ Launching Playwright (async)
üìÑ Loading category page‚Ä¶
‚úÖ Found 221 dictionary entries

[1/221] Processing: https://psl.org.pk/dictionary/65-sentences/4265-are-you-deaf
  ‚¨áÔ∏è Downloading: are_you_deaf.mp4
  ‚úÖ Saved: are_you_deaf.mp4

[2/221] Processing: https://psl.org.pk/dictionary/65-sentences/4266-are-you-hungry
  ‚¨áÔ∏è Downloading: are_you_hungry.mp4
  ‚úÖ Saved: are_you_hungry.mp4

[3/221] Processing: https://psl.org.pk/dictionary/65-sentences/4267-are-you-ready
  ‚¨áÔ∏è Downloading: are_you_ready.mp4
  ‚úÖ Saved: are_you_ready.mp4

[4/221] Processing: https://psl.org.pk/dictionary/65-sentences/4268-assalam-o-alaikum
  ‚¨áÔ∏è Downloading: assalam_o_alaikum.mp4
  ‚úÖ Saved: assalam_o_alaikum.mp4

[5/221] Processing: https://psl.org.pk/dictionary/65-sentences/4269-beware
  ‚¨áÔ∏è Downloading: beware.mp4
  ‚úÖ Saved: beware.mp4

[6/221] Processing: https://psl.org.pk/dictionary/65-sentences/4270-call-the-ambulance
  ‚¨áÔ∏è Downloading: call_the_ambulance.mp4
  ‚úÖ Sav

---
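## Embeddings

Organize the downloaded videos into one folder per sign, then compute one MobileNetV2 embedding per sign.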

In [None]:
import os
import shutil

# Move every video in SOURCE_DIR into its own folder, TARGET_DIR/<SIGN_NAME>/.
# NOTE(review): the scraper above saves to /content/PSL_VIDEOS, while this cell
# reads from Google Drive (which must already be mounted) — confirm how the
# videos are copied to SOURCE_DIR.
SOURCE_DIR = "/content/drive/MyDrive/PSL_VIDEOS"
TARGET_DIR = "/content/drive/MyDrive/FRAMES_UPDATED"

os.makedirs(TARGET_DIR, exist_ok=True)

# sorted() makes the processing order reproducible across runs.
for file in sorted(os.listdir(SOURCE_DIR)):
    src_path = os.path.join(SOURCE_DIR, file)

    # Match the extension case-insensitively (as the embedding loop further
    # down does) and ignore anything that is not a regular file.
    if not (file.lower().endswith(".mp4") and os.path.isfile(src_path)):
        continue

    sign_name = os.path.splitext(file)[0].upper()  # hello.mp4 -> HELLO

    sign_folder = os.path.join(TARGET_DIR, sign_name)
    os.makedirs(sign_folder, exist_ok=True)

    shutil.move(src_path, os.path.join(sign_folder, file))

print("‚úÖ Videos organized into sign folders")


‚úÖ Videos organized into sign folders


In [None]:
# %%bash

# CLIPS_DIR="/content/drive/MyDrive/PSL_VIDEOS"
# FRAMES_DIR="/content/drive/MyDrive/FRAMES_FROM_START"

# mkdir -p "$FRAMES_DIR"

# for class_dir in "$CLIPS_DIR"/*; do
#   [ -d "$class_dir" ] || continue

#   class_name=$(basename "$class_dir")
#   echo "üìÅ Processing class: $class_name"

#   mkdir -p "$FRAMES_DIR/$class_name"

#   for video in "$class_dir"/*.mp4; do
#     video_name=$(basename "$video" .mp4)
#     echo "  üé¨ Extracting from start: $video_name.mp4"

#     ffmpeg -y -loglevel error -i "$video" \
#       "$FRAMES_DIR/$class_name/${video_name}_%05d.jpg"
#   done
# done

# echo "‚úÖ Frame extraction from start complete"

In [None]:
import os
import cv2
import numpy as np
import tensorflow as tf

In [None]:
# Root folder scanned for per-sign sub-folders containing .mp4 files.
DATASET_DIR = "/content/drive/MyDrive/FRAMES_UPDATED"
# Output folder: one <SIGN_NAME>.npy embedding file per sign.
EMBEDDINGS_DIR = "/content/drive/MyDrive/EMBEDDINGS_2.0"

# Frames are resized to IMAGE_WIDTH x IMAGE_HEIGHT; the same size is used as
# the MobileNetV2 input shape.
IMAGE_HEIGHT = 128
IMAGE_WIDTH = 128
# Number of frames sampled from each video.
SEQUENCE_LENGTH = 20

# Create the output folder up front (no error if it already exists).
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

In [None]:
def extract_frames(video_path):
    """Sample up to SEQUENCE_LENGTH evenly spaced frames from a video.

    Frames are taken at positions ``0, skip, 2*skip, ...`` where
    ``skip = max(total_frames // SEQUENCE_LENGTH, 1)``. Each frame is resized
    to IMAGE_WIDTH x IMAGE_HEIGHT and divided by 255.0.

    Args:
        video_path: Path of the video file to read.

    Returns:
        A NumPy array stacking the sampled frames along axis 0. It holds fewer
        than SEQUENCE_LENGTH frames when reading stops early, and is empty
        (``shape[0] == 0``) when the video cannot be opened or read.

    NOTE(review): OpenCV decodes frames in BGR channel order and the values
    here end up in [0, 1]; Keras documents ``mobilenet_v2.preprocess_input``
    (RGB input scaled to [-1, 1]) for the ImageNet weights. The preprocessing
    is left unchanged because previously saved embeddings and any downstream
    consumer presumably rely on it — confirm before changing.
    """
    frames = []
    cap = cv2.VideoCapture(video_path)

    # try/finally: release the capture handle even if resize/read raises.
    try:
        if not cap.isOpened():
            return np.array(frames)

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        skip = max(total_frames // SEQUENCE_LENGTH, 1)

        for i in range(SEQUENCE_LENGTH):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * skip)
            ret, frame = cap.read()
            if not ret:
                # Ran past the last decodable frame; the caller pads short clips.
                break

            frame = cv2.resize(frame, (IMAGE_WIDTH, IMAGE_HEIGHT))
            frames.append(frame / 255.0)
    finally:
        cap.release()

    return np.array(frames)


In [None]:
# MobileNetV2 without its classification head, used as a per-frame feature
# extractor. With pooling="avg" the model outputs one feature vector per input
# image (the saved embeddings in the run output have shape (1280,)).
base_model = tf.keras.applications.MobileNetV2(
    include_top=False,
    weights="imagenet",
    input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, 3),
    pooling="avg"
)

# Mark the weights as non-trainable; the model is only used via predict().
base_model.trainable = False


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5
[1m9406464/9406464[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1s[0m 0us/step


In [None]:
def video_to_embedding(video_path):
    """Compute a single embedding vector for one video.

    The frames returned by ``extract_frames`` are padded with copies of the
    last frame up to SEQUENCE_LENGTH, passed through ``base_model`` and the
    per-frame feature vectors are averaged over the frame axis.

    Raises:
        ValueError: if no frame could be extracted from the video.
    """
    clip = extract_frames(video_path)
    n_frames = clip.shape[0]

    if n_frames == 0:
        raise ValueError("No frames extracted")

    # Short video: repeat the final frame until the clip has SEQUENCE_LENGTH frames.
    missing = SEQUENCE_LENGTH - n_frames
    if missing > 0:
        filler = np.repeat(clip[-1:], missing, axis=0)
        clip = np.concatenate([clip, filler], axis=0)

    # One feature vector per frame.
    per_frame_features = base_model.predict(clip, verbose=0)

    # Collapse the time axis into a single vector for the whole video.
    return np.mean(per_frame_features, axis=0)

In [None]:
# Compute and save one embedding per sign folder under DATASET_DIR.
# Both listings are sorted: os.listdir() returns entries in arbitrary order,
# so without sorting the video picked via videos[0] could differ between runs.
for sign_name in sorted(os.listdir(DATASET_DIR)):
    sign_path = os.path.join(DATASET_DIR, sign_name)

    if not os.path.isdir(sign_path):
        continue

    print(f"üîπ Processing sign: {sign_name}")

    videos = sorted(
        f for f in os.listdir(sign_path)
        if f.lower().endswith(".mp4")
    )

    if not videos:
        print(f"‚ö†Ô∏è No video found for {sign_name}")
        continue

    # Only the first video (alphabetically) of each sign is embedded; any
    # additional videos in the folder are ignored.
    video_path = os.path.join(sign_path, videos[0])

    try:
        embedding = video_to_embedding(video_path)

        np.save(
            os.path.join(EMBEDDINGS_DIR, f"{sign_name}.npy"),
            embedding
        )

        print(f"‚úÖ Saved embedding for {sign_name} | shape: {embedding.shape}")

    except Exception as e:
        # Best effort: report the failure and continue with the next sign.
        print(f"‚ùå Failed for {sign_name}: {e}")


üîπ Processing sign: PSL_DICTIONARY
‚úÖ Saved embedding for PSL_DICTIONARY | shape: (1280,)
üîπ Processing sign: ARE_YOU_DEAF
‚úÖ Saved embedding for ARE_YOU_DEAF | shape: (1280,)
üîπ Processing sign: ARE_YOU_HUNGRY
‚úÖ Saved embedding for ARE_YOU_HUNGRY | shape: (1280,)
üîπ Processing sign: ARE_YOU_READY
‚úÖ Saved embedding for ARE_YOU_READY | shape: (1280,)
üîπ Processing sign: ASSALAM_O_ALAIKUM
‚úÖ Saved embedding for ASSALAM_O_ALAIKUM | shape: (1280,)
üîπ Processing sign: BEWARE
‚úÖ Saved embedding for BEWARE | shape: (1280,)
üîπ Processing sign: CALL_THE_AMBULANCE
‚úÖ Saved embedding for CALL_THE_AMBULANCE | shape: (1280,)
üîπ Processing sign: CAN_I_HELP_YOU
‚úÖ Saved embedding for CAN_I_HELP_YOU | shape: (1280,)
üîπ Processing sign: CAN_I_TAKE_YOUR_ORDER
‚úÖ Saved embedding for CAN_I_TAKE_YOUR_ORDER | shape: (1280,)
üîπ Processing sign: CAN_I_USE_THE_TOILET
‚úÖ Saved embedding for CAN_I_USE_THE_TOILET | shape: (1280,)
üîπ Processing sign: CLOSE_THE_DOOR
‚úÖ Saved embed