In [None]:
import os
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# Download every paginated results page from the TNEA cutoff site and store
# the raw HTML of each one as tnea_pages/page_<n>.html.

# Create folder to save HTML files
output_folder = "tnea_pages"
os.makedirs(output_folder, exist_ok=True)

# Number of result pages to walk through (pages are numbered 1..TOTAL_PAGES).
TOTAL_PAGES = 174

driver = webdriver.Firefox()
pages_saved = 0

# Everything that uses the browser lives inside try/finally so the Firefox
# process is shut down even if an element lookup or a file write raises.
try:
    driver.get("https://cutoff.tneaonline.org/")

    # Wait for the page and reCAPTCHA iframe to load
    time.sleep(5)

    # The checkbox lives inside the reCAPTCHA iframe, so switch into it first.
    iframe = driver.find_element(By.CSS_SELECTOR, "iframe[src*='recaptcha']")
    driver.switch_to.frame(iframe)
    checkbox = driver.find_element(By.ID, "recaptcha-anchor")
    checkbox.click()

    # Switch back to main content
    driver.switch_to.default_content()

    # NOTE(review): presumably this pause also leaves time to solve a reCAPTCHA
    # challenge by hand if one is shown - confirm.
    time.sleep(5)

    # Click the submit button
    submit_button = driver.find_element(
        By.CSS_SELECTOR, "input[type='submit'][value='Proceed']"
    )
    submit_button.click()

    # Wait for the results page to load
    time.sleep(10)  # Increase if loading is slow

    for page_number in range(1, TOTAL_PAGES + 1):
        print(f"Processing page {page_number}...")

        # Find the pagination link for the page number
        try:
            link = driver.find_element(By.XPATH, f"//ul[contains(@class, 'pagination')]//a[text()='{page_number}']")
            link.click()
        except Exception as e:
            # Best effort: report the failure and stop paging; pages already
            # written to disk are kept.
            print(f"Could not find or click page {page_number}: {e}")
            break

        # Wait for the page content to update
        time.sleep(5)

        # Save the page source
        html = driver.page_source
        filename = os.path.join(output_folder, f"page_{page_number}.html")
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)
        pages_saved += 1
finally:
    driver.quit()

# Report what actually happened instead of always claiming success.
if pages_saved == TOTAL_PAGES:
    print("All pages saved.")
else:
    print(f"Stopped early: saved {pages_saved} of {TOTAL_PAGES} pages.")


In [1]:
import os
from bs4 import BeautifulSoup
import pandas as pd

# Parse every saved results page in tnea_pages/ and combine the table rows
# into one DataFrame, then write it out as both CSV and JSON.

# Path to the folder containing all HTML files
folder_path = "tnea_pages"


def _page_sort_key(name):
    """Sort key that orders files by the digits in their name (page_2 before page_10).

    All digits in the filename are concatenated and read as one integer;
    names without any digit sort last. The name itself breaks ties.
    """
    digits = "".join(ch for ch in name if ch.isdigit())
    return (int(digits) if digits else float("inf"), name)


# os.listdir() returns entries in arbitrary order, so select the HTML files
# once and sort them; this keeps the output row order reproducible.
html_files = sorted(
    (name for name in os.listdir(folder_path) if name.endswith(".html")),
    key=_page_sort_key,
)

# List to store all rows
all_rows = []

for filename in html_files:
    filepath = os.path.join(folder_path, filename)
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # A multi-class string passed to class_ only matches a class attribute
    # that is exactly this string (same classes, same order).
    table = soup.find("table", class_="table table-striped table-bordered table-hover table-condensed")
    if table is None:
        continue

    tbody = table.find("tbody")
    if tbody is None:
        continue

    for row in tbody.find_all("tr"):
        # Normalise non-breaking spaces to plain spaces in every cell.
        cells = [td.get_text(strip=True).replace('\xa0', ' ') for td in row.find_all("td")]
        if cells:  # rows without any <td> are skipped
            all_rows.append(cells)

# Column headers (based on the HTML structure you showed)
columns = [
    "College Code", "College Name", "Branch Code", "Branch Name",
    "OC", "BC", "BCM", "MBC", "MBCDNC", "MBCV", "SC", "SCA", "ST"
]

# Create DataFrame
df = pd.DataFrame(all_rows, columns=columns)

# Save the same data in two formats: CSV and JSON (one record per row).
df.to_csv("tnea_data.csv", index=False)
df.to_json("tnea_data.json", orient="records", indent=2)

# Report the number of HTML files actually parsed, not every directory entry.
print(f"Scraped {len(df)} rows from {len(html_files)} HTML files and saved to 'tnea_data.json'.")


Scraped 3474 rows from 174 HTML files and saved to 'tnea_data.json'.
