In [11]:
from bs4 import BeautifulSoup
import requests
from datetime import *
import pandas as pd
# HTTP headers sent with scraping requests: a desktop browser User-Agent
# (Edge 87 on Windows 10). The literal is split across lines for readability;
# adjacent string literals are concatenated into one value.
head = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75"
    )
}

In [14]:
# Collect recent job listings from the paginated jobsnet.in index.
#
# Walks /page/1/, /page/2/, ... and records the title, listing URL and posted
# date of every <article>. Pagination stops at the first article older than
# the cutoff, at the first page with no articles (or a 404), or at MAX_PAGES.
# Result: `jobs`, a list of dicts with keys "title", "listing_url" and,
# when a parseable <time> tag is present, "posted_date" (a datetime.date).
MAX_AGE_DAYS = 30   # only keep listings posted within this many days
MAX_PAGES = 200     # safety bound so a misbehaving site cannot loop forever
DATE_FORMAT = "%B %d, %Y"

jobs = []
today = datetime.today().date()
cutoff_date = today - timedelta(days=MAX_AGE_DAYS)
page_num = 1
stop_pagination = False
while not stop_pagination:
    if page_num > MAX_PAGES:
        print(f"Stopping: reached the {MAX_PAGES}-page safety limit")
        break
    page_url = f"https://jobsnet.in/page/{page_num}/"
    print(f"Fetching: {page_url}") # Checking
    response = requests.get(page_url, headers=head, timeout=10)
    if response.status_code == 404:
        # Requesting a page past the last one: treat as the normal end.
        break
    # Any other HTTP error (403, 5xx, ...) would otherwise look like
    # "no more articles" and silently truncate the results.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    articles = soup.find_all("article")
    if not articles:
        break
    for article in articles:
        job = {}
        title_tag = article.find("h3", class_="entry-title")
        if title_tag:
            a_tag = title_tag.find("a", href=True)
            if a_tag:
                job["title"] = a_tag.text.strip()
                job["listing_url"] = a_tag["href"]
        time_tag = article.find("time")
        if time_tag:
            raw_date = time_tag.text.strip()
            try:
                job_date = datetime.strptime(raw_date, DATE_FORMAT).date()
            except ValueError:
                # One oddly formatted date must not abort the whole crawl;
                # the job is kept, just without a posted_date.
                print(f"Unparseable date {raw_date!r} on {page_url}")
                job_date = None
            if job_date is not None:
                job["posted_date"] = job_date
                if job_date < cutoff_date:
                    # First listing older than the cutoff: drop it and
                    # stop paginating.
                    stop_pagination = True
                    break
        # Articles without a title link cannot be followed up, so skip them.
        if job.get("listing_url"):
            jobs.append(job)
    page_num += 1
for job in jobs:
    print(job)

Fetching: https://jobsnet.in/page/1/
Fetching: https://jobsnet.in/page/2/
Fetching: https://jobsnet.in/page/3/
{'title': 'Unisys Off Campus Drive 2026 | Associate Cybersecurity  Engineer | 0-2 Years', 'listing_url': 'https://jobsnet.in/unisys-off-campus-drive-associate-cybersecurity-engineer/', 'posted_date': datetime.date(2026, 1, 26)}
{'title': 'Equifax Off Campus Drive 2026 – Hiring Technology Engineering Trainee', 'listing_url': 'https://jobsnet.in/equifax-off-campus-drive-hiring-technology-engineering-trainee/', 'posted_date': datetime.date(2026, 1, 26)}
{'title': 'Oracle Hiring Entry-level Associate Test Engineers 0-2 Years Experience', 'listing_url': 'https://jobsnet.in/oracle-hiring-entry-level-associate-test-engineers/', 'posted_date': datetime.date(2026, 1, 26)}
{'title': 'HSBC Off Campus Drive 2026 | Trainee Apprentice – IT Auditor', 'listing_url': 'https://jobsnet.in/hsbc-off-campus-drive-trainee-apprentice-it-auditor/', 'posted_date': datetime.date(2026, 1, 26)}
{'title': 

In [None]:
import requests
from bs4 import BeautifulSoup
import time
# Visit every listing page and enrich the job with details scraped from it.
#
# For each entry of `jobs` a copy is extended with, when found:
#   "apply_urls"  - de-duplicated hrefs of anchors whose text contains
#                   "apply here", "click here" or "apply now"
#   "location"    - text after the first ":" of a <p> mentioning "location"
#   "experience"  - text after the first ":" of a <p> mentioning "experience"
#   "description" - bullet texts (wp-block-list items and bullet-like <p>s)
#                   joined with spaces, duplicates removed
# Jobs whose page cannot be fetched or parsed are reported and left out of
# `job_details`.
job_details = []
for job in jobs:
    job_info = job.copy()
    try:
        # Same headers and timeout as the listing crawl: without a timeout a
        # stalled connection would hang the cell indefinitely.
        response = requests.get(job_info["listing_url"], headers=head, timeout=10)
        # Do not parse an error page as if it were a job posting.
        response.raise_for_status()
        job_page = BeautifulSoup(response.content, "html.parser")

        # A dict is used as an insertion-ordered set so the URL order follows
        # the page and is the same on every run.
        apply_urls = {}
        for a in job_page.find_all("a", href=True):
            anchor_text = a.get_text(strip=True).lower()
            if any(word in anchor_text for word in ("apply here", "click here", "apply now")):
                apply_urls[a["href"]] = None
        if apply_urls:
            job_info["apply_urls"] = list(apply_urls)

        # Paragraphs are needed twice below; look them up once.
        paragraphs = job_page.find_all("p")
        for p in paragraphs:
            text_lower = p.get_text(strip=True).lower()
            # When several paragraphs match, the last one wins.
            if "location" in text_lower and ":" in p.text:
                job_info["location"] = p.text.split(":", 1)[-1].strip()
            elif "experience" in text_lower and ":" in p.text:
                job_info["experience"] = p.text.split(":", 1)[-1].strip()

        descriptions = []
        for ul in job_page.find_all("ul", class_="wp-block-list"):
            for li in ul.find_all("li"):
                txt = li.get_text(strip=True)
                if txt:
                    descriptions.append(txt)
        for p in paragraphs:
            txt = p.get_text(strip=True)
            if txt.startswith(("•", "-", "–", "*")):
                descriptions.append(txt.lstrip("•-–* ").strip())
        if descriptions:
            # dict.fromkeys drops repeated bullets while keeping their order.
            job_info["description"] = " ".join(dict.fromkeys(descriptions))
        job_details.append(job_info)
    except Exception as e:
        # Best effort: report the failing listing and continue with the rest.
        print(f"Error: {job_info['listing_url']} → {e}")

In [None]:
# Filter out unwanted apply links and flatten to one row per apply URL.
#
# Input:  `job_details` - list of job dicts, each optionally carrying an
#         "apply_urls" list. The input dicts are NOT modified, so this cell
#         can be re-run without changing upstream state.
# Output: `normalized_jobs` / `cleaned_jobs` - list of job dicts where
#         "apply_urls" is replaced by a single "apply_url" key; a job with no
#         remaining URL yields one row with apply_url=None.
BLOCKED_KEYWORDS = {'telegram','jobsnet','acciojob', 'whatsapp'}


def _is_blocked(url):
    """Return True if `url` contains any blocked keyword (case-insensitive)."""
    lowered = url.lower()
    return any(block in lowered for block in BLOCKED_KEYWORDS)


normalized_jobs = []
for job in job_details:
    # Everything except the multi-valued key is shared by all rows of a job.
    base = {key: value for key, value in job.items() if key != 'apply_urls'}
    # dict.fromkeys de-duplicates while keeping the input order.
    kept_urls = [
        url for url in dict.fromkeys(job.get('apply_urls', []))
        if not _is_blocked(url)
    ]
    # `or [None]` emits a single placeholder row when no URL survives.
    for url in kept_urls or [None]:
        normalized_jobs.append({**base, 'apply_url': url})
cleaned_jobs = normalized_jobs

In [None]:
# Build the final table from `cleaned_jobs` and fill in display defaults.
#
# A column only exists in the DataFrame if at least one scraped job had that
# key, so every column is created first when missing instead of assuming it
# is present (attribute access such as df.location would raise otherwise).
df = pd.DataFrame(cleaned_jobs)

fill_defaults = {
    'location': 'Across India',
    'experience': 'Freshers',
    'description': 'No detailed description available. Please visit the apply link for more information.',
}
for column, default in fill_defaults.items():
    if column in df.columns:
        df[column] = df[column].fillna(default)
    else:
        df[column] = default

# Guarantee both URL columns exist (they are absent when nothing was scraped).
for column in ('listing_url', 'apply_url'):
    if column not in df.columns:
        df[column] = None
# Jobs without a usable apply link fall back to their listing page.
df['apply_url'] = df['apply_url'].fillna(df['listing_url'])
df.head()

In [None]:
from sqlalchemy import create_engine
import pandas as pd

# Database connection.
#
# The password is deliberately NOT stored in the notebook. Resolution order:
#   1. JOBS_DB_URL  - a complete SQLAlchemy URL, used as-is
#   2. PGPASSWORD   - password for the default local connection below
#   3. interactive prompt (input is not echoed)
# These imports live here because this block needs them and the notebook's
# import lines are outside it.
import os
from getpass import getpass
from urllib.parse import quote

DB_URL = os.environ.get("JOBS_DB_URL")
if not DB_URL:
    _db_password = os.environ.get("PGPASSWORD") or getpass("PostgreSQL password for user 'postgres': ")
    # Percent-encode the password so characters such as "@", ":" or "/"
    # cannot corrupt the URL.
    DB_URL = f"postgresql+psycopg2://postgres:{quote(_db_password, safe='')}@localhost:35432/pgvector"
engine = create_engine(DB_URL)

In [None]:
# Append the rows of `df` to the "jobs" table through `engine`.
# index=False keeps the DataFrame index out of the table; method="multi" with
# chunksize=500 writes multi-row INSERT statements of at most 500 rows each.
# NOTE(review): if_exists="append" means re-running this cell inserts the same
# rows again - confirm whether the table has a uniqueness constraint.
df.to_sql("jobs", engine, if_exists="append", index=False, method="multi", chunksize=500)