# **Scrapping Indeed Job Listings in Jupyter Notebook**

## **Step-1: Import Libraries**

In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

## **Step-2: Set Up the Indeed URL for Scraping**

In [8]:
# Base URL for Indeed job search (Data Science jobs in India)
base_url = "https://in.indeed.com/jobs?q=Data+Science&l=India"

# Headers to mimic a real user request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

## **Step-3: Function to Scrape Multiple Pages**

In [9]:
def scrape_indeed_jobs(pages=1):
    job_list = []

    for page in range(0, pages * 10, 10):  # Indeed paginates every 10 results
        url = f"{base_url}&start={page}"
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")

        jobs = soup.find_all("div", class_="job_seen_beacon")

        for job in jobs:
            title = job.find("h2").text.strip() if job.find("h2") else "N/A"
            company = job.find("span", class_="companyName").text.strip() if job.find("span", class_="companyName") else "N/A"
            location = job.find("div", class_="companyLocation").text.strip() if job.find("div", class_="companyLocation") else "N/A"
            salary = job.find("div", class_="attribute_snippet").text.strip() if job.find("div", class_="attribute_snippet") else "N/A"
            summary = job.find("div", class_="job-snippet").text.strip() if job.find("div", class_="job-snippet") else "N/A"
            date_posted = job.find("span", class_="date").text.strip() if job.find("span", class_="date") else "N/A"

            job_list.append({
                "Job Title": title,
                "Company": company,
                "Location": location,
                "Salary": salary,
                "Job Summary": summary,
                "Date Posted": date_posted
            })

        print(f"Scraped page {page // 10 + 1}")
        time.sleep(2)  # Pause to avoid being blocked

    return pd.DataFrame(job_list)

## **Step-4: Scrape 5 Pages & Store Data in CSV**

In [10]:
df = scrape_indeed_jobs(pages=5)

# Save to CSV
df.to_csv("../0_Data/indeed_jobs.csv", index=False)

print("Scraping complete. Data saved in 'data/indeed_jobs.csv'.")

Scraped page 1
Scraped page 2
Scraped page 3
Scraped page 4
Scraped page 5
Scraping complete. Data saved in 'data/indeed_jobs.csv'.


In [12]:
base_url = "https://in.indeed.com/jobs?q=Data+Science&l=India"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

response = requests.get(base_url, headers=headers)
print("Status Code: ", response.status_code)  # Should return 200 if successful
print(response.text[:500])   # Print first 500 characters of HTML response

Status Code:  403
<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title>Security Check - Indeed.com</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>:root{color-scheme:light dark;--background-color:#fff;--primary-1000:#0d2d5e;--primary-900:#164081;--primary-800:#2557a7;--primary-700:#3f73d3;--primary-600:#6792f0;--neutral-1000:#2d2d2d;--neutral-900:#424242;--neutral-400:#d4d2d0;--dark-1000:#040606;--link-color:var(--primary-800);--link-color-hover:var(--primary-900);--


In [13]:
soup = BeautifulSoup(response.text, "html.parser")
jobs = soup.find_all("div", class_="job_seen_beacon")

print(f"Found {len(jobs)} jobs on the page.")  # Should not be 0

Found 0 jobs on the page.


In [14]:
for job in jobs[:5]:  # Print first 5 job titles
    title = job.find("h2").text.strip() if job.find("h2") else "N/A"
    print(title)

In [15]:
import requests
from bs4 import BeautifulSoup
import random

# Rotate User-Agent headers to appear like a real browser
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0"
]

session = requests.Session()
headers = {
    "User-Agent": random.choice(USER_AGENTS),  # Random User-Agent
    "Referer": "https://www.google.com/",  # Simulate coming from Google
}

url = "https://in.indeed.com/jobs?q=Data+Science&l=India"
response = session.get(url, headers=headers)

print("Status Code:", response.status_code)
print(response.text[:500])  # Check if we get actual job listings


Status Code: 403
<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title>Security Check - Indeed.com</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>:root{color-scheme:light dark;--background-color:#fff;--primary-1000:#0d2d5e;--primary-900:#164081;--primary-800:#2557a7;--primary-700:#3f73d3;--primary-600:#6792f0;--neutral-1000:#2d2d2d;--neutral-900:#424242;--neutral-400:#d4d2d0;--dark-1000:#040606;--link-color:var(--primary-800);--link-color-hover:var(--primary-900);--


In [18]:
pip install selenium webdriver-manager

Note: you may need to restart the kernel to use updated packages.Collecting selenium
  Using cached selenium-4.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting webdriver-manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.17 (from selenium)
  Using cached trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Using cached trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting websocket-client~=1.8 (from selenium)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting attrs>=23.2.0 (from trio~=0.17->selenium)
  Using cached attrs-25.1.0-py3-none-any.whl.metadata (10 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->se


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time

# Set up Selenium
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in background (remove this to see browser)
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Open Indeed page
driver.get("https://in.indeed.com/jobs?q=Data+Science&l=India")
time.sleep(3)  # Wait for page to load

# Extract job listings
jobs = driver.find_elements(By.CLASS_NAME, "job_seen_beacon")
print(f"Found {len(jobs)} jobs.")

driver.quit()


Found 0 jobs.


In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of TimesJobs search results for Data Science
url = "https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&txtKeywords=Data+Science&txtLocation="

# Send a request to fetch the webpage
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
response = requests.get(url, headers=headers)

# Parse the HTML content
soup = BeautifulSoup(response.text, "html.parser")

# Find all job postings
jobs = soup.find_all("li", class_="clearfix job-bx wht-shd-bx")

# Extract job details
job_list = []
for job in jobs:
    title = job.find("h2").text.strip()
    company = job.find("h3", class_="joblist-comp-name").text.strip()
    location = job.find("ul", class_="top-jd-dtl clearfix").find_all("li")[1].text.strip()
    skills = job.find("span", class_="srp-skills").text.strip()
    posted_date = job.find("span", class_="sim-posted").text.strip()

    job_list.append([title, company, location, skills, posted_date])

# Save to CSV
df = pd.DataFrame(job_list, columns=["Job Title", "Company", "Location", "Skills", "Posted Date"])
df.to_csv("data/timesjobs_jobs.csv", index=False)

print("✅ Scraping completed! Data saved in 'data/timesjobs_jobs.csv'.")


AttributeError: 'NoneType' object has no attribute 'find_all'

In [21]:
for job in jobs:
    title = job.find("h2").text.strip()
    company = job.find("h3", class_="joblist-comp-name").text.strip()
    
    # Check if location exists
    location_tag = job.find("ul", class_="top-jd-dtl clearfix")
    if location_tag:  
        location_items = location_tag.find_all("li")  # Get all <li> inside <ul>
        location = location_items[1].text.strip() if len(location_items) > 1 else "Not specified"
    else:
        location = "Not specified"

    skills = job.find("span", class_="srp-skills").text.strip()
    posted_date = job.find("span", class_="sim-posted").text.strip()

    job_list.append([title, company, location, skills, posted_date])


AttributeError: 'NoneType' object has no attribute 'text'

In [28]:
# TimesJobs URL for Data Analyst jobs in India
url = "https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&searchTextSrc=as&searchTextText=%22Data+Analyst%22%2CIndia&txtKeywords=%22Data+Analyst%22%2C&cboWorkExp1=2&txtLocation=India"

# Set user-agent to avoid being blocked
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

# Send GET request
response = requests.get(url, headers=headers)

# Save HTML content to a file
with open("timesjobs_page.html", "w", encoding="utf-8") as file:
    file.write(response.text)

print("HTML page saved as 'timesjobs_page.html'. Open it to inspect the structure.")

HTML page saved as 'timesjobs_page.html'. Open it to inspect the structure.


In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL for Data Analyst jobs in India
url = "https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&searchTextSrc=as&searchTextText=%22Data+Analyst%22%2CIndia&txtKeywords=%22Data+Analyst%22%2C&cboWorkExp1=2&txtLocation=India"

# Set User-Agent to mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send request to TimesJobs
response = requests.get(url, headers=headers)

# Parse HTML using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find all job postings
jobs = soup.find_all("li", class_="clearfix job-bx wht-shd-bx")

# Extract job details
job_list = []
for job in jobs:
    title = job.find("h2").text.strip() if job.find("h2") else "N/A"
    company = job.find("h3", class_="joblist-comp-name").text.strip() if job.find("h3", class_="joblist-comp-name") else "N/A"
    
    # Extract multiple locations
    location_tags = job.find_all("span", class_="srp-zindex location-tru")

    # Join multiple locations into a single string
    location = ", ".join([loc.text.strip() for loc in location_tags]) if location_tags else "Not specified"


    # Extract skills
    skills = job.find("span", class_="srp-skills").text.strip() if job.find("span", class_="srp-skills") else "N/A"
    
    # Extract posted date
    posted_date = job.find("span", class_="sim-posted").text.strip() if job.find("span", class_="sim-posted") else "N/A"

    job_list.append([title, company, location, skills, posted_date])

# Save extracted data to CSV
df = pd.DataFrame(job_list, columns=["Job Title", "Company", "Location", "Skills", "Posted Date"])
df.to_csv("timesjobs_jobs.csv", index=False)

print("✅ Scraping completed! Data saved in 'data/timesjobs_jobs.csv'.")


✅ Scraping completed! Data saved in 'data/timesjobs_jobs.csv'.


In [34]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL for Data Analyst jobs in India
url = "https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&searchTextSrc=as&searchTextText=%22Data+Analyst%22%2CIndia&txtKeywords=%22Data+Analyst%22%2C&cboWorkExp1=2&txtLocation=India"

# Set User-Agent to mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send request to TimesJobs
response = requests.get(url, headers=headers)

# Parse HTML using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find all job postings
jobs = soup.find_all("li", class_="clearfix job-bx wht-shd-bx")

# Extract job details
job_list = []
for job in jobs:
    title = job.find("h2").text.strip() if job.find("h2") else "N/A"
    company = job.find("h3", class_="joblist-comp-name").text.strip() if job.find("h3", class_="joblist-comp-name") else "N/A"
    
    # Extract multiple locations
    location_tags = job.find_all("span", class_="srp-zindex location-tru")
    location = ", ".join([loc.text.strip() for loc in location_tags]) if location_tags else "Not specified"

    # Extract skills
    skills = job.find("span", class_="srp-skills").text.strip() if job.find("span", class_="srp-skills") else "N/A"
    
    # Extract posted date
    posted_date = job.find("span", class_="sim-posted").text.strip() if job.find("span", class_="sim-posted") else "N/A"

    job_list.append([title, company, location, skills, posted_date])

# Save extracted data to CSV
df = pd.DataFrame(job_list, columns=["Job Title", "Company", "Location", "Skills", "Posted Date"])
df.to_csv("timesjobs_jobs2.csv", index=False)

print("✅ Scraping completed! Data saved in 'data/timesjobs_jobs.csv'.")


✅ Scraping completed! Data saved in 'data/timesjobs_jobs.csv'.


In [36]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL for Data Analyst jobs in India
url = "https://www.timesjobs.com/candidate/job-search.html?searchType=personalizedSearch&from=submit&searchTextSrc=as&searchTextText=%22Data+Analyst%22%2CIndia&txtKeywords=%22Data+Analyst%22%2C&cboWorkExp1=2&txtLocation=India"

# Set User-Agent to mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Send request to TimesJobs
response = requests.get(url, headers=headers)

# Parse HTML using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find all job postings
jobs = soup.find_all("li", class_="clearfix job-bx wht-shd-bx")

# Extract job details
job_list = []
for job in jobs:
    title = job.find("h2").text.strip() if job.find("h2") else "N/A"
    company = job.find("h3", class_="joblist-comp-name").text.strip() if job.find("h3", class_="joblist-comp-name") else "N/A"
    
    # Extract location from the correct tag
    location_tag = job.find("ul", class_="list-job-dtl clearfix")
    if location_tag:
        location_items = location_tag.find_all("li")
        location = [li.text.strip() for li in location_items if "location" in str(li)]
        location = ", ".join(location) if location else "Not specified"
    else:
        location = "Not specified"

    # Extract skills
    skills = job.find("span", class_="srp-skills").text.strip() if job.find("span", class_="srp-skills") else "N/A"
    
    # Extract posted date
    posted_date = job.find("span", class_="sim-posted").text.strip() if job.find("span", class_="sim-posted") else "N/A"

    job_list.append([title, company, location, skills, posted_date])

# Save extracted data to CSV
df = pd.DataFrame(job_list, columns=["Job Title", "Company", "Location", "Skills", "Posted Date"])
df.to_csv("timesjobs_jobs3.csv", index=False)

print("✅ Scraping completed! Data saved in 'data/timesjobs_jobs.csv'.")


✅ Scraping completed! Data saved in 'data/timesjobs_jobs.csv'.


In [41]:
import pandas as pd
import random
from faker import Faker

fake = Faker()

# Define job titles, companies, and skills
job_titles = ["Data Analyst", "Senior Data Analyst", "Data Scientist", "Machine Learning Engineer"]
companies = ["TCS", "Infosys", "Google", "Amazon", "Flipkart", "Wipro", "Deloitte"]
locations = ["Bangalore", "Mumbai", "Delhi", "Pune", "Hyderabad", "Chennai"]
skills = ["Python, SQL, Power BI", "Tableau, Excel, R", "Machine Learning, Deep Learning", "Data Wrangling, Pandas, NumPy"]

# Generate job listings
jobs = []
for _ in range(50):  # Generate 50 job listings
    job = {
        "Job Title": random.choice(job_titles),
        "Company": random.choice(companies),
        "Location": random.choice(locations) + ", India",
        "Skills": random.choice(skills),
        "Experience Required": f"{random.randint(1, 10)}+ years",
        "Salary": f"₹{random.randint(6, 20)}L per annum",
        "Date Posted": f"Posted {random.randint(1, 14)} days ago"
    }
    jobs.append(job)

# Convert to DataFrame & Save
df = pd.DataFrame(jobs)
df.to_csv("synthetic_jobs.csv", index=False)

print("✅ Synthetic job dataset generated and saved as 'data/synthetic_jobs.csv'.")


✅ Synthetic job dataset generated and saved as 'data/synthetic_jobs.csv'.


In [39]:
pip install faker

Note: you may need to restart the kernel to use updated packages.Collecting faker
  Downloading Faker-35.2.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-35.2.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------- ----------------------- 0.8/1.9 MB 6.7 MB/s eta 0:00:01
   -------------------------------------- - 1.8/1.9 MB 4.6 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 4.4 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-35.2.0




[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip
