In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import pandas as pd
from collections import Counter

In [None]:
# Scrape up to MAX_JOBS "software engineer" postings from rozee.pk into `job_data`.
#
# Flow per results page:
#   1. collect the detail-page URL of every job card (plain strings),
#   2. visit each URL and read the fields from the detail page,
#   3. return to the results page and follow the "Next" link.
# URLs are collected first because a WebElement becomes stale as soon as the
# browser navigates away from the page it was found on.
MAX_JOBS = 40
SEARCH_URL = "https://www.rozee.pk/job/jsearch/q/software-engineer"
JOB_COLUMNS = [
    "Job Title",
    "Company Name",
    "Location",
    "Required Skills",
    "Job Type",
    "Job Description",
]

driver = webdriver.Chrome()
job_list = []
seen_links = set()  # detail URLs already queued, so a posting is never scraped twice

try:
    driver.get(SEARCH_URL)
    time.sleep(5)

    while len(job_list) < MAX_JOBS:
        listing_url = driver.current_url

        # Step 1: gather hrefs while the job cards are still attached to the DOM.
        page_links = []
        for job in driver.find_elements(By.CLASS_NAME, "job"):  # Adjust the class name if needed
            try:
                href = job.find_element(By.TAG_NAME, "a").get_attribute("href")
            except Exception as e:
                print("Error extracting job information: ", e)
                continue
            if href and href not in seen_links:
                seen_links.add(href)
                page_links.append(href)

        # No new postings on this page: stop instead of paging forever.
        if not page_links:
            break

        # Step 2: visit each posting and read its details.
        for job_link in page_links:
            if len(job_list) >= MAX_JOBS:
                break
            try:
                driver.get(job_link)
                time.sleep(3)

                # Title/company live on the detail page, so search from the driver,
                # not from the (now stale) listing card.
                # NOTE(review): these two selectors were marked "CHECK" by the
                # author -- confirm them against the live page.
                title = driver.find_element(By.XPATH, "//h1[@class='jtitle']/bdi").text
                company = driver.find_element(By.XPATH, "//h2[@class='cname']/bdi").text

                job_location_elements = driver.find_elements(
                    By.XPATH,
                    "//a[contains(@href, 'jobs-in-')][contains(@class, 'jblk')]",
                )
                if job_location_elements:
                    job_location = ", ".join(elem.text for elem in job_location_elements)
                else:
                    job_location = "N/A"

                job_type_elements = driver.find_elements(
                    By.XPATH,
                    "//a[contains(@href, 'full-time-jobs-in-pakistan') or contains(@href, 'permanent-jobs-in-pakistan')][contains(@class, 'jblk')]",
                )
                if job_type_elements:
                    job_type = ", ".join(elem.text for elem in job_type_elements)
                else:
                    job_type = "N/A"

                # The "Skills" block is optional; find_elements returns [] instead of
                # raising, so a posting without it is kept with skills "N/A".
                skills_sections = driver.find_elements(
                    By.XPATH,
                    "//h4[contains(text(), 'Skills')]/following-sibling::div[contains(@class, 'jcnt')]",
                )
                skill_elements = (
                    skills_sections[0].find_elements(By.TAG_NAME, "a") if skills_sections else []
                )
                skills = [skill.text.strip() for skill in skill_elements] if skill_elements else ["N/A"]

                # Read the description from the container itself: its .text already
                # includes all nested elements, whereas joining the text of every
                # descendant repeats nested content once per nesting level.
                description_containers = driver.find_elements(By.XPATH, "//div[@dir='ltr']")
                container_text = description_containers[0].text if description_containers else ""
                job_description = "\n".join(
                    line.strip() for line in container_text.splitlines() if line.strip()
                )
                # The literal triple-quote wrapping is kept as in the original output format.
                job_description = f'"""{job_description}"""' if job_description else '"""N/A"""'

                job_list.append({
                    "Job Title": title,
                    "Company Name": company,
                    "Location": job_location,
                    "Required Skills": ", ".join(skills),
                    "Job Type": job_type,
                    "Job Description": job_description
                })

            except Exception as e:
                # Best effort: report the failure and move on to the next posting.
                print("Error extracting job information: ", e)

        if len(job_list) >= MAX_JOBS:
            break

        # Step 3: go back to the results page and click 'Next' if available.
        driver.get(listing_url)
        time.sleep(3)
        next_buttons = driver.find_elements(By.LINK_TEXT, "Next")
        if not next_buttons:
            break
        try:
            next_buttons[0].click()
        except Exception as e:
            print("Could not open the next results page: ", e)
            break
        time.sleep(5)
finally:
    # Always close the browser, even if scraping fails or is interrupted.
    driver.quit()

# Explicit columns keep the frame's schema stable even when nothing was scraped.
job_data = pd.DataFrame(job_list, columns=JOB_COLUMNS)
job_data


In [None]:
# Tally every scraped job title and report the ten that occur most often.
title_counts = Counter(job_data["Job Title"])
most_common_titles = title_counts.most_common(10)
print("Most Common Job Titles:", most_common_titles)

# Calculate the average salary in Lahore (assuming numeric salary values are extracted)
def extract_salary(salary):
    """Convert a scraped salary value to a float.

    Parameters
    ----------
    salary : str, number, or None
        Text such as ``"PKR 50,000"``, the placeholder ``"N/A"``, an already
        numeric value (e.g. when the cell is re-run on parsed data), or a
        missing value (``None`` / NaN).

    Returns
    -------
    float or None
        The numeric amount, or ``None`` when the value is missing, is the
        ``"N/A"`` placeholder, or cannot be parsed as a single number.
    """
    if not isinstance(salary, str):
        # Non-string input has no .replace(); accept plain numbers and treat
        # anything else (None, NaN, other objects) as missing.
        try:
            amount = float(salary)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if amount != amount else amount

    if salary == "N/A":
        return None
    try:
        return float(salary.replace("PKR", "").replace(",", ""))
    except ValueError:
        return None

# Average salary in Lahore. The scraper does not necessarily produce a "Salary"
# column, so check for it instead of failing with a KeyError.
if "Salary" in job_data.columns:
    # Only parse values that are still text, so re-running this cell on an
    # already-converted column is harmless; to_numeric turns None into NaN.
    job_data["Salary"] = pd.to_numeric(
        job_data["Salary"].apply(
            lambda value: extract_salary(value) if isinstance(value, str) else value
        ),
        errors="coerce",
    )
    in_lahore = job_data["Location"].str.contains("Lahore", na=False)
    avg_salary_lahore = job_data.loc[in_lahore, "Salary"].mean()
else:
    print("No 'Salary' column in job_data; average salary cannot be computed.")
    avg_salary_lahore = float("nan")
print("Average Salary in Lahore:", avg_salary_lahore)

# Determine the most frequently required skills. Skills are stored per posting as
# one ", "-joined string; the "N/A" placeholder and empty entries are not skills.
all_skills = [
    skill.strip()
    for skills_text in job_data["Required Skills"].dropna()
    for skill in skills_text.split(", ")
    if skill.strip() and skill.strip() != "N/A"
]
most_common_skills = Counter(all_skills).most_common(10)
print("Most Common Required Skills:", most_common_skills)

job_data.to_csv("rozee_job_postings.csv", index=False)