<a href="https://colab.research.google.com/github/satyam26en/JOB/blob/main/JOBS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Import Libraries and Download the Dataset**

In [60]:
import pandas as pd
import requests
import re
from io import BytesIO
from zipfile import ZipFile
from urllib.parse import urlparse

# URL to the GitHub file
url = 'https://github.com/satyam26en/JOB/blob/main/jobs.zip?raw=true'

# Send a GET request to the URL
response = requests.get(url)

# Ensure the request was successful
if response.status_code == 200:
    # Load the content of the response into a BytesIO object
    zip_file = BytesIO(response.content)

    # Extract the content of the zip file
    with ZipFile(zip_file) as z:
        # List the files in the zip
        print(z.namelist())

        # Load the CSV file into a DataFrame (assuming there's a single CSV file in the zip)
        with z.open('jobs.csv') as f:
            jobs_df = pd.read_csv(f)
else:
    print(f"Failed to download file: {response.status_code}")

# Display the first few rows of the DataFrame
jobs_df.head()


['jobs.csv']


Unnamed: 0,job_id,job_role,company,experience,salary,location,rating,reviews,resposibilities,posted_on,job_link,company_link
0,70123010000.0,Branch Banking - Calling For Women Candidates,Hdfc Bank,1-6 Yrs,Not disclosed,"Kolkata, Hyderabad/Secunderabad, Pune, Ahmedab...",4.0,39110 Reviews,"Customer Service,Sales,Relationship Management",1 Day Ago,https://www.naukri.com/job-listings-branch-ban...,https://www.naukri.com/hdfc-bank-jobs-careers-213
1,60123910000.0,Product Owner Senior Manager,Accenture,11-15 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Product management,Market analysis,Change mana...",1 Day Ago,https://www.naukri.com/job-listings-product-ow...,https://www.naukri.com/accenture-jobs-careers-...
2,60123910000.0,Employee Relations and Policies Associate Manager,Accenture,3-7 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Business process,Change management,Team manage...",1 Day Ago,https://www.naukri.com/job-listings-employee-r...,https://www.naukri.com/accenture-jobs-careers-...
3,60123910000.0,Employee Relations and Policies Specialist,Accenture,3-7 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Business process,Change management,Team manage...",1 Day Ago,https://www.naukri.com/job-listings-employee-r...,https://www.naukri.com/accenture-jobs-careers-...
4,60123010000.0,SAP BO Consultant,Mindtree,5-7 Yrs,Not disclosed,"Hybrid - Kolkata, Hyderabad/Secunderabad, Pune...",4.1,3759 Reviews,"SAP BO,PL / SQL,Oracle SQL,SAP Business Object...",1 Day Ago,https://www.naukri.com/job-listings-sap-bo-con...,https://www.naukri.com/mindtree-jobs-careers-3...


# **2. Initial Examination**

In [5]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(jobs_df.head())

# Dataset summary
print("\nDataset summary:")
print(jobs_df.info())

# Descriptive statistics
print("\nDescriptive Statistics:")
print(jobs_df.describe(include='all'))

# Check for missing values
print("\nMissing values in each column:")
print(jobs_df.isnull().sum())


First few rows of the dataset:
         job_id                                           job_role    company  \
0  7.012301e+10      Branch Banking - Calling For Women Candidates  Hdfc Bank   
1  6.012391e+10                       Product Owner Senior Manager  Accenture   
2  6.012391e+10  Employee Relations and Policies Associate Manager  Accenture   
3  6.012391e+10         Employee Relations and Policies Specialist  Accenture   
4  6.012301e+10                                  SAP BO Consultant   Mindtree   

  experience         salary  \
0    1-6 Yrs  Not disclosed   
1  11-15 Yrs  Not disclosed   
2    3-7 Yrs  Not disclosed   
3    3-7 Yrs  Not disclosed   
4    5-7 Yrs  Not disclosed   

                                            location  rating        reviews  \
0  Kolkata, Hyderabad/Secunderabad, Pune, Ahmedab...     4.0  39110 Reviews   
1  Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...     4.1  32129 Reviews   
2  Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...     4

# **3  Data Cleaning and Preparation**

In [8]:
# Correct column name if necessary (e.g., responsibilities might be resposibilities)
jobs_df.rename(columns={'resposibilities': 'responsibilities'}, inplace=True)

# Handling missing values
jobs_df.dropna(subset=['job_role', 'company', 'posted_on', 'job_link', 'company_link'], inplace=True)
jobs_df['experience'].fillna('Not specified', inplace=True)
jobs_df['location'].fillna('Unknown', inplace=True)
jobs_df['rating'] = pd.to_numeric(jobs_df['rating'], errors='coerce')

# Convert reviews to string first
jobs_df['reviews'] = jobs_df['reviews'].astype(str).str.replace(' Reviews', '').astype(float, errors='ignore')

median_rating = jobs_df['rating'].median()
median_reviews = jobs_df['reviews'].median()
jobs_df['rating'].fillna(median_rating, inplace=True)
jobs_df['reviews'].fillna(median_reviews, inplace=True)
jobs_df['responsibilities'].fillna('Not specified', inplace=True)

# Correcting data types
jobs_df['job_id'] = jobs_df['job_id'].astype(str)
jobs_df['experience'] = jobs_df['experience'].str.strip()
jobs_df['salary'] = jobs_df['salary'].str.strip()
jobs_df['location'] = jobs_df['location'].str.strip()

# Removing duplicates
jobs_df = jobs_df.drop_duplicates()

# Check the dataset again for any remaining missing values
print(jobs_df.isnull().sum())


job_id              0
job_role            0
company             0
experience          0
salary              0
location            0
rating              0
reviews             0
responsibilities    0
posted_on           0
job_link            0
company_link        0
dtype: int64


# **4. Exploratory Data Analysis (EDA)**

1.  job_id

In [9]:
# Ensure each `job_id` is unique and drop duplicates
duplicate_job_ids = jobs_df['job_id'].duplicated().sum()
print(f"Number of duplicate job IDs: {duplicate_job_ids}")

# Drop duplicates if any
jobs_df = jobs_df.drop_duplicates(subset='job_id')

# Check for missing job IDs
missing_job_ids = jobs_df['job_id'].isnull().sum()
print(f"Number of missing job IDs: {missing_job_ids}")


Number of duplicate job IDs: 336
Number of missing job IDs: 0


In [12]:
# Ensure each `job_id` is unique and drop duplicates
duplicate_job_ids = jobs_df['job_id'].duplicated().sum()
print(f"Number of duplicate job IDs: {duplicate_job_ids}")

# Drop duplicates if any
jobs_df = jobs_df.drop_duplicates(subset='job_id')

# Verify that there are no more duplicate job IDs
duplicate_job_ids_after = jobs_df['job_id'].duplicated().sum()
print(f"Number of duplicate job IDs after cleaning: {duplicate_job_ids_after}")

# Check for missing job IDs
missing_job_ids = jobs_df['job_id'].isnull().sum()
print(f"Number of missing job IDs: {missing_job_ids}")


Number of duplicate job IDs: 0
Number of duplicate job IDs after cleaning: 0
Number of missing job IDs: 0


2. job_role

In [66]:
# Analyze the distribution of job roles
print("\nTop 10 job roles:")
print(jobs_df['job_role'].value_counts().head(10))

# Standardize job role names (example, actual cleaning may vary)
jobs_df['job_role'] = jobs_df['job_role'].str.strip().str.title()

# Check for unique job roles
print(f"Number of unique job roles: {jobs_df['job_role'].nunique()}")



Top 10 job roles:
job_role
Business Development Executive      305
Solution Architect                  278
Customer Service Associate          277
Sales Executive                     275
.Net Fullstack Developer            258
People Advisor Senior Analyst       256
SAP BO Consultant                   255
Business Development Manager        254
Senior Java BED ( PAN India)- KG    254
PowerBI Developer                   254
Name: count, dtype: int64
Number of unique job roles: 50340


3.  company

In [13]:
# Check for unique company names and standardize them
print(f"Number of unique companies: {jobs_df['company'].nunique()}")

# Standardize company names
jobs_df['company'] = jobs_df['company'].str.strip().str.title()


Number of unique companies: 15313


4. experience

In [14]:
# Analyze the range and distribution of experience levels
print("\nExperience level distribution:")
print(jobs_df['experience'].value_counts().head(10))

# Convert experience levels to a consistent format
jobs_df['experience'] = jobs_df['experience'].str.replace(' Yrs', '').str.strip()



Experience level distribution:
experience
5-10 Yrs    7488
3-8 Yrs     3914
2-7 Yrs     3059
1-6 Yrs     3014
4-9 Yrs     2918
0-5 Yrs     2805
2-5 Yrs     2759
1-3 Yrs     2679
4-6 Yrs     2422
3-5 Yrs     2362
Name: count, dtype: int64


In [24]:
# Convert to DataFrame
df = pd.DataFrame(data)

# Function to extract the upper range
def extract_upper_range(experience):
    match = re.search(r'(\d+)-(\d+)', experience)
    if match:
        return int(match.group(2))
    return None

# Apply the function to extract the upper range
df['upper_experience'] = df['experience'].apply(extract_upper_range)

# Function to categorize experience
def categorize_experience(upper_experience):
    if upper_experience is None:
        return 'Unknown'
    elif upper_experience <= 1:
        return 'Fresher'
    elif upper_experience <= 3:
        return 'Junior'
    elif upper_experience <= 5:
        return 'Medium'
    elif upper_experience <= 10:
        return 'Senior'
    else:
        return 'Expert'

# Apply the function to categorize experience
df['experience_category'] = df['upper_experience'].apply(categorize_experience)

# Group by experience category and sum the counts
grouped_df = df.groupby('experience_category')['count'].sum().reset_index()

# Display the resulting DataFrame
print(grouped_df)


  experience_category  count
0              Junior   2679
1              Medium   7926
2              Senior  22815


5. salary

In [25]:
# Identify and handle missing salary information
print(f"Number of missing salary entries: {jobs_df['salary'].isnull().sum()}")

# Convert salary entries to a consistent format
jobs_df['salary'] = jobs_df['salary'].str.strip().replace('Not disclosed', 'Not Disclosed')

# Analyze the distribution of salary entries
print("\nSalary distribution:")
print(jobs_df['salary'].value_counts().head(10))


Number of missing salary entries: 480

Salary distribution:
salary
Not Disclosed               45544
6,50,000 - 9,00,000 PA.      3330
6,00,000 - 8,00,000 PA.      1114
5,00,000 - 10,00,000 PA.     1087
2,50,000 - 3,50,000 PA.       599
2,00,000 - 3,00,000 PA.       486
3,00,000 - 5,00,000 PA.       450
3,00,000 - 6,00,000 PA.       424
2,00,000 - 4,00,000 PA.       382
2,00,000 - 5,00,000 PA.       367
Name: count, dtype: int64


In [71]:
# Ensure the salary column is treated as strings
jobs_df['salary'] = jobs_df['salary'].astype(str)

# Step 2: Extract Upper Salary Range
def extract_upper_salary(salary):
    if salary == "Not Disclosed":
        return None
    else:
        match = re.search(r'-\s*([\d,]+)\s*PA', salary)
        if match:
            upper_salary = match.group(1).replace(',', '')
            return int(upper_salary)
        return None

jobs_df['upper_salary'] = jobs_df['salary'].apply(extract_upper_salary)

# Step 4: Categorize into Salary Bands
def categorize_salary(upper_salary):
    if pd.isna(upper_salary):
        return 'Not Disclosed'
    elif upper_salary < 300000:
        return 'Low'
    elif upper_salary < 600000:
        return 'Medium'
    elif upper_salary < 1000000:
        return 'Good'
    else:
        return 'High'

jobs_df['salary_band'] = jobs_df['upper_salary'].apply(categorize_salary)

# Step 5: Count and Display the Salary Bands
salary_band_counts = jobs_df['salary_band'].value_counts()

# Display the counts of each salary band, including 'Not Disclosed'
print(salary_band_counts)



salary_band
Not Disclosed    46289
Medium           10904
Good             10130
High              8201
Low               3623
Name: count, dtype: int64


6. location

In [36]:
# Examine the distribution of job locations
print("\nTop 10 job locations:")
print(jobs_df['location'].value_counts().head(10))

# Standardize location names
jobs_df['location'] = jobs_df['location'].str.strip().str.title()



Top 10 job locations:
location
Bangalore/Bengaluru       10035
Hyderabad/Secunderabad     3736
Pune                       3216
Chennai                    2953
Mumbai                     2818
Gurgaon/Gurugram           2733
Noida                      1678
Permanent Remote           1644
Ahmedabad                  1227
Kolkata                     923
Name: count, dtype: int64


7. Rating

In [37]:
# Analyze the distribution of company ratings
print("\nRating distribution:")
print(jobs_df['rating'].describe())



Rating distribution:
count    42948.000000
mean         3.940144
std          0.519211
min          1.000000
25%          3.700000
50%          4.000000
75%          4.200000
max          5.000000
Name: rating, dtype: float64


8. reviews

In [81]:
import pandas as pd
import requests
from io import BytesIO
from zipfile import ZipFile
import re

# Step 1: Data Import
url = 'https://github.com/satyam26en/JOB/blob/main/jobs.zip?raw=true'
response = requests.get(url)

if response.status_code == 200:
    zip_file = BytesIO(response.content)
    with ZipFile(zip_file) as z:
        with z.open('jobs.csv') as f:
            jobs_df = pd.read_csv(f)
else:
    raise Exception(f"Failed to download file: {response.status_code}")

# Ensure the reviews column is treated as strings
jobs_df['reviews'] = jobs_df['reviews'].astype(str)

# Step 2: Clean and Extract Numeric Values from the Reviews Column
def extract_review_number(review_str):
    if pd.isna(review_str):
        return None
    numbers = re.findall(r'\d+', str(review_str))
    return int(numbers[0]) if numbers else None

# Extract numeric review values
jobs_df['numeric_reviews'] = jobs_df['reviews'].apply(extract_review_number)

# Display the numeric reviews
numeric_reviews = jobs_df['numeric_reviews'].dropna().unique()

# Sort and display the top 10 numeric reviews
top_ten_reviews = sorted(numeric_reviews, reverse=True)[:10]

top_ten_reviews


[50163.0,
 43672.0,
 39143.0,
 39110.0,
 33480.0,
 32151.0,
 32129.0,
 28658.0,
 28172.0,
 24041.0]

9.posted_on

In [53]:
# Display the `posted_on` column
print("Posted On column:")
print(jobs_df['posted_on'].head())


Posted On column:
0   NaT
1   NaT
2   NaT
3   NaT
4   NaT
Name: posted_on, dtype: datetime64[ns]


In [54]:
# Delete the `posted_on` column
jobs_df.drop(columns=['posted_on'], inplace=True)

# Verify that the column has been deleted
print("Columns after deleting `posted_on`:")
print(jobs_df.columns)


Columns after deleting `posted_on`:
Index(['job_id', 'job_role', 'company', 'experience', 'salary', 'location',
       'rating', 'reviews', 'resposibilities', 'job_link', 'company_link'],
      dtype='object')


10. job_link and company_link

In [63]:
# Delete the `job_link` and `company_link` columns
jobs_df.drop(columns=['job_link', 'company_link'], inplace=True)

# Verify that the columns have been deleted
print("Columns after deleting `job_link` and `company_link`:")
print(jobs_df.columns)


Columns after deleting `job_link` and `company_link`:
Index(['job_id', 'job_role', 'company', 'experience', 'salary', 'location',
       'rating', 'reviews', 'resposibilities', 'posted_on'],
      dtype='object')


# 5.

In [65]:
# Save the cleaned DataFrame to a CSV file in the current directory
cleaned_file_path = 'cleaned_jobs.csv'
jobs_df.to_csv(cleaned_file_path, index=False)

cleaned_file_path


'cleaned_jobs.csv'