<a href="https://colab.research.google.com/github/satyam26en/JOB/blob/main/JOBS2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Import Libraries and Download the Dataset**

In [6]:
import pandas as pd
import requests
import re
from io import BytesIO
from zipfile import ZipFile
from urllib.parse import urlparse

# URL to the GitHub file
url = 'https://github.com/satyam26en/JOB/blob/main/jobs.zip?raw=true'

# Send a GET request to the URL
response = requests.get(url)

# Ensure the request was successful
if response.status_code == 200:
    # Load the content of the response into a BytesIO object
    zip_file = BytesIO(response.content)

    # Extract the content of the zip file
    with ZipFile(zip_file) as z:
        # List the files in the zip
        print(z.namelist())

        # Load the CSV file into a DataFrame (assuming there's a single CSV file in the zip)
        with z.open('jobs.csv') as f:
            jobs_df = pd.read_csv(f)
else:
    print(f"Failed to download file: {response.status_code}")

# Display the first few rows of the DataFrame
jobs_df.head()


['jobs.csv']


Unnamed: 0,job_id,job_role,company,experience,salary,location,rating,reviews,resposibilities,posted_on,job_link,company_link
0,70123010000.0,Branch Banking - Calling For Women Candidates,Hdfc Bank,1-6 Yrs,Not disclosed,"Kolkata, Hyderabad/Secunderabad, Pune, Ahmedab...",4.0,39110 Reviews,"Customer Service,Sales,Relationship Management",1 Day Ago,https://www.naukri.com/job-listings-branch-ban...,https://www.naukri.com/hdfc-bank-jobs-careers-213
1,60123910000.0,Product Owner Senior Manager,Accenture,11-15 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Product management,Market analysis,Change mana...",1 Day Ago,https://www.naukri.com/job-listings-product-ow...,https://www.naukri.com/accenture-jobs-careers-...
2,60123910000.0,Employee Relations and Policies Associate Manager,Accenture,3-7 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Business process,Change management,Team manage...",1 Day Ago,https://www.naukri.com/job-listings-employee-r...,https://www.naukri.com/accenture-jobs-careers-...
3,60123910000.0,Employee Relations and Policies Specialist,Accenture,3-7 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Business process,Change management,Team manage...",1 Day Ago,https://www.naukri.com/job-listings-employee-r...,https://www.naukri.com/accenture-jobs-careers-...
4,60123010000.0,SAP BO Consultant,Mindtree,5-7 Yrs,Not disclosed,"Hybrid - Kolkata, Hyderabad/Secunderabad, Pune...",4.1,3759 Reviews,"SAP BO,PL / SQL,Oracle SQL,SAP Business Object...",1 Day Ago,https://www.naukri.com/job-listings-sap-bo-con...,https://www.naukri.com/mindtree-jobs-careers-3...


# **2. Initial Examination**

In [7]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(jobs_df.head())

First few rows of the dataset:
         job_id                                           job_role    company  \
0  7.012301e+10      Branch Banking - Calling For Women Candidates  Hdfc Bank   
1  6.012391e+10                       Product Owner Senior Manager  Accenture   
2  6.012391e+10  Employee Relations and Policies Associate Manager  Accenture   
3  6.012391e+10         Employee Relations and Policies Specialist  Accenture   
4  6.012301e+10                                  SAP BO Consultant   Mindtree   

  experience         salary  \
0    1-6 Yrs  Not disclosed   
1  11-15 Yrs  Not disclosed   
2    3-7 Yrs  Not disclosed   
3    3-7 Yrs  Not disclosed   
4    5-7 Yrs  Not disclosed   

                                            location  rating        reviews  \
0  Kolkata, Hyderabad/Secunderabad, Pune, Ahmedab...     4.0  39110 Reviews   
1  Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...     4.1  32129 Reviews   
2  Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...     4

In [8]:
# Dataset summary
print("\nDataset summary:")
print(jobs_df.info())


Dataset summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79147 entries, 0 to 79146
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   job_id           78667 non-null  float64
 1   job_role         78667 non-null  object 
 2   company          78666 non-null  object 
 3   experience       77398 non-null  object 
 4   salary           78667 non-null  object 
 5   location         77441 non-null  object 
 6   rating           42948 non-null  float64
 7   reviews          42948 non-null  object 
 8   resposibilities  78647 non-null  object 
 9   posted_on        78667 non-null  object 
 10  job_link         78667 non-null  object 
 11  company_link     78667 non-null  object 
dtypes: float64(2), object(10)
memory usage: 7.2+ MB
None


In [9]:
# Check for missing values
print("\nMissing values in each column:")
print(jobs_df.isnull().sum())


Missing values in each column:
job_id               480
job_role             480
company              481
experience          1749
salary               480
location            1706
rating             36199
reviews            36199
resposibilities      500
posted_on            480
job_link             480
company_link         480
dtype: int64


# **3  Data Cleaning and Preparation**

In [10]:
# Correct column name if necessary (e.g., responsibilities might be resposibilities)
jobs_df.rename(columns={'resposibilities': 'responsibilities'}, inplace=True)

# Handling missing values
jobs_df.dropna(subset=['job_role', 'company', 'posted_on', 'job_link', 'company_link'], inplace=True)
jobs_df['experience'].fillna('Not specified', inplace=True)
jobs_df['location'].fillna('Unknown', inplace=True)
jobs_df['rating'] = pd.to_numeric(jobs_df['rating'], errors='coerce')

# Convert reviews to string first
jobs_df['reviews'] = jobs_df['reviews'].astype(str).str.replace(' Reviews', '').astype(float, errors='ignore')

median_rating = jobs_df['rating'].median()
median_reviews = jobs_df['reviews'].median()
jobs_df['rating'].fillna(median_rating, inplace=True)
jobs_df['reviews'].fillna(median_reviews, inplace=True)
jobs_df['responsibilities'].fillna('Not specified', inplace=True)

# Correcting data types
jobs_df['job_id'] = jobs_df['job_id'].astype(str)
jobs_df['experience'] = jobs_df['experience'].str.strip()
jobs_df['salary'] = jobs_df['salary'].str.strip()
jobs_df['location'] = jobs_df['location'].str.strip()

# Removing duplicates
jobs_df = jobs_df.drop_duplicates()

# Check the dataset again for any remaining missing values
print(jobs_df.isnull().sum())


job_id              0
job_role            0
company             0
experience          0
salary              0
location            0
rating              0
reviews             0
responsibilities    0
posted_on           0
job_link            0
company_link        0
dtype: int64


# **4. Exploratory Data Analysis (EDA)**

### 1.  job_id

In [11]:
# Check for uniqueness of 'job_id'
unique_job_ids = jobs_df['job_id'].nunique()
total_job_ids = jobs_df['job_id'].count()

print(f"Total job_id entries: {total_job_ids}")
print(f"Unique job_id entries: {unique_job_ids}")

# Check for duplicates in 'job_id'
duplicate_job_ids = jobs_df['job_id'].duplicated().sum()
print(f"Number of duplicate job_ids: {duplicate_job_ids}")

# Check for missing values in 'job_id'
missing_job_ids = jobs_df['job_id'].isnull().sum()
print(f"Number of missing job_ids: {missing_job_ids}")

# Remove duplicates if any
if duplicate_job_ids > 0:
    jobs_df = jobs_df.drop_duplicates(subset=['job_id'])
    print("Duplicates removed.")

# Check for consistent length of 'job_id' (optional, based on specific requirements)
job_id_lengths = jobs_df['job_id'].apply(len).unique()
print(f"Unique lengths of job_id: {job_id_lengths}")

# Ensure 'job_id' has no missing values
if missing_job_ids > 0:
    jobs_df = jobs_df.dropna(subset=['job_id'])
    print("Missing job_ids removed.")

# Verify the changes
print("\nVerification after cleaning:")
print(f"Total job_id entries: {jobs_df['job_id'].count()}")
print(f"Unique job_id entries: {jobs_df['job_id'].nunique()}")
print(f"Number of duplicate job_ids: {jobs_df['job_id'].duplicated().sum()}")
print(f"Number of missing job_ids: {jobs_df['job_id'].isnull().sum()}")


Total job_id entries: 73344
Unique job_id entries: 73008
Number of duplicate job_ids: 336
Number of missing job_ids: 0
Duplicates removed.
Unique lengths of job_id: [13 14]

Verification after cleaning:
Total job_id entries: 73008
Unique job_id entries: 73008
Number of duplicate job_ids: 0
Number of missing job_ids: 0


### 2. job_role

In [12]:
# Analyze the distribution of job roles
print("\nTop 10 job roles:")
print(jobs_df['job_role'].value_counts().head(10))

# Standardize job role names (example, actual cleaning may vary)
jobs_df['job_role'] = jobs_df['job_role'].str.strip().str.title()

# Check for unique job roles
print(f"Number of unique job roles: {jobs_df['job_role'].nunique()}")



Top 10 job roles:
job_role
Business Development Executive    302
Sales Executive                   274
Java Developer                    252
Business Development Manager      248
Sales Manager                     158
Senior Software Engineer          144
Graphic Designer                  143
Business Analyst                  137
Software Engineer                 132
Dot Net Developer                 132
Name: count, dtype: int64
Number of unique job roles: 50339


### 3.  company

In [13]:
# EDA on 'company' Column

# Distribution Analysis
print("Company Distribution:")
print(jobs_df['company'].value_counts().head(10))

# Unique Values
unique_companies = jobs_df['company'].nunique()
print(f"Number of unique companies: {unique_companies}")

# Frequency Count
company_counts = jobs_df['company'].value_counts()
print("Frequency of each company (top 10):")
print(company_counts.head(10))

# Cleaning 'company' Column

# Remove leading/trailing spaces and standardize text
jobs_df['company'] = jobs_df['company'].str.strip().str.title()

# Handle Missing Values
missing_companies = jobs_df['company'].isnull().sum()
print(f"Number of missing companies: {missing_companies}")

if missing_companies > 0:
    jobs_df['company'].fillna('Unknown', inplace=True)
    print("Missing companies filled with 'Unknown'.")

# Handle Synonyms/Duplicates (example for merging similar names)
# Create a dictionary for company replacements (example)
company_replacements = {
    'Google Inc': 'Google',
    'Google Llc': 'Google',
    'Microsoft Corporation': 'Microsoft',
    # Add more replacements as needed
}

jobs_df['company'] = jobs_df['company'].replace(company_replacements)

# Verify the changes
print("\nVerification after cleaning:")
print(jobs_df['company'].value_counts().head(10))
print(f"Number of unique companies after cleaning: {jobs_df['company'].nunique()}")
print(f"Number of missing companies after cleaning: {jobs_df['company'].isnull().sum()}")


Company Distribution:
company
Lavya Associates          5128
Accenture                 2928
Hucon                      852
Varite India Pvt. Ltd.     815
IBM                        656
Megma Services             523
Infosys                    505
D'source                   502
RCPC                       444
Tekpillar Services         377
Name: count, dtype: int64
Number of unique companies: 15313
Frequency of each company (top 10):
company
Lavya Associates          5128
Accenture                 2928
Hucon                      852
Varite India Pvt. Ltd.     815
IBM                        656
Megma Services             523
Infosys                    505
D'source                   502
RCPC                       444
Tekpillar Services         377
Name: count, dtype: int64
Number of missing companies: 0

Verification after cleaning:
company
Lavya Associates          5128
Accenture                 2928
Hucon                      852
Varite India Pvt. Ltd.     815
Ibm                        

### 4. experience

In [14]:
# Remove leading/trailing spaces and standardize text
jobs_df['experience'] = jobs_df['experience'].str.strip()

# Handle Missing Values
missing_experience = jobs_df['experience'].isnull().sum()
print(f"Number of missing experience values: {missing_experience}")

if missing_experience > 0:
    jobs_df['experience'].fillna('Not specified', inplace=True)
    print("Missing experience values filled with 'Not specified'.")

# Extract and categorize experience levels
def extract_upper_range(experience):
    match = re.search(r'(\d+)-(\d+)', experience)
    if match:
        return int(match.group(2))
    match = re.search(r'(\d+)\+', experience)
    if match:
        return int(match.group(1))
    return None

# Apply the function to extract the upper range
jobs_df['upper_experience'] = jobs_df['experience'].apply(extract_upper_range)

# Function to categorize experience
def categorize_experience(upper_experience):
    if upper_experience is None:
        return 'Unknown'
    elif upper_experience <= 1:
        return 'Fresher'
    elif upper_experience <= 3:
        return 'Junior'
    elif upper_experience <= 5:
        return 'Medium'
    elif upper_experience <= 10:
        return 'Senior'
    else:
        return 'Expert'

# Apply the function to categorize experience
jobs_df['experience_category'] = jobs_df['upper_experience'].apply(categorize_experience)

# Verify the changes
print("\nVerification after cleaning:")
print(jobs_df['experience_category'].value_counts())
print(f"Number of unique experience categories: {jobs_df['experience_category'].nunique()}")
print(f"Number of missing experience values after cleaning: {jobs_df['experience'].isnull().sum()}")


Number of missing experience values: 0

Verification after cleaning:
experience_category
Senior     36512
Medium     15699
Expert      9775
Junior      8416
Fresher     2606
Name: count, dtype: int64
Number of unique experience categories: 5
Number of missing experience values after cleaning: 0


### 5. salary

In [27]:
# Remove leading/trailing spaces
jobs_df['salary'] = jobs_df['salary'].str.strip()

# Handle Missing Values
missing_salaries = jobs_df['salary'].isnull().sum()
print(f"Number of missing salary values: {missing_salaries}")

if missing_salaries > 0:
    jobs_df['salary'].fillna('Not Disclosed', inplace=True)
    print("Missing salary values filled with 'Not Disclosed'.")

# Extract and clean salary ranges
def extract_upper_salary(salary):
    if salary == "Not Disclosed":
        return None
    else:
        match = re.search(r'-\s*([\d,]+)\s*PA', salary)
        if match:
            upper_salary = match.group(1).replace(',', '')
            return int(upper_salary)
        return None

jobs_df['upper_salary'] = jobs_df['salary'].apply(extract_upper_salary)

# Categorize salaries into bands
def categorize_salary(salary):
    if pd.isna(salary):
        return 'Not Disclosed'
    elif salary < 300000:
        return 'Low'
    elif salary < 600000:
        return 'Medium'
    elif salary < 1000000:
        return 'Good'
    else:
        return 'High'

jobs_df['salary_band'] = jobs_df['upper_salary'].apply(categorize_salary)

# Verify the changes
print("\nVerification after cleaning:")
print(jobs_df['salary_band'].value_counts())

Number of missing salary values: 0

Verification after cleaning:
salary_band
Not Disclosed    46289
Medium           10904
Good             10130
High              8201
Low               3623
Name: count, dtype: int64


### 6. location

In [17]:
# Examine the distribution of job locations
print("\nTop 10 job locations:")
print(jobs_df['location'].value_counts().head(10))

# Standardize location names
jobs_df['location'] = jobs_df['location'].str.strip().str.title()



Top 10 job locations:
location
Bangalore/Bengaluru       9926
Hyderabad/Secunderabad    3728
Pune                      3211
Chennai                   2921
Mumbai                    2813
Gurgaon/Gurugram          2710
Noida                     1664
Permanent Remote          1623
Ahmedabad                 1215
Unknown                   1208
Name: count, dtype: int64


### 7. Rating

In [18]:
# Analyze the distribution of company ratings
print("\nRating distribution:")
print(jobs_df['rating'].describe())



Rating distribution:
count    73008.000000
mean         3.967419
std          0.393600
min          1.000000
25%          4.000000
50%          4.000000
75%          4.000000
max          5.000000
Name: rating, dtype: float64


In [19]:
# Ensure the 'rating' column is numeric
jobs_df['rating'] = pd.to_numeric(jobs_df['rating'], errors='coerce')

# Calculate the median of 'rating'
median_rating = jobs_df['rating'].median()

# Fill missing values with the median rating
jobs_df['rating'].fillna(median_rating, inplace=True)

# Check for missing values in 'rating' to confirm
print("\nMissing values in 'rating' after filling:")
print(jobs_df['rating'].isnull().sum())



Missing values in 'rating' after filling:
0


### 8. reviews

In [20]:
import pandas as pd
import requests
from io import BytesIO
from zipfile import ZipFile
import re

# Step 1: Data Import
url = 'https://github.com/satyam26en/JOB/blob/main/jobs.zip?raw=true'
response = requests.get(url)

if response.status_code == 200:
    zip_file = BytesIO(response.content)
    with ZipFile(zip_file) as z:
        with z.open('jobs.csv') as f:
            jobs_df = pd.read_csv(f)
else:
    raise Exception(f"Failed to download file: {response.status_code}")

# Ensure the reviews column is treated as strings
jobs_df['reviews'] = jobs_df['reviews'].astype(str)

# Step 2: Clean and Extract Numeric Values from the Reviews Column
def extract_review_number(review_str):
    if pd.isna(review_str):
        return None
    numbers = re.findall(r'\d+', str(review_str))
    return int(numbers[0]) if numbers else None

# Extract numeric review values
jobs_df['numeric_reviews'] = jobs_df['reviews'].apply(extract_review_number)

# Display the numeric reviews
numeric_reviews = jobs_df['numeric_reviews'].dropna().unique()

# Sort and display the top 10 numeric reviews
top_ten_reviews = sorted(numeric_reviews, reverse=True)[:10]

top_ten_reviews


[50163.0,
 43672.0,
 39143.0,
 39110.0,
 33480.0,
 32151.0,
 32129.0,
 28658.0,
 28172.0,
 24041.0]

### 9.posted_on

In [21]:
# Display the `posted_on` column
print("Posted On column:")
print(jobs_df['posted_on'].head())


Posted On column:
0    1 Day Ago
1    1 Day Ago
2    1 Day Ago
3    1 Day Ago
4    1 Day Ago
Name: posted_on, dtype: object


In [22]:
# Delete the `posted_on` column
jobs_df.drop(columns=['posted_on'], inplace=True)

# Verify that the column has been deleted
print("Columns after deleting `posted_on`:")
print(jobs_df.columns)


Columns after deleting `posted_on`:
Index(['job_id', 'job_role', 'company', 'experience', 'salary', 'location',
       'rating', 'reviews', 'resposibilities', 'job_link', 'company_link',
       'numeric_reviews'],
      dtype='object')


### 10. job_link and company_link

In [23]:
# Delete the `job_link` and `company_link` columns
jobs_df.drop(columns=['job_link', 'company_link'], inplace=True)

# Verify that the columns have been deleted
print("Columns after deleting `job_link` and `company_link`:")
print(jobs_df.columns)


Columns after deleting `job_link` and `company_link`:
Index(['job_id', 'job_role', 'company', 'experience', 'salary', 'location',
       'rating', 'reviews', 'resposibilities', 'numeric_reviews'],
      dtype='object')


# 5.

In [24]:
# Save the cleaned DataFrame to a CSV file in the current directory
cleaned_file_path = 'cleaned_jobs.csv'
jobs_df.to_csv(cleaned_file_path, index=False)

cleaned_file_path


'cleaned_jobs.csv'