In [1]:
!pip install requests pandas beautifulsoup4 



In [2]:
# Import relevant packages
import pandas as pd
import requests
import random
import re
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

In [3]:
# CSV of LinkedIn job postings to enrich. The preview printed below shows one row
# per posting with the columns Location, Title, Company, Url, JobID and Category.
file_name = "LinkedIn_Jobs_Data_Scientist_Monterrey_2024-09-04_clean.csv"
df_jobs = pd.read_csv(file_name)
# Print the first rows as a quick sanity check that the file loaded.
print(df_jobs.head())

    Location                                        Title           Company  \
0  Monterrey                           Jr. Data Scientist  Arca Continental   
1  Monterrey  ML Engineer (Engineer Software Development)            NEORIS   
2  Monterrey                                  AI Engineer            NEORIS   
3  Monterrey                               Data Scientist             Chubb   
4  Monterrey               AI/ML and MLOps Field Engineer         Canonical   

                                                 Url       JobID      Category  
0  https://mx.linkedin.com/jobs/view/jr-data-scie...  4002846143  Data Science  
1  https://mx.linkedin.com/jobs/view/ml-engineer-...  4002146229  Data Science  
2  https://mx.linkedin.com/jobs/view/ai-engineer-...  3984233060         AI/ML  
3  https://mx.linkedin.com/jobs/view/data-scienti...  3987318831  Data Science  
4  https://mx.linkedin.com/jobs/view/ai-ml-and-ml...  4013780012  Data Science  


In [4]:
def get_random_user_agent():
    """Return a request-headers dict carrying a randomly picked User-Agent.

    The pool holds five entries; the desktop-Chrome and Android-Chrome strings
    each appear twice, so they are drawn twice as often as plain 'Mozilla/5.0'.

    Returns:
        dict: ``{'User-Agent': <one of the pooled strings>}``.
    """
    user_agents = (
        'Mozilla/5.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    )
    return {'User-Agent': random.choice(user_agents)}

In [5]:
def fetch_jobs_until_success(url, max_attempts=8, timeout=30, base_delay=1.0):
    """GET ``url`` repeatedly until the server answers with HTTP 200.

    Each attempt uses a freshly chosen User-Agent. Between attempts the function
    sleeps with exponential backoff plus jitter, so a throttling or failing
    server is not hit in a tight loop, and it gives up after ``max_attempts``
    instead of spinning forever on a URL that will never succeed.

    Args:
        url: Address to fetch.
        max_attempts: Maximum number of requests before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        base_delay: Delay in seconds before the second attempt; doubled after
            every further failure and capped at 30 seconds.

    Returns:
        requests.Response: The first response whose status code is 200.

    Raises:
        RuntimeError: If no attempt produced a 200 response. The last network
            error or HTTP status is attached as the exception's cause.
    """
    import time  # local import: `time` is not among the notebook's top-level imports

    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=get_random_user_agent(), timeout=timeout)
        except requests.RequestException as exc:
            # Transient network problems (timeouts, resets) are retried as well.
            last_error = exc
        else:
            if response.status_code == 200:
                return response
            last_error = RuntimeError(f"HTTP {response.status_code} for {url}")

        if attempt < max_attempts:
            # Exponential backoff with up to one second of jitter.
            delay = min(base_delay * 2 ** (attempt - 1), 30)
            time.sleep(delay + random.uniform(0, 1))

    raise RuntimeError(
        f"No successful response from {url} after {max_attempts} attempts"
    ) from last_error

In [6]:
def get_jobid_information(jobid):
    """Build the LinkedIn guest-API URL for a single job posting.

    Args:
        jobid: Job identifier. Strings and integers (including NumPy integers
            read from a DataFrame column) are accepted; surrounding whitespace
            is ignored.

    Returns:
        str: The job-posting endpoint followed by the job id.
    """
    # Base URL of LinkedIn's guest job-posting endpoint
    base_url = 'https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/'

    # str() lets callers pass the id exactly as stored in the frame (an integer)
    return f"{base_url}{str(jobid).strip()}"

In [7]:
# Label shown in the posting's criteria list -> output column it fills.
# Order matters: the first label found in an item's text wins.
_CRITERIA_FIELDS = (
    ('Seniority level', 'SeniorityLevel'),
    ('Employment type', 'EmploymentType'),
    ('Job function', 'JobFunction'),
    ('Industries', 'Industries'),
)


def _parse_job_posting(soup):
    """Extract the fields of interest from one parsed job-posting page.

    Args:
        soup: BeautifulSoup document of a single job posting.

    Returns:
        dict: Keys SeniorityLevel, EmploymentType, JobFunction, Industries,
        PostedTime, NumApplicants and Description. Any field whose element is
        missing from the page is set to 'N/A'.
    """
    record = {
        'SeniorityLevel': 'N/A',
        'EmploymentType': 'N/A',
        'JobFunction': 'N/A',
        'Industries': 'N/A',
    }

    # Criteria list: each <li> holds a label plus its value in a <span>
    criteria_list = soup.find('ul', class_='description__job-criteria-list')
    if criteria_list is not None:
        for item in criteria_list.find_all('li', class_='description__job-criteria-item'):
            item_text = item.get_text()
            for label, field in _CRITERIA_FIELDS:
                if label in item_text:
                    value_tag = item.find('span', class_='description__job-criteria-text')
                    # Keep 'N/A' when the label is present but its value span is not
                    if value_tag is not None:
                        record[field] = value_tag.get_text(strip=True)
                    break

    # Posted time
    posted_tag = soup.find('span', class_='posted-time-ago__text')
    record['PostedTime'] = posted_tag.get_text(strip=True) if posted_tag is not None else 'N/A'

    # Number of applicants: the caption is rendered either as a <figcaption> or as a <span>
    applicants_tag = (
        soup.find('figcaption', class_='num-applicants__caption')
        or soup.find('span', class_='num-applicants__caption topcard__flavor--metadata topcard__flavor--bullet')
    )
    record['NumApplicants'] = applicants_tag.get_text(strip=True) if applicants_tag is not None else 'N/A'

    # Job description text
    description_tag = soup.find('div', class_='show-more-less-html__markup')
    record['Description'] = (
        description_tag.get_text(separator=' ', strip=True) if description_tag is not None else 'N/A'
    )
    return record


extracted_data = []
# Iterate over the values directly so the loop does not depend on the frame
# having a default 0..n-1 index.
for job_id in df_jobs['JobID']:
    target_url = get_jobid_information(str(job_id))
    response = fetch_jobs_until_success(target_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    extracted_data.append(_parse_job_posting(soup))

# Convert the extracted data into a DataFrame aligned row-for-row with df_jobs
extracted_df = pd.DataFrame(extracted_data, index=df_jobs.index)

# Combine with the original dataframe. Columns left over from a previous run of
# this cell are dropped first so that re-running does not create duplicates.
df_jobs = pd.concat(
    [df_jobs.drop(columns=extracted_df.columns, errors='ignore'), extracted_df],
    axis=1,
)
print("Done!")

Done!


In [8]:
# Cast 'Category' to a categorical dtype limited to the four listed job families
categories = ['AI/ML', 'Data Science', 'Data Engineering', 'Data Analysis']
df_jobs['Category'] = df_jobs['Category'].astype(
    pd.CategoricalDtype(categories=categories)
)

In [9]:
# Clean and standardize the number of applicants
def extract_num_applicants(text):
    """Pull the applicant count out of a LinkedIn applicants caption.

    The first number in the caption is returned, so "198 applicants" gives 198,
    "Be among the first 25 applicants" gives 25 and "Over 200 applicants" gives
    200. Thousands separators ("1,234") are handled.

    Args:
        text: Caption text. Non-string values (None, NaN, or a count that was
            already converted) are accepted and coerced with ``str``.

    Returns:
        int | str: The parsed count, or 'N/A' when the value holds no digits.
    """
    match = re.search(r'\d[\d,]*', str(text))
    if match is None:
        return 'N/A'
    return int(match.group().replace(',', ''))

# Replace each scraped applicants caption with the value extract_num_applicants returns
df_jobs['NumApplicants'] = df_jobs['NumApplicants'].map(extract_num_applicants)

In [10]:
# Collapse any seniority value containing "Not Applicable" into the 'N/A' marker
df_jobs['SeniorityLevel'] = df_jobs['SeniorityLevel'].map(
    lambda level: level if 'Not Applicable' not in level else 'N/A'
)

In [11]:
# Known seniority levels, kept in their original order.
categories = ['Entry level', 'Mid-Senior level', 'Executive', 'N/A', 'Associate',
       'Internship']
# pd.Categorical turns every value missing from `categories` into NaN, so any
# additional level present in the data is appended rather than silently lost.
categories = categories + [
    level for level in df_jobs['SeniorityLevel'].dropna().unique()
    if level not in categories
]
df_jobs['SeniorityLevel'] = pd.Categorical(df_jobs['SeniorityLevel'], categories=categories)

In [12]:
# Make 'EmploymentType' categorical, using the column's own distinct values as categories
df_jobs['EmploymentType'] = df_jobs['EmploymentType'].astype(
    pd.CategoricalDtype(categories=df_jobs['EmploymentType'].unique())
)

In [13]:
# Swap two specific job-function values for shorter labels (whole-value matches)
df_jobs['JobFunction'] = df_jobs['JobFunction'].replace(
    to_replace={
        'Design and Product Management': 'Product Management',
        'Research and Design': 'R&D',
    }
)

In [14]:
# Standardize job functions
def standardize_job_function(text, max_functions=3):
    """Rewrite a job-function string as a clean ', '-separated list.

    Elements may be separated by ",", " and " or ", and " (Oxford comma), so
    "A and B" and "A, B, and C" both become plain comma-separated lists with no
    stray commas left on any element.

    Args:
        text: Raw job-function string, e.g. "Engineering and Information Technology".
        max_functions: Maximum number of elements to keep (the first ones).

    Returns:
        str: Up to ``max_functions`` elements joined with ', '.
    """
    # One pattern covers ", and ", "," and " and "; the comma alternative is tried
    # first so an Oxford comma is consumed together with its "and".
    parts = re.split(r'\s*,\s*(?:and\s+)?|\s+and\s+', text)

    # Drop empty fragments left by doubled or trailing separators
    job_functions = [part.strip() for part in parts if part.strip()]

    return ', '.join(job_functions[:max_functions])

# Run standardize_job_function over every value of the JobFunction column
df_jobs['JobFunction'] = df_jobs['JobFunction'].map(standardize_job_function)

In [15]:
def split_job_functions(text):
    """Split a ', '-separated job-function string into exactly three slots.

    Args:
        text: Job functions joined with ', '.

    Returns:
        pd.Series: Three entries holding the first three job functions in
        order; slots with no corresponding function are None.
    """
    slots = text.split(', ')[:3]
    # Pad on the right so the result always has three entries
    slots.extend([None] * (3 - len(slots)))
    return pd.Series(slots)

# Apply the splitting function and assign new columns.
# split_job_functions returns a three-element pd.Series per row, so .apply yields a
# three-column DataFrame whose columns are assigned, in order, to the new columns.
df_jobs[['JobFunction1', 'JobFunction2', 'JobFunction3']] = df_jobs['JobFunction'].apply(split_job_functions)

In [16]:
job_function_columns = ['JobFunction1', 'JobFunction2', 'JobFunction3']

# Step 1: Mark empty slots with 'N/A' so that no missing values remain
for column in job_function_columns:
    df_jobs[column] = df_jobs[column].fillna('N/A')

# Step 2: Collect the distinct values found across the three columns. Sorting
# gives the same category order on every run (plain set iteration order for
# strings can differ between kernel sessions).
job_function_categories = sorted(
    set().union(*(df_jobs[column].unique() for column in job_function_columns))
)

# Step 3: Give the three columns one shared categorical dtype
for column in job_function_columns:
    df_jobs[column] = pd.Categorical(df_jobs[column], categories=job_function_categories)

# Drop the combined 'JobFunction' column now that it has been split.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
df_jobs = df_jobs.drop(columns=['JobFunction'], errors='ignore')

In [17]:
def convert_posted_time(text):
    """Convert a relative "posted ... ago" caption into a number of days.

    Seconds, minutes and hours count as 0 days; days are used as-is; weeks,
    months and years are converted with 7, 30 and 365 days respectively. When
    the caption names a unit but contains no number, a count of 1 is assumed.

    Args:
        text: Caption such as "3 days ago" or "2 weeks ago".

    Returns:
        int: Days since posting when a time unit is recognised. Otherwise the
        input is returned unchanged (e.g. 'N/A', or a value that is not a
        string because it was already converted).
    """
    if not isinstance(text, str):
        return text

    # (unit keyword, days per unit); sub-day units all map to 0 days
    units = (
        ('second', 0),
        ('minute', 0),
        ('hour', 0),
        ('day', 1),
        ('week', 7),
        ('month', 30),
        ('year', 365),
    )

    lowered = text.lower()
    number = re.search(r'\d+', lowered)
    count = int(number.group()) if number else 1  # Default to 1 if no number

    for unit, days_per_unit in units:
        if unit in lowered:
            return count * days_per_unit

    return text

# Replace each posted-time caption with the value convert_posted_time returns
df_jobs['PostedTime'] = df_jobs['PostedTime'].map(convert_posted_time)

In [18]:
# The column now holds day counts, so give it a name that says so
df_jobs = df_jobs.rename(columns={'PostedTime': 'DaysSincePosted'})

In [19]:
# Rename 'Category' to 'JobCategory'
df_jobs = df_jobs.rename(columns={'Category': 'JobCategory'})

In [22]:
# Final column layout: identifying fields first, then posting attributes,
# the three job-function slots, and finally the long text and link columns.
new_column_order = [
    'Title',
    'Company',
    'Location',
    'JobID',
    'JobCategory',
    'SeniorityLevel',
    'EmploymentType',
    'Industries',
    'DaysSincePosted',
    'NumApplicants',
    'JobFunction1',
    'JobFunction2',
    'JobFunction3',
    'Description',
    'Url',
]
df_jobs = df_jobs.loc[:, new_column_order]

In [24]:
# Display the first row to check the final column layout and values.
df_jobs.head(1)

Unnamed: 0,Title,Company,Location,JobID,JobCategory,SeniorityLevel,EmploymentType,Industries,DaysSincePosted,NumApplicants,JobFunction1,JobFunction2,JobFunction3,Description,Url
0,Jr. Data Scientist,Arca Continental,Monterrey,4002846143,Data Science,Associate,Full-time,Food and Beverage Services,14,198,Information Technology,Sales,,Nuestra compañía Arca Continental es una empre...,https://mx.linkedin.com/jobs/view/jr-data-scie...


In [23]:
# Derive the output name from the input file so the date in the name always matches
# the data being written (the previous hard-coded name carried a 2024-08-15 date
# while the input loaded above is dated 2024-09-04).
input_suffix = "_clean.csv"
if file_name.endswith(input_suffix):
    output_stem = file_name[:-len(input_suffix)]
else:
    # Unexpected input name: strip only the extension so the input is never overwritten
    output_stem = file_name.rsplit(".", 1)[0]
output_file_name = f"{output_stem}_FullInfo.csv"

# utf-8-sig prepends a BOM (presumably so spreadsheet tools detect the encoding)
df_jobs.to_csv(output_file_name, index=False, encoding='utf-8-sig')