In [1]:
# Install necessary libraries if they are not present
!pip install requests
!pip install beautifulsoup4
!pip install pyarrow
!pip install pandas



In [2]:
# Import relevant packages
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import random
import math

In [3]:
# Option name -> value sent as the `f_TPR` query parameter.
# The numeric parts equal 30, 7 and 1 day(s) expressed in seconds; 'ALL' maps to an empty value.
time_posted_dict = dict([
    ('ALL', ''),
    ('MONTH', 'r2592000'),
    ('WEEK', 'r604800'),
    ('DAY', 'r86400'),
])

# Option name -> value sent as the `f_WT` query parameter; 'ALL' maps to an empty value.
remote_dict = dict([
    ('ALL', ''),
    ('ON-SITE', '1'),
    ('REMOTE', '2'),
    ('HYBRID', '3'),
])

In [4]:
def get_random_user_agent():
    """Pick one request-header dict at random.

    The candidate list holds each User-Agent string once, so every string is
    equally likely to be chosen (duplicated entries would skew the choice).

    Returns:
        dict: A fresh single-key mapping ``{'User-Agent': <string>}`` suitable
        for the ``headers`` argument of ``requests.get``.
    """
    user_agents = (
        'Mozilla/5.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36',
    )

    return {'User-Agent': random.choice(user_agents)}

In [5]:
def generate_main_linkedin_url(position, location, distance=10, time_posted='ALL', remote='ALL'):
    """Build the URL of the LinkedIn job-search results page.

    Args:
        position: Search keywords, given as plain (not pre-encoded) text.
        location: Location text, given as plain (not pre-encoded) text.
        distance: Value for the ``distance`` parameter; omitted when falsy.
        time_posted: Key of ``time_posted_dict``; the parameter is omitted when
            falsy, and an unknown key is sent as an empty value.
        remote: Key of ``remote_dict``; same rules as ``time_posted``.

    Returns:
        str: The full search URL.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # Base URL for LinkedIn job search
    base_url = 'https://www.linkedin.com/jobs/search/'

    # Percent-encode both free-text values. safe='' also escapes '&', '#', '/'
    # and non-ASCII characters, which would otherwise corrupt the query string.
    url_friendly_position = quote(position, safe='')
    url_friendly_location = quote(str(location), safe='')

    # Construct the query parameters
    query_params = f'?keywords={url_friendly_position}&location={url_friendly_location}'

    if distance:
        query_params += f'&distance={distance}'
    if time_posted:
        time_posted_value = time_posted_dict.get(time_posted, '')
        query_params += f'&f_TPR={time_posted_value}'
    if remote:
        remote_value = remote_dict.get(remote, '')
        query_params += f'&f_WT={remote_value}'

    # Combine base URL with query parameters
    return base_url + query_params

In [6]:
def get_url_next_10_positions(position, location,start_position, distance=10, time_posted='ALL', remote='ALL'):
    """Build the URL of the guest "see more job postings" endpoint for one results page.

    Args:
        position: Search keywords, given as plain (not pre-encoded) text.
        location: Location text, given as plain (not pre-encoded) text.
        start_position: Offset sent as the ``start`` parameter.
        distance: Value for the ``distance`` parameter; omitted when falsy.
        time_posted: Key of ``time_posted_dict``; the parameter is omitted when
            falsy, and an unknown key is sent as an empty value.
        remote: Key of ``remote_dict``; same rules as ``time_posted``.

    Returns:
        str: The full request URL.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # Base URL for LinkedIn job search
    base_url = 'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search'

    # Percent-encode both free-text values. safe='' also escapes '&', '#', '/'
    # and non-ASCII characters, which would otherwise corrupt the query string.
    url_friendly_position = quote(position, safe='')
    url_friendly_location = quote(str(location), safe='')

    # Construct the query parameters
    query_params = f'?keywords={url_friendly_position}&location={url_friendly_location}'

    if distance:
        query_params += f'&distance={distance}'
    if time_posted:
        time_posted_value = time_posted_dict.get(time_posted, '')
        query_params += f'&f_TPR={time_posted_value}'
    if remote:
        remote_value = remote_dict.get(remote, '')
        query_params += f'&f_WT={remote_value}'
    query_params += f'&position=1&pageNum=0&start={start_position}'

    # Combine base URL with query parameters
    return base_url + query_params

In [11]:
# Search parameters for this run.
position, location = 'Data Scientist', 'Monterrey'
time_posted = remote = 'ALL'

# Request headers with a randomly chosen User-Agent.
header = get_random_user_agent()

# URL of the main search results page for the parameters above.
main_url = generate_main_linkedin_url(
    position,
    location,
    time_posted=time_posted,
    remote=remote,
)

Using header: {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Mobile Safari/537.36'}


In [12]:
def fetch_jobs_until_success(url, headers, max_attempts=10, timeout=30, backoff_seconds=1.0):
    """GET ``url`` repeatedly until the server answers with HTTP 200.

    Each attempt has a timeout, failed attempts are followed by an
    exponentially growing pause, and the number of attempts is bounded so a
    persistent error status cannot spin forever.

    Args:
        url: Address to request.
        headers: Header dict passed to ``requests.get``.
        max_attempts: Maximum number of requests before giving up.
        timeout: Per-request timeout in seconds passed to ``requests.get``.
        backoff_seconds: Pause after the first failure; doubled after each
            further failure and capped at 30 seconds.

    Returns:
        requests.Response: The first response whose status code is 200.

    Raises:
        RuntimeError: If no attempt produced a 200 response.
    """
    import time  # local import keeps this cell self-contained

    last_problem = 'no attempt made'
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Network-level failure (timeout, connection error, ...): retry.
            last_problem = repr(exc)
        else:
            if response.status_code == 200:
                return response
            last_problem = f'HTTP {response.status_code}'

        if attempt < max_attempts:
            time.sleep(min(backoff_seconds * 2 ** (attempt - 1), 30))

    raise RuntimeError(
        f'No HTTP 200 response from {url} after {max_attempts} attempts (last problem: {last_problem})'
    )

In [13]:
# Fetch the main results page and read the total number of matching jobs from it.
response = fetch_jobs_until_success(main_url, header)
soup = BeautifulSoup(response.text, 'html.parser')

count_element = soup.find('span', {'class': 'results-context-header__job-count'})
if count_element is None:
    raise ValueError('Could not find the job count element on the search results page.')

# Keep only the decimal digits so that a formatted count (thousands separators,
# a trailing "+", surrounding whitespace) still converts to an integer.
count_digits = ''.join(ch for ch in count_element.text if ch.isdecimal())
if not count_digits:
    raise ValueError(f'Job count element does not contain a number: {count_element.text!r}')

all_jobs = int(count_digits)
print(f'There are a total of {all_jobs} jobs that will be scraped based on the given conditions.')

There are a total of 533 jobs that will be scraped based on the given conditions.


In [14]:
def _text_or_na(parent, tag, css_class):
    """Return the stripped text of the first matching descendant of `parent`, or 'N/A'."""
    element = parent.find(tag, class_=css_class) if parent is not None else None
    return element.text.strip() if element is not None else 'N/A'


def _parse_job_card(job):
    """Turn one `<li>` element into a record dict; any missing field becomes 'N/A'."""
    info = job.find('div', class_="base-search-card__info")
    metadata = job.find('div', class_="base-search-card__metadata")
    joburl_element = job.find('a', class_="base-card__full-link")

    return {
        'Location': _text_or_na(metadata, 'span', "job-search-card__location"),
        'Title': _text_or_na(info, 'h3', "base-search-card__title"),
        'Company': _text_or_na(info, 'h4', "base-search-card__subtitle"),
        # .get avoids a KeyError when the anchor has no href attribute.
        'Url': joburl_element.get('href', 'N/A') if joburl_element is not None else 'N/A',
    }


# Request the results in steps of 10 and collect one record per <li> element.
jobs = []
total_pages = math.ceil(all_jobs / 10)
for current_page, start_position in enumerate(range(0, all_jobs, 10), start=1):

    target_url = get_url_next_10_positions(position, location, start_position, time_posted=time_posted, remote=remote)

    response = fetch_jobs_until_success(target_url, header)

    print(f"Parsing data for page: {current_page}/{total_pages}")

    soup = BeautifulSoup(response.content, 'html.parser')

    for job in soup.find_all('li'):
        try:
            jobs.append(_parse_job_card(job))
        except Exception as e:
            # Best effort: report the problem and keep going with the next element.
            print(f"Error processing job: {e}")
            continue

Parsing data for page: 1/54
Parsing data for page: 2/54
Parsing data for page: 3/54
Parsing data for page: 4/54
Parsing data for page: 5/54
Parsing data for page: 6/54
Parsing data for page: 7/54
Parsing data for page: 8/54
Parsing data for page: 9/54
Parsing data for page: 10/54
Parsing data for page: 11/54
Parsing data for page: 12/54
Parsing data for page: 13/54
Parsing data for page: 14/54
Parsing data for page: 15/54
Parsing data for page: 16/54
Parsing data for page: 17/54
Parsing data for page: 18/54
Parsing data for page: 19/54
Parsing data for page: 20/54
Parsing data for page: 21/54
Parsing data for page: 22/54
Parsing data for page: 23/54
Parsing data for page: 24/54
Parsing data for page: 25/54
Parsing data for page: 26/54
Parsing data for page: 27/54
Parsing data for page: 28/54
Parsing data for page: 29/54
Parsing data for page: 30/54
Parsing data for page: 31/54
Parsing data for page: 32/54
Parsing data for page: 33/54
Parsing data for page: 34/54
Parsing data for page: 

In [111]:
# Build the table from the scraped records, turn the "N/A" placeholders into
# missing values, and drop every row that has a missing field.
df_jobs = (
    pd.DataFrame(jobs, columns=['Location', 'Title', 'Company', 'Url'])
    .replace("N/A", pd.NA)
    .dropna()
)

In [112]:
# Keep only the part of each location that precedes the first comma.
df_jobs['Location'] = df_jobs['Location'].str.split(',').str[0]

# Show the distinct location names that are left.
unique_locations = df_jobs['Location'].unique()
print(unique_locations)

['Monterrey' 'San Pedro Garza García' 'San Nicolás de Los Garza'
 'Centro de San Pedro Garza García' 'Garza García' 'Polanco'
 'Santa Catarina' 'San Nicolás de los Garza' 'Guadalupe' 'Villa de García'
 'Monterrey Metropolitan Area']


In [113]:
# Short name -> every raw spelling that is renamed to it.
# NOTE(review): 'Garza García' may be a short form of 'San Pedro Garza García';
# confirm that renaming it to 'García' (rather than 'San Pedro') is intended.
_location_variants = {
    'San Pedro': ['San Pedro Garza García', 'Centro de San Pedro Garza García'],
    'Monterrey': ['Monterrey Metropolitan Area'],
    'San Nicolás': ['San Nicolás de los Garza', 'San Nicolás de Los Garza'],
    'García': ['Garza García', 'Villa de García'],
    'Santa Catarina': ['Santa Catarina'],
    'Guadalupe': ['Guadalupe'],
    'Polanco': ['Polanco'],
}

# Flat lookup: raw spelling -> short name.
location_mapping = {
    raw_name: short_name
    for short_name, raw_names in _location_variants.items()
    for raw_name in raw_names
}


def rename_location(location):
    """Return the short name for `location`, or `location` unchanged when it has no mapping."""
    if location in location_mapping:
        return location_mapping[location]
    return location

In [114]:
# Replace every raw location with its short name.
df_jobs['Location'] = df_jobs['Location'].map(rename_location)

# Show the distinct locations after renaming.
unique_locations = df_jobs['Location'].unique()
print(unique_locations)

['Monterrey' 'San Pedro' 'San Nicolás' 'García' 'Polanco' 'Santa Catarina'
 'Guadalupe']


In [115]:
# Keep only the rows whose location is in the list below.
# .copy() makes the filtered result an independent frame, so assigning columns
# to it later does not trigger pandas' SettingWithCopyWarning.
filtered_location_list = ['Monterrey', 'San Pedro']
df_jobs = df_jobs[df_jobs['Location'].isin(filtered_location_list)].copy()

In [116]:
def truncate_url(url):
    """Return `url` cut just before its first '?position'; unchanged if that marker is absent."""
    head, _marker, _tail = url.partition('?position')
    return head
# Shorten every job URL with truncate_url.
df_jobs['Url'] = df_jobs['Url'].map(truncate_url)

In [117]:
# Remove rows that repeat the same Location/Title/Company combination,
# then renumber the index from zero.
df_jobs = df_jobs.drop_duplicates(subset=['Location', 'Title', 'Company'])
df_jobs = df_jobs.reset_index(drop=True)

In [118]:
# Category name -> keywords that assign a job title to it.
# Categories are tried in the order written and the first match wins, so a
# keyword repeated in a later category (e.g. 'Machine Learning' under 'AI/ML')
# is never the one that decides.
categories = {
    'Data Engineering': ['Data Engineer', 'ETL', 'Data Platform', 'Data Pipeline', 
                         'Database Engineer', 'Big Data', 'Hadoop', 'Spark', 
                         'Databricks', 'Data Integration', 'Data Warehouse',
                         'Data Monitoring', 'Data Governance and Management'],
    
    'Data Analysis': ['Data Analyst', 'Business Intelligence', 'BI', 'Data Visualization', 
                      'Data Reporting', 'SQL', 'Data Metrics', 'Analytics', 'Data Insights',
                      'Analítica de Datos', 'Data Operations Analyst'],
    
    'Data Science': ['Data Scientist', 'Machine Learning', 'ML', 'Statistical Analysis', 
                     'Predictive Modeling', 'Data Modeling', 'Deep Learning', 
                     'Algorithm', 'Statistical', 'Data Science Analyst'],
    
    'AI/ML': ['AI', 'Artificial Intelligence', 'Machine Learning', 'ML', 'Neural Networks', 
              'Deep Learning', 'AI Engineer', 'AI/ML Engineer', 'MLOps', 'Model Training',
              'IA'],
    
    'Software Engineering': ['Software Engineer', 'Developer', 'Backend Developer', 
                              'Frontend Developer', 'Fullstack Developer', 'Software Development', 
                              'Programming', 'App Developer', 'Application Developer', 
                              'DevOps', 'Desarrollador']
}

def get_category(title):
    """Return the first category whose keywords match `title`, or 'Other'.

    Matching is case-insensitive. Keywords written entirely in upper case
    (acronyms such as 'AI', 'IA', 'BI', 'ML') must appear as a whole word, so
    they no longer match inside unrelated words ('Retail', 'Financial',
    'Mobile'). All other keywords match as substrings, so plural and suffixed
    forms ('Developers', 'Data Engineering') still match.

    Args:
        title: Job title text.

    Returns:
        str: A key of `categories`, or 'Other' when nothing matches.
    """
    import re  # local import keeps this cell self-contained

    title_lower = title.lower()
    for category, keywords in categories.items():
        for keyword in keywords:
            needle = keyword.lower()
            if keyword.isupper():
                # Acronym: require non-word characters (or the string edge) on both sides.
                matched = re.search(r'(?<!\w)' + re.escape(needle) + r'(?!\w)', title_lower) is not None
            else:
                matched = needle in title_lower
            if matched:
                return category
    return 'Other'


In [119]:
# Derive a 'Category' for every row from its 'Title' and show the distinct values.
df_jobs['Category'] = df_jobs['Title'].map(get_category)
x = df_jobs['Category'].unique()
print(x)

['AI/ML' 'Data Science' 'Software Engineering' 'Data Analysis' 'Other'
 'Data Engineering']


In [121]:
# Discard the rows that belong to any of these categories, then renumber the index.
categories_to_drop = ['Other', 'Software Engineering','Data Engineering']
is_dropped_category = df_jobs['Category'].isin(categories_to_drop)
df_jobs = df_jobs[~is_dropped_category].reset_index(drop=True)

# Show the categories that remain.
x = df_jobs['Category'].unique()
print(x)

['AI/ML' 'Data Science' 'Data Analysis']


In [122]:
# Export DataFrame to CSV
date = datetime.datetime.now().strftime('%Y-%m-%d')
# Build the file-name form under its own name: overwriting `position` would
# change the search keyword seen by earlier cells if they are run again.
position_for_file = position.replace(" ", "_")
file_name = f'LinkedIn_{position_for_file}_{location}_{date}.csv'
df_jobs.to_csv(file_name, index=False)