### Themes/Positions to scrape
* Programmers and software development professionals
* IT business analysts, architects and system designers
* Finance and investment analysts and advisers
* IT managers

In [None]:
import re
import json
import requests
from urllib.parse import urlencode

# Function to generate the Indeed search URL based on keyword, location, and offset
def get_indeed_search_url(keyword, location, offset=0):
    """Return the Indeed UK job-search URL for one page of results.

    Args:
        keyword: Search term, sent as the ``q`` query parameter.
        location: Place to search in, sent as the ``l`` query parameter.
        offset: Value of the ``start`` query parameter (defaults to 0).

    Returns:
        The full URL as a string, with all parameters URL-encoded and
        ``filter`` fixed to 0.
    """
    query_string = urlencode(
        [("q", keyword), ("l", location), ("filter", 0), ("start", offset)]
    )
    return f"https://www.indeed.co.uk/jobs?{query_string}"

# Browser-like User-Agent string sent with every request
_IPAD_USER_AGENT = (
    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148"
)
headers = {"User-Agent": _IPAD_USER_AGENT}

# Accumulates one dict per scraped job posting
jobs_data_list = []

# Search keywords grouped by theme; dict order defines the order of job_titles
_TITLES_BY_THEME = {
    "Programmers and Software Development Professionals": [
        'software engineer',
        'java developer',
        'python developer',
        'c# developer',
        'mobile app developer',
        'embedded software engineer',
        'web developer',
        'front end developer',
        'backend developer',
        'full stack developer',
        'game developer',
        'ui/ux developer',
        'software architect',
        'devops engineer',
        'system engineer',
        'application developer',
        'cloud software engineer',
        'software development manager',
        'video game designer',
        'graphics programmer',
    ],
    "IT Business Analysts, Architects, and System Designers": [
        'it business analyst',
        'systems analyst',
        'data architect',
        'it architect',
        'solution architect',
        'enterprise architect',
        'technical architect',
        'data analyst',
        'business systems analyst',
        'business intelligence analyst',
        'network architect',
        'cloud architect',
        'it systems designer',
        'functional analyst',
        'systems integrator',
        'erp analyst',
        'scrum master',
        'agile coach',
        'requirements engineer',
        'compliance analyst',
    ],
    "Finance and Investment Analysts and Advisers": [
        'finance analyst',
        'investment analyst',
        'risk analyst',
        'financial planner',
        'financial consultant',
        'portfolio manager',
        'equity analyst',
        'asset manager',
        'credit analyst',
        'wealth advisor',
        'economic analyst',
        'insurance analyst',
        'commodities analyst',
        'forex market analyst',
        'investment banker',
        'financial auditor',
        'regulatory affairs analyst',
        'budget analyst',
        'tax advisor',
        'personal finance advisor',
    ],
    "IT Managers": [
        'it manager',
        'it project manager',
        'it service manager',
        'it operations manager',
        'it infrastructure manager',
        'chief information officer',
        'technology manager',
        'it security manager',
        'it network manager',
        'information systems manager',
        'data center manager',
        'it compliance manager',
        'it quality assurance manager',
        'it support manager',
        'cybersecurity manager',
        'it procurement manager',
        'it strategy manager',
        'it program manager',
        'it incident manager',
        'chief technology officer',
    ],
    "AI-related Job Titles": [
        'ai engineer',
        'machine learning engineer',
        'data scientist',
        'ai research scientist',
        'computer vision engineer',
        'nlp engineer',
        'robotics engineer',
        'ai software developer',
        'deep learning engineer',
        'ai product manager',
        'ai consultant',
        'ai application developer',
        'ai system designer',
        'ethical ai specialist',
        'ai integration specialist',
        'machine learning consultant',
        'speech recognition specialist',
        'ai data engineer',
        'quantum computing scientist',
        'ai business developer',
    ],
    "ML-related Job Titles": [
        'machine learning scientist',
        'applied machine learning engineer',
        'ml operations engineer',
        'ai/ml researcher',
        'quantitative researcher',
        'algorithm developer',
        'statistical learning scientist',
        'ml infrastructure engineer',
        'data mining engineer',
        'predictive modeler',
        'machine learning analyst',
        'deep learning architect',
        'ml model developer',
        'machine learning product manager',
        'natural language processing engineer',
        'reinforcement learning engineer',
        'big data machine learning engineer',
        'neural network engineer',
        'computational linguist',
        'biometrics engineer',
    ],
}

# Flat list of every search keyword, theme after theme
job_titles = [
    title for theme_titles in _TITLES_BY_THEME.values() for title in theme_titles
]

# The 11 biggest UK cities, used as search locations
location_list = [
    'London',
    'Birmingham',
    'Manchester',
    'Glasgow',
    'Leeds',
    'Sheffield',
    'Bradford',
    'Edinburgh',
    'Liverpool',
    'Bristol',
    'Cardiff',
]

# Job keys already stored, used to skip duplicate postings
unique_job_keys = set()

# Crawl Indeed location by location, cycling through every keyword so the
# requests are spread across search terms. New postings are appended to
# jobs_data_list; unique_job_keys records every jobkey already stored so the
# same posting is never saved twice.

# Keep sweeping the keyword list for a location until this many jobs are stored
MIN_JOBS_PER_LOCATION = 10
# Move on to the next keyword once the location has at least this many jobs
JOBS_BEFORE_NEXT_KEYWORD = 1
# Upper bound on result pages requested for one keyword/location pair, so
# pagination cannot run forever when pages keep returning already-seen jobs
MAX_PAGES_PER_KEYWORD = 20
# Seconds to wait on a request before giving up (requests has no default timeout)
REQUEST_TIMEOUT_SECONDS = 30
# The search offset is advanced by this much for each successive page
RESULTS_PER_PAGE = 10

# Matches the JSON object assigned to the "mosaic-provider-jobcards" provider
# data inside the page's inline script; compiled once, dots escaped so they
# only match literal dots.
JOB_CARDS_PATTERN = re.compile(
    r'window\.mosaic\.providerData\["mosaic-provider-jobcards"\]=(\{.+?\});'
)


def _fetch_job_cards(keyword, location, offset):
    """Fetch one search-results page and return its list of job cards.

    Returns None when the page is unusable: the response status is not 200,
    or the embedded job-card JSON cannot be located in the HTML. Exceptions
    from the request or from JSON decoding/lookup propagate to the caller.
    """
    url = get_indeed_search_url(keyword, location, offset)
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        print('Error: HTTP', response.status_code, 'for', url)
        return None
    match = JOB_CARDS_PATTERN.search(response.text)
    if match is None:
        return None
    json_blob = json.loads(match.group(1))
    return json_blob['metaData']['mosaicProviderJobCardsModel']['results']


def _build_job_record(job, keyword, location, offset, index):
    """Return the dict stored in jobs_data_list for one job card."""
    return {
        'keyword': keyword,
        'location': location,
        # 1-based page number derived from the search offset
        'page': offset // RESULTS_PER_PAGE + 1,
        # 0-based position of the card within its page
        'position': index,
        'applyCount': job.get('applyCount'),
        'company': job.get('company'),
        'companyRating': job.get('companyRating'),
        'companyReviewCount': job.get('companyReviewCount'),
        'createDate': job.get('createDate'),
        'highlyRatedEmployer': job.get('highlyRatedEmployer'),
        'jobkey': job.get('jobkey'),
        'jobTitle': job.get('title'),
        'jobTypes': job.get('jobTypes'),
        'normTitle': job.get('normTitle'),
        'displayTitle': job.get('displayTitle'),
        'jobLocationCity': job.get('jobLocationCity'),
        'jobLocationPostal': job.get('jobLocationPostal'),
        'jobLocationState': job.get('jobLocationState'),
        'extractedSalary': job.get('extractedSalary'),
        'salarySnippet': job.get('salarySnippet'),
        'pubDate': job.get('pubDate'),
        'snippet': job.get('snippet'),
        'jobCardRequirementsModel': job.get('jobCardRequirementsModel'),
        'requirements': job.get('requirements'),
        'taxoAttributes': job.get('taxoAttributes'),
        'taxoLogAttributes': job.get('taxoLogAttributes'),
        'taxonomyAttributes': job.get('taxonomyAttributes'),
        'hiringMultipleCandidatesModel': job.get('hiringMultipleCandidatesModel'),
    }


for location in location_list:
    location_job_count = 0  # Unique jobs stored for this location so far
    while location_job_count < MIN_JOBS_PER_LOCATION:
        count_before_sweep = location_job_count
        for keyword in job_titles:
            offset = 0
            for _ in range(MAX_PAGES_PER_KEYWORD):
                try:
                    job_cards = _fetch_job_cards(keyword, location, offset)
                    if not job_cards:
                        # Unusable or empty page: nothing more for this keyword
                        break
                    for index, job in enumerate(job_cards):
                        job_key = job.get('jobkey')
                        # Skip cards without a key and postings already stored
                        if not job_key or job_key in unique_job_keys:
                            continue
                        unique_job_keys.add(job_key)
                        jobs_data_list.append(
                            _build_job_record(job, keyword, location, offset, index)
                        )
                        location_job_count += 1
                except Exception as e:
                    # Best-effort crawl: report the failure and move on to the
                    # next keyword instead of retrying the same request forever.
                    print('Error:', e)
                    break
                offset += RESULTS_PER_PAGE
                # Check if the limit is reached for the current keyword/location combination
                if location_job_count >= JOBS_BEFORE_NEXT_KEYWORD:
                    break
        if location_job_count == count_before_sweep:
            # A full sweep over every keyword stored nothing new, so another
            # sweep would only repeat the same requests.
            print('No new jobs found for', location, '- moving on')
            break

# Write every collected job record to disk as indented JSON
output_path = 'uk_indeed.json'
with open(output_path, 'w') as json_file:
   json.dump(jobs_data_list, json_file, indent=2)

# Report how many job records were collected in total
total_scraped = len(jobs_data_list)
print(f"Total jobs scraped: {total_scraped}")