In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

In [None]:
# --- Search URL building blocks -------------------------------------------
# A listing URL is assembled as:
#   DOMAIN + FILTER_TAIWAN + FILTER_SENIORITY + <level> + FILTER_LATEST + PAGING + <page>
# ("%5B0%5D" is the URL-encoded form of "[0]".)
DOMAIN = 'https://www.cake.me'
FILTER_TAIWAN = '/jobs?location_list%5B0%5D=Taiwan'
FILTER_SENIORITY = '&seniority_level%5B0%5D='
FILTER_LATEST = '&order=latest'
PAGING = '&page='

# Query string appended to every job-detail URL.
LANG_ARTICLE = '?locale=en'

# --- Crawl settings --------------------------------------------------------
# Value handed to time.sleep() between requests.
SLEEP = 1
# Page count passed to getCakeresume() as numPage.
NUMPAGE = 100

In [None]:
# Seniority filter values to crawl; parseAllPagination writes one CSV per entry.
# Disabled levels stay here as comments so they can be switched back on.
seniority_list = [
	'internship_level',
	# 'entry_level',
	# 'mid_senior_level',
	# 'associate',
	'director',
	'executive',
]

# Field names of one job record, in output column order.
job_columns = [
	'title',
	'company_name',
	'company_field',
	'category_major',
	'category_minor',
	'employment_type',
	'seniority',
	'location',
	'number_of_hire',
	'experience',
	'salary_range',
	'skills',
	'job_description',
	'requirements',
	'job_url',
	'company_size',
	'company_address',
	'company_about',
]

# Record template: each field initially maps to its own name; parseArticle
# overwrites the values on a per-article copy. Deriving it from job_columns
# keeps the two in sync and leaves no stray loop variables (`key`, `_`) behind
# in the notebook namespace.
job_metadata = {column: column for column in job_columns}

# Empty frame handed to the crawler; only its (empty) column index is read.
jobs_df = pd.DataFrame()

In [None]:
def getCakeresume(metadata, df, numPage=1):
	"""Entry point of the crawl; hands all work to parseAllPagination.

	Args:
		metadata: record template dict, forwarded unchanged.
		df: DataFrame, forwarded unchanged.
		numPage: page count forwarded to parseAllPagination (default 1).

	Returns:
		None. Results are produced as side effects of parseAllPagination.
	"""
	parseAllPagination(metadata, df, numPage)
	
def parseAllPagination(metadata, df, numPage):
	"""Crawl listing pages for every level in seniority_list and save one CSV per level.

	For each seniority level, pages 1..numPage are requested through parseList
	and the collected rows are written to 'Jobs_<seniority>.csv'. Paging for a
	level stops early after three consecutive pages that made no progress: the
	page failed (parseList returned None or raised), or it loaded but added no
	rows, which is what happens once the listings are exhausted.

	Args:
		metadata: record template dict, forwarded to parseList.
		df: DataFrame whose columns seed each per-level result frame.
		numPage: highest page number to request per seniority level.

	Returns:
		None. Output is the CSV files plus progress messages on stdout.
	"""
	max_consecutive_errors = 3

	for seniority in seniority_list:
		print(f"Processing seniority level: {seniority}")
		seniority_df = pd.DataFrame(columns=df.columns)
		consecutive_errors = 0

		for i in range(1, numPage + 1):
			url = DOMAIN + FILTER_TAIWAN + FILTER_SENIORITY + seniority + FILTER_LATEST + PAGING + str(i)
			print(f"Fetching page {i}/{numPage}: {url}")

			try:
				result_df = parseList(url, metadata, seniority_df)
			except Exception as e:
				print(f"Exception on page {i}: {str(e)}")
				result_df = None

			# Classify the page: failed, loaded-but-empty, or productive.
			if result_df is None:
				problem = "Error"
			elif len(result_df) <= len(seniority_df):
				# parseList returns the frame it was given when nothing was added.
				problem = "No new jobs"
			else:
				problem = None

			if problem is None:
				# Productive page - reset the stall counter and keep the new rows.
				consecutive_errors = 0
				seniority_df = result_df
			else:
				consecutive_errors += 1
				print(f"{problem} on page {i} - consecutive errors: {consecutive_errors}/{max_consecutive_errors}")

				if consecutive_errors >= max_consecutive_errors:
					print(f"Reached {max_consecutive_errors} consecutive errors. Moving to next seniority level.")
					break

			time.sleep(SLEEP)

		# Save whatever was collected, even if paging stopped early.
		try:
			seniority_df.to_csv(f'Jobs_{seniority}.csv', index=False, encoding='utf-8-sig')
			print(f"Successfully saved {len(seniority_df)} jobs for '{seniority}'")
		except Exception as e:
			print(f"Error saving CSV for '{seniority}': {str(e)}")

def parseList(url, metadata, df):
	"""Fetch one search-results page and append each job linked from it to df.

	Every anchor with class 'JobSearchItem_jobTitle__bu6yO' is treated as a job
	link; its detail page is parsed by parseArticle using a fresh copy of
	metadata. A failure on one article is reported and does not stop the rest.

	Args:
		url: listing page URL.
		metadata: record template dict; copied per article, never mutated here.
		df: DataFrame of the rows collected so far.

	Returns:
		The DataFrame with any new rows appended (df itself when the page has
		no job links or no article could be added), or None when the listing
		page could not be fetched or parsed.
	"""
	try:
		resp = requests.get(url, timeout=30)

		if resp.status_code != 200:
			print(f"HTTP Error {resp.status_code} for URL: {url}")
			return None

		doc = BeautifulSoup(resp.text, "lxml")
		articles = doc.find_all('a', class_='JobSearchItem_jobTitle__bu6yO')

		if not articles:
			print(f"No job listings found on page {url}")
			return df

		jobs_added = 0

		for article in articles:
			try:
				article_url = DOMAIN + article['href'] + LANG_ARTICLE
				result_df = parseArticle(article_url, metadata.copy(), df)

				# parseArticle returns df unchanged when an article fails, so a job
				# only counts as added when the frame actually gained a row.
				if result_df is not None and len(result_df) > len(df):
					df = result_df
					jobs_added += 1

				time.sleep(SLEEP)
			except Exception as e:
				# Report and move on to the next article.
				print(f"Error processing article {article.get('href', 'unknown')}: {str(e)}")

		print(f"Added {jobs_added} jobs from page {url}")
		return df

	except requests.RequestException as e:
		print(f"Request error for {url}: {str(e)}")
		return None
	except Exception as e:
		print(f"Unexpected error processing page {url}: {str(e)}")
		return None

def _safeExtract(getter, default=''):
	"""Return getter(), or default when the markup it expects is missing."""
	try:
		return getter()
	except (IndexError, AttributeError, TypeError, KeyError):
		# KeyError covers tag['attr'] lookups on a tag that lacks the attribute.
		return default


def _nodeText(node):
	"""Return the whitespace-stripped text of a parsed node."""
	return node.text.strip()


def _richText(node):
	"""Return the stripped text of the 'RailsHtml_container__LlMcK' div inside node."""
	return _nodeText(node.find('div', class_='RailsHtml_container__LlMcK'))


def parseArticle(url, metadata, df):
	"""Fetch one job-detail page, fill metadata from it and append it to df as a row.

	Each field is extracted independently: when its markup is missing the field
	falls back to '' ([] for 'skills') instead of discarding the whole record.
	This includes a company link without an href, which leaves 'company_url'
	empty.

	Args:
		url: job-detail page URL; also stored as 'job_url'.
		metadata: dict that is filled in place. Callers pass a per-article copy.
		df: DataFrame of the rows collected so far.

	Returns:
		A new DataFrame with the record appended, or df unchanged when the
		page could not be fetched (non-200 status, request error) or an
		unexpected error occurred.
	"""
	try:
		resp = requests.get(url, timeout=30)

		if resp.status_code != 200:
			print(f"HTTP Error {resp.status_code} for article URL: {url}")
			return df

		doc = BeautifulSoup(resp.text, "lxml")

		# Each of these lookups feeds several fields, so query the document once.
		breadcrumbs = doc.find_all('span', class_='Breadcrumbs_labelText__ZXeZH')
		detail_rows = doc.find_all('div', class_='JobDescriptionRightColumn_row__5rklX')
		content_sections = doc.find_all('div', class_='ContentSection_contentSection__ELRlG')
		company_link = doc.find('a', class_='AboutBlock_companyName__4YTC8')
		company_info = doc.find_all('div', class_='CompanyInfoItem_container__jjp_r')

		metadata['title'] = _safeExtract(lambda: _nodeText(doc.find('h1')))

		# Breadcrumb entries 3 and 4 are read as the major / minor category.
		metadata['category_major'] = _safeExtract(lambda: _nodeText(breadcrumbs[3]))
		metadata['category_minor'] = _safeExtract(lambda: _nodeText(breadcrumbs[4]))

		# The first right-column row holds two links: employment type, then seniority.
		metadata['employment_type'] = _safeExtract(lambda: _nodeText(detail_rows[0].find_all('a')[0]))
		metadata['seniority'] = _safeExtract(lambda: _nodeText(detail_rows[0].find_all('a')[1]))
		metadata['location'] = _safeExtract(lambda: _nodeText(detail_rows[1]))
		metadata['number_of_hire'] = _safeExtract(lambda: _nodeText(detail_rows[2]))
		metadata['experience'] = _safeExtract(lambda: _nodeText(detail_rows[3]))
		metadata['salary_range'] = _safeExtract(lambda: _nodeText(detail_rows[4]))

		# Skills are kept as a list of tag labels.
		metadata['skills'] = _safeExtract(
			lambda: [
				_nodeText(tag)
				for tag in doc.find('div', class_='Tags_wrapper__UQ34T Tags_primary__yUsz1 Tags_tagsMedium__PC_Iu').find_all('a')
			],
			default=[],
		)

		metadata['job_description'] = _safeExtract(lambda: _richText(content_sections[0]))
		metadata['requirements'] = _safeExtract(lambda: _richText(content_sections[1]))

		metadata['job_url'] = url

		metadata['company_name'] = _safeExtract(lambda: _nodeText(company_link))
		metadata['company_url'] = _safeExtract(lambda: DOMAIN + company_link['href'])

		metadata['company_field'] = _safeExtract(lambda: _nodeText(company_info[0]))
		metadata['company_size'] = _safeExtract(lambda: _nodeText(company_info[1]))
		metadata['company_address'] = _safeExtract(lambda: _nodeText(company_info[2]))

		metadata['company_about'] = _safeExtract(
			lambda: _richText(doc.find('div', class_='AboutBlock_companySummary__l6for'))
		)

		metadata_df = pd.DataFrame([metadata])
		return pd.concat([df, metadata_df], ignore_index=True)

	except requests.RequestException as e:
		print(f"Request error for article {url}: {str(e)}")
		return df
	except Exception as e:
		print(f"Error processing article {url}: {str(e)}")
		return df

In [None]:
# Run the crawl: up to NUMPAGE listing pages for each entry in seniority_list,
# writing one 'Jobs_<seniority>.csv' file per level.
getCakeresume(metadata=job_metadata, df=jobs_df, numPage=NUMPAGE)

https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=internship_level&order=latest&page=90
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=internship_level&order=latest&page=91
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=internship_level&order=latest&page=92
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=internship_level&order=latest&page=93
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=internship_level&order=latest&page=94
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=internship_level&order=latest&page=95
'NoneType' object has no attribute 'to_csv'
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=entry_level&order=latest&page=90
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&seniority_level%5B0%5D=entry_level&order=latest&page=91
https://www.cake.me/jobs?location_list%5B0%5D=Taiwan&s