In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import math

## Find the number of pages

In [24]:
url = 'https://stackoverflow.com/jobs/remote-developer-jobs'

response = requests.get(url)
soup = BeautifulSoup(response.content, features = 'lxml')

jobs_nb = soup.find('div', attrs = {'class':'js-search-title'}).findChild().text
jobs_nb = re.findall('\d+', jobs_nb)[0]

#QUES - Why does the website shop 116 jobs and here I receive 416  🤔

pages = math.ceil((int(jobs_nb)/25))
jobs_nb

soup.find('div', attrs = {'class':'js-search-title'})

<div class="js-search-title -header h4 seo-header">
<span class="description">
416 jobs                </span>
<span class="description"> | </span><h1 class="h1 description">Remote developer jobs</h1>
</div>

## Set url to scrape list of jobs

In [3]:
#Erase this line when I find out why I have 17 pages
pages = 7
urls = ['https://stackoverflow.com/jobs/remote-developer-jobs?pg='+ str(page) for page in range(1,pages+1)]

In [4]:
responses =[requests.get(url) for url in urls]

In [5]:
soups = [BeautifulSoup(response.content,'lxml') for response in responses]

In [6]:
a_tags = [soup.find_all('a', {'class':'s-link stretched-link'}) for soup in soups]
titles = [a.text for sublist in a_tags for a in sublist]

In [7]:
tech_tags = [soup.find_all('a', attrs ={'class': 'post-tag'})]
technologies = [a.text for tag in tech_tags for a in tag]
list(set(technologies))

['objective-c',
 'concurrency',
 'flask',
 'multithreading',
 'laravel',
 'elasticsearch',
 'heroku',
 'ruby-on-rails',
 'gwt',
 'graphql',
 'network-programming',
 'google-cloud-platform',
 'java',
 'python',
 'user-interface',
 'swift',
 'gwtp',
 'mysql',
 'shopify',
 'distributed-computing',
 'node.js',
 '.net',
 'hibernate',
 'ansible',
 'sysadmin',
 'firebase',
 'php',
 'kafka',
 'ios',
 'docker',
 'flash',
 'c#',
 'c++',
 'performance-testing',
 'airflow',
 'aws',
 'go',
 'secure-coding',
 'postgresql',
 'testing',
 'golang',
 'computer-science',
 'javascript',
 'distributed-system',
 'sass',
 'googletest',
 'enterprise',
 'cryptocurrency',
 'react',
 'angular',
 'macos',
 'xcode',
 'mobile',
 'nosql',
 'typescript',
 'spring-boot',
 'continuous-integration',
 'data-structures',
 'next.js',
 'devops',
 'automation',
 'reactjs',
 'chromium',
 'kotlin',
 'actionscript-3',
 'amazon-web-services',
 'linux',
 'html5',
 'mongodb',
 'algorithm',
 'asp.net-core',
 'time-complexity']

In [8]:
links = ['https://stackoverflow.com' + a['href'] for sublist in a_tags for a in sublist]

## DataFrame 🙃

In [150]:
df = pd.DataFrame(titles,columns = ["Title"])
df['Link'] = links
df

Unnamed: 0,Title,Link
0,DevOps Engineer,https://stackoverflow.com/jobs/378945/devops-e...
1,ActionScript / FLASH Developer,https://stackoverflow.com/jobs/379299/actionsc...
2,Mid to Senior Front End Developer - $75K to $1...,https://stackoverflow.com/jobs/358552/mid-to-s...
3,Senior Python Engineer in Fintech,https://stackoverflow.com/jobs/334701/senior-p...
4,Computer Science Eng. to Brainstorm Topics for...,https://stackoverflow.com/jobs/350799/computer...
...,...,...
170,Senior Linux Software Engineer,https://stackoverflow.com/jobs/367261/senior-l...
171,Software Engineer,https://stackoverflow.com/jobs/367258/software...
172,Elixir/API Engineer with Growing Messaging Com...,https://stackoverflow.com/jobs/335070/elixir-a...
173,Senior Software Engineer - Big Data/AI,https://stackoverflow.com/jobs/342960/senior-s...


## Set url for scraping the job descriptions

In [10]:
description_responses =[requests.get(url) for url in links]

In [11]:
description_soups = [BeautifulSoup(response.content,'lxml') for response in description_responses]

**Tech tags**:

In [33]:
tags = [soup.find_all('a', attrs = {'class':'post-tag'}) for soup in description_soups]


**Job Details**:

In [151]:
job_details = [soup.find_all('div', attrs = {'class':'job-details--about'}) for soup in description_soups]

job_det = []

for job in job_details:
    if len(job):
        if len(job[0].find_all('span')) > 2:
            job_det.append([job[0].find_all('span')[3].text.lower()])
        else:
            job_det.append(['other'])  
    else:
        job_det.append(['other'])

job_det

[['mid-level, senior'],
 ['senior'],
 ['junior, mid-level, senior'],
 ['mid-level, senior'],
 ['mid-level, senior, lead'],
 ['mid-level, senior'],
 ['mid-level, senior'],
 ['mid-level, senior'],
 ['manager'],
 ['mid-level, senior'],
 ['lead, manager'],
 ['senior'],
 ['mid-level, senior, lead'],
 ['other'],
 ['senior'],
 ['mid-level, senior'],
 ['mid-level, senior'],
 ['senior, lead'],
 ['mid-level, senior, lead'],
 ['mid-level, senior'],
 ['senior, lead'],
 ['backend developer'],
 ['mid-level, senior'],
 ['senior'],
 ['mid-level, senior'],
 ['mid-level, senior, lead'],
 ['mid-level'],
 ['senior'],
 ['senior'],
 ['senior'],
 ['senior'],
 ['mid-level, senior'],
 ['lead'],
 ['mid-level, senior'],
 ['lead'],
 ['senior, lead'],
 ['senior, lead'],
 ['mid-level, senior, lead'],
 ['mid-level, senior, lead'],
 ['mid-level'],
 ['mid-level, senior'],
 ['mid-level, senior, lead'],
 ['frontend developer'],
 ['mid-level'],
 ['senior, lead'],
 ['senior'],
 ['mid-level, senior, lead'],
 ['mid-level, s

## Find technologies in golden source

In [172]:
technologies = pd.read_json('golden-source-data.json')
word_boundary_techs = r'\b|\b'.join(technologies['remote_jobs']['word_boundary_techs'])
no_boundary_techs = "|".join(technologies['remote_jobs']['no_boundary_techs'])
pattern = word_boundary_techs + "|" + no_boundary_techs+ '|c\+\+'
# pattern.replace('\x08', '')
pattern

'aws\\b|\\bgo\\b|\\bgit\\b|\\bux\\b|\\bui\\b|\\bseo\\b|\\bC#\\b|\\br\\b|\\bc\\b|\\bc#\\b|\\basp\\b|\\bvba|docker|rabbitmq|html|express|php|spring|css|react bootstrap|react native|react|vue|javascript|java|node|ruby|python|typescript|graphql|mongodb|redis|orchardcms|machine learning|umbraco|canvas|angular|django|google cloud platform|kotlin|swift|matlab|rust|golang|bash|powershell|shell|f#|scala|clojure|elixir|webassembly|assembly|objective c|dart|jquery|flask|laravel|drupal|pandas|tensorflow|ansible|cordova|xamarin|apache|hadoop|unreal engine|flutter|puppet|cryengine|mariadb|headoop|oracle|elasticsearch|firebase|dynamodb|cassandra|couchbase|microsoft sql server|postgresql|sql|nosql|apollo|wordpress|heroku|kubernetes|amazon web services|chromium|pytorch|sass|shopify|c\\+\\+'

In [163]:
tech_tags = [list(set(re.findall(pattern,str(tag).replace("-"," ").lower()))) for tag in tags]
tech_tags

[['java', 'ansible', 'docker', 'mongodb', 'amazon web services', 'apache'],
 ['oracle', 'react', 'angular', 'graphql', 'javascript'],
 ['shopify', 'laravel'],
 ['machine learning',
  'flask',
  'pandas',
  'apache',
  'docker',
  'sql',
  'amazon web services',
  'google cloud platform',
  'python'],
 ['scala', 'java'],
 ['typescript', 'react', 'kotlin', 'graphql', 'postgresql'],
 ['java'],
 ['java'],
 ['golang',
  'vue',
  'react',
  'ruby',
  'angular',
  'tensorflow',
  'pytorch',
  'pandas',
  'amazon web services',
  'java',
  'python'],
 ['angular', 'docker', 'mongodb', 'amazon web services', 'python'],
 ['angular', 'docker', 'mongodb', 'amazon web services', 'python'],
 ['angular'],
 [],
 [],
 ['shopify', 'sass', 'firebase', 'react', 'nosql', 'swift'],
 ['shopify', 'sass', 'firebase', 'react', 'nosql', 'swift'],
 ['amazon web services', 'java'],
 ['jquery', 'react', 'angular', 'node', 'javascript'],
 ['react',
  'elasticsearch',
  'hadoop',
  'wordpress',
  'amazon web services'

## The following code is just used to compare the original technologies in the tags and to compare with the ones found with the pattern.

In [15]:
description_technologies = []
for a in tags:
    description_technologies.append(','.join([i.text.replace("-"," ") for i in a]))
    
description_technologies

['linux,ansible,amazon web services,continuous integration,docker,sysadmin,go,web scraping,amazon web services,mongodb,apache kafka,.net,java',
 'html5,user interface,flash,actionscript 3,angularjs,javascript,reactjs,graphql,css,oracle',
 'laravel,shopify,laravel',
 'python,amazon web services,flask,docker,google cloud platform,python,apache,amazon web services,etl,pandas,python,testing,automated tests,continuous integration,unit testing,python,machine learning,pandas,sql,docker,python,machine learning,artificial intelligence,docker,data science',
 'computer science,algorithm,time complexity,data structures,spring,spring security,spring boot,java,scala,akka',
 'typescript,reactjs,graphql,kotlin,postgresql',
 'java,distributed computing,multithreading,network programming,concurrency,testing,distributed system,automation,performance testing',
 'testing,distributed system,automation,performance testing,java,distributed computing,multithreading,network programming,concurrency',
 'ruby on r

## Add technologies to DataFrame

In [166]:
df["Technologies"] = [",".join(techs) for techs in tech_tags]


## Find level of experience

In [153]:
level = pd.read_json('golden-source-data.json')
pattern = r'\b|\b'.join(level['remote_jobs']['level'])

pattern

entry_levels = [(set(re.findall(pattern, str(job).lower()))) for job in job_det]


entry_levels_clean = []

for level in entry_levels:
    if level.intersection({'senior','experienced','sr','sr.','manager'}):
        entry_levels_clean.append('senior')
    elif level.intersection({'junior','jr','jr.'}):
        entry_levels_clean.append('junior')
    else:
        entry_levels_clean.append('other')
    
entry_levels_clean

['senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'senior',
 'other',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'senior',
 'senior',
 'other',
 'other',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'other',
 'other',
 'senior',
 'senior',
 'senior',
 'other',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'junior',
 'junior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'junior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'senior',
 'other',
 'senior',
 'senior',
 'senior',
 '

## Add experience level to df

In [167]:
df['Experience Level'] = [level for level in entry_levels_clean]
df

Unnamed: 0,Title,Link,Experience Level,Technologies
0,DevOps Engineer,https://stackoverflow.com/jobs/378945/devops-e...,senior,"java,ansible,docker,mongodb,amazon web service..."
1,ActionScript / FLASH Developer,https://stackoverflow.com/jobs/379299/actionsc...,senior,"oracle,react,angular,graphql,javascript"
2,Mid to Senior Front End Developer - $75K to $1...,https://stackoverflow.com/jobs/358552/mid-to-s...,senior,"shopify,laravel"
3,Senior Python Engineer in Fintech,https://stackoverflow.com/jobs/334701/senior-p...,senior,"machine learning,flask,pandas,apache,docker,sq..."
4,Computer Science Eng. to Brainstorm Topics for...,https://stackoverflow.com/jobs/350799/computer...,senior,"scala,java"
...,...,...,...,...
170,Senior Linux Software Engineer,https://stackoverflow.com/jobs/367261/senior-l...,senior,"sql,c++,javascript"
171,Software Engineer,https://stackoverflow.com/jobs/367258/software...,other,"sql,c++,javascript"
172,Elixir/API Engineer with Growing Messaging Com...,https://stackoverflow.com/jobs/335070/elixir-a...,senior,"graphql,elixir,postgresql"
173,Senior Software Engineer - Big Data/AI,https://stackoverflow.com/jobs/342960/senior-s...,senior,"python,java"


## Save DataFrame

In [168]:
df.to_csv('dataframes/stackoverflow.csv', index = False)

## Create a dictionary to count the times a technology is required

In [30]:
technologies_set = df['Technologies']

technologies_dict= {}

for technologies in technologies_set:
    for technology in technologies.split(','):
        if technology in technologies_dict.keys():
            technologies_dict[technology] += 1
        else:
            technologies_dict[technology] =  1

technologies_dict

{key: value for key, value in sorted(technologies_dict.items(), key=lambda item: item[1])}

{'tensorflow': 1,
 'jquery': 1,
 'heroku': 1,
 'chromium': 1,
 'rust': 1,
 'clojure': 1,
 'bash': 1,
 'react bootstrap': 1,
 'umbraco': 1,
 'orchardcms': 1,
 'canvas': 1,
 'drupal': 1,
 'dart': 1,
 'elixir': 1,
 'pandas': 2,
 'firebase': 2,
 'hadoop': 2,
 'objective c': 2,
 'shell': 2,
 'oracle': 3,
 'sass': 3,
 'ansible': 4,
 'shopify': 4,
 'scala': 4,
 'graphql': 5,
 'golang': 5,
 'pytorch': 5,
 'c++': 5,
 'xamarin': 5,
 'kotlin': 7,
 'nosql': 7,
 'swift': 7,
 'wordpress': 7,
 'flutter': 7,
 'laravel': 8,
 'google cloud platform': 8,
 'django': 9,
 'apache': 11,
 'elasticsearch': 11,
 'flask': 13,
 'mongodb': 14,
 'vue': 14,
 'machine learning': 15,
 'typescript': 16,
 '': 17,
 'react native': 17,
 'postgresql': 18,
 'ruby': 23,
 'angular': 33,
 'node': 36,
 'kubernetes': 41,
 'java': 44,
 'sql': 44,
 'docker': 45,
 'python': 58,
 'react': 64,
 'javascript': 70,
 'amazon web services': 71}