In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import math

## Find the number of pages

In [2]:
url = 'https://stackoverflow.com/jobs/remote-developer-jobs'

response = requests.get(url)
soup = BeautifulSoup(response.content, features = 'lxml')

jobs_nb = soup.find('div', attrs = {'class':'js-search-title'}).findChild().text
jobs_nb = re.findall('\d+', jobs_nb)[0]

#QUES - Why does the website shop 116 jobs and here I receive 416  🤔

pages = math.ceil((int(jobs_nb)/25))
jobs_nb

soup.find('div', attrs = {'class':'js-search-title'})

<div class="js-search-title -header h4 seo-header">
<span class="description">
411 jobs                </span>
<span class="description"> | </span><h1 class="h1 description">Remote developer jobs</h1>
</div>

## Set url to scrape list of jobs

In [3]:
#Erase this line when I find out why I have 17 pages
pages = 7
urls = ['https://stackoverflow.com/jobs/remote-developer-jobs?pg='+ str(page) for page in range(1,pages+1)]

In [4]:
responses =[requests.get(url) for url in urls]

In [5]:
soups = [BeautifulSoup(response.content,'lxml') for response in responses]

In [6]:
a_tags = [soup.find_all('a', {'class':'s-link stretched-link'}) for soup in soups]
titles = [a.text for sublist in a_tags for a in sublist]

In [22]:
tech_tags = [soup.find_all('a', attrs ={'class': 'post-tag'})]
technologies = [a.text for tag in tech_tags for a in tag]
# list(set(technologies))

In [8]:
links = ['https://stackoverflow.com' + a['href'] for sublist in a_tags for a in sublist]

## Set url for scraping the job descriptions

In [10]:
description_responses =[requests.get(url) for url in links]

In [11]:
description_soups = [BeautifulSoup(response.content,'lxml') for response in description_responses]

**Tech tags**:

In [12]:
tags = [soup.find_all('a', attrs = {'class':'post-tag'}) for soup in description_soups]


**Job Details**:

In [24]:
job_details = [soup.find_all('div', attrs = {'class':'job-details--about'}) for soup in description_soups]

job_det = []

for job in job_details:
    if len(job):
        if len(job[0].find_all('span')) > 2:
            job_det.append([job[0].find_all('span')[3].text.lower()])
        else:
            job_det.append(['other'])  
    else:
        job_det.append(['other'])

## Find technologies in golden source

In [25]:
technologies = pd.read_json('golden-source-classified.json')

pattern_languages = '|'.join(technologies['remote_project']['technologies']['languages'])
pattern_web_frameworks = '|'.join(technologies['remote_project']['technologies']['web_frameworks'])
pattern_libraries = '|'.join(technologies['remote_project']['technologies']['other_frameworks_libraries_tools'])
pattern_databases = '|'.join(technologies['remote_project']['technologies']['databases'])
pattern_platforms = '|'.join(technologies['remote_project']['technologies']['platforms'])
pattern_other = '|'.join(technologies['remote_project']['technologies']['other_habilities'])

pattern_all_technologies = pattern_languages + '|' + pattern_web_frameworks + '|' + pattern_libraries + '|' + pattern_databases + '|' +pattern_platforms + '|' +pattern_other

In [26]:
languages = [list(set(re.findall(pattern_languages,str(tag).replace("-"," ").lower()))) for tag in tags]
web_frameworks = [list(set(re.findall(pattern_web_frameworks,str(tag).replace("-"," ").lower()))) for tag in tags]
libraries = [list(set(re.findall(pattern_libraries,str(tag).replace("-"," ").lower()))) for tag in tags]
databases = [list(set(re.findall(pattern_databases,str(tag).replace("-"," ").lower()))) for tag in tags]
platforms = [list(set(re.findall(pattern_platforms,str(tag).replace("-"," ").lower()))) for tag in tags]
other = [list(set(re.findall(pattern_other,str(tag).replace("-"," ").lower()))) for tag in tags]

## The following code is just used to compare the original technologies in the tags and to compare with the ones found with the pattern.

In [16]:
description_technologies = []
for a in tags:
    description_technologies.append(','.join([i.text.replace("-"," ") for i in a]))
    
description_technologies

['angularjs,typescript,ionic framework,node.js,firebase,sysadmin,php,javascript,laravel,node.js,reactjs,node.js,mongodb,amazon web services,reactjs,docker,javascript,html,css,vuejs,angular,sysadmin,reactjs,javascript,vue.js,html,css,php,javascript,wordpress,html,css,sysadmin',
 'spring,java,oracle,guidewire,java,sql,java ee,angularjs,hibernate,sfdc,apex,visualforce,b2b,xml,soap,salesforce,apex,visualforce,java,java ee,sql,java,.net,java ee,html,computer science',
 'amazon web services,mysql,redis,apache kafka,bigdata',
 'lit element,git,web component,reactjs,webpack',
 'css,javascript,html5,angular,shell,linux,amazon web services,unix,sysadmin',
 'shell,linux,amazon web services,unix,sysadmin,css,javascript,html5,angular',
 'go,postgresql,rabbitmq,gin gonic,google cloud platform,vue.js',
 'microservices,kubernetes,spring boot,java,javascript',
 'ruby on rails,ruby,reactjs,docker,postgresql',
 'design,user experience,html,css,balsamiq,python,sql,amazon web services,google analytics,inst

## Find level of experience

In [27]:
level = pd.read_json('golden-source-data.json')
pattern = r'\b|\b'.join(level['remote_jobs']['level'])

pattern

entry_levels = [(set(re.findall(pattern, str(job).lower()))) for job in job_det]


entry_levels_clean = []

for level in entry_levels:
    if level.intersection({'senior','experienced','sr','sr.','manager'}):
        entry_levels_clean.append('senior')
    elif level.intersection({'junior','jr','jr.'}):
        entry_levels_clean.append('junior')
    else:
        entry_levels_clean.append('other')
    


## DataFrame 🙃

In [28]:
df = pd.DataFrame(titles,columns = ["Title"])
df['Link'] = links

df["Languages"] = [",".join(techs) for techs in languages]
df["Web Frameworks"] = [",".join(techs) for techs in web_frameworks]
df["Libraries"] = [",".join(techs) for techs in libraries]
df["Databases"] = [",".join(techs) for techs in databases]
df["Platforms"] = [",".join(techs) for techs in platforms]
df["Other"] = [",".join(techs) for techs in other]

df['Experience Level'] = [level for level in entry_levels_clean]

df

Unnamed: 0,Title,Link,Languages,Web Frameworks,Libraries,Databases,Platforms,Other,Experience Level
0,"Front-End Developer (JS, Angular, TypeScript) ...",https://stackoverflow.com/jobs/371097/front-en...,"css,typescript,javascript,php,html","laravel,node,react,angular,vue",,"firebase,mongodb","docker,amazon web services,wordpress",,other
1,Guidewire InsuranceNow Implementation Engineer...,https://stackoverflow.com/jobs/279895/guidewir...,"html,java,sql","angular,spring",,oracle,,,senior
2,Senior Big Data Scalability Engineer (Remote U...,https://stackoverflow.com/jobs/334503/senior-b...,,,apache,"mysql,redis",amazon web services,,senior
3,Senior Frontend Developer,https://stackoverflow.com/jobs/359023/senior-f...,,react,git,,,,senior
4,Front-End Developer (Remote),https://stackoverflow.com/jobs/380072/front-en...,"shell,javascript,css,html",angular,,,"amazon web services,linux",,senior
...,...,...,...,...,...,...,...,...,...
170,Senior React Native / Flutter Developer (Remote),https://stackoverflow.com/jobs/200538/senior-r...,"python,typescript,javascript","django,flask,express,node,react,angular,vue","flutter,react native",,"docker,ios,kubernetes,amazon web services,android",,senior
171,Senior Software Engineer (Full Stack),https://stackoverflow.com/jobs/376694/senior-s...,"java,javascript",,,"nosql,elasticsearch","docker,kubernetes,amazon web services,linux",,senior
172,"Senior Mobile Engineer, Flutter",https://stackoverflow.com/jobs/375400/senior-m...,dart,,flutter,,"ios,android",,senior
173,Sr. Software Engineer - Angular,https://stackoverflow.com/jobs/363678/sr-softw...,"python,sql",angular,,,docker,,senior


## Save DataFrame

In [29]:
df.to_csv('dataframes/stackoverflow.csv', index = False)

## Create a dictionary to count the times a technology is required

In [31]:
def create_tech_dict(col_name,df):
    technologies_set = df[col_name]
    technologies_dict= {}
    for technologies in technologies_set:
        for technology in technologies.split(','):
            if technology in technologies_dict.keys():
                technologies_dict[technology] += 1
            else:
                technologies_dict[technology] =  1
    return {key: value for key, value in sorted(technologies_dict.items(), key=lambda item: item[1])}

In [32]:
create_tech_dict('Libraries',df)
create_tech_dict('Web Frameworks',df)
create_tech_dict('Languages',df)
# create_tech_dict('Databases',df)
# create_tech_dict('Platforms',df)
# create_tech_dict('Other',df)

{'clojure': 1,
 'bash': 1,
 'dart': 1,
 'shell': 2,
 'objective c': 2,
 'scala': 3,
 'sass': 3,
 'kotlin': 4,
 'r': 5,
 'asp': 5,
 'graphql': 7,
 'swift': 7,
 'golang': 8,
 'go': 13,
 'typescript': 16,
 'css': 17,
 'php': 19,
 'html': 19,
 '': 21,
 'c': 23,
 'ruby': 30,
 'sql': 36,
 'java': 50,
 'python': 56,
 'javascript': 73}