In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

## WeWorkRemotely request

In [4]:
url = 'https://weworkremotely.com/categories/remote-programming-jobs'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

links = soup.select('article > ul > li > a',recursive=False)
links = ['https://weworkremotely.com' + a['href'] for a in links[1:-1]]
titles = [' '.join(link.split('-')[3:]) for link in links]

## Find technologies

#### Trying to do it with the findall regex

In [6]:
responses = [requests.get(link) for link in links]

In [7]:
soups = [BeautifulSoup(response.content, 'lxml') for response in responses]

In [8]:
job_details = [soup.find('div',attrs = {'class':'listing-container'}) for soup in soups] 


## List of technologies

In [3]:
technologies = pd.read_json('golden-source-classified.json')

pattern_languages = '|'.join(technologies['remote_project']['technologies']['languages'])
pattern_web_frameworks = '|'.join(technologies['remote_project']['technologies']['web_frameworks'])
pattern_libraries = '|'.join(technologies['remote_project']['technologies']['other_frameworks_libraries_tools'])
pattern_databases = '|'.join(technologies['remote_project']['technologies']['databases'])
pattern_platforms = '|'.join(technologies['remote_project']['technologies']['platforms'])
pattern_other = '|'.join(technologies['remote_project']['technologies']['other_habilities'])

pattern_all_technologies = pattern_languages + '|' + pattern_web_frameworks + '|' + pattern_libraries + '|' + pattern_databases + '|' +pattern_platforms + '|' +pattern_other


In [18]:
languages = [list(set(re.findall(pattern_languages,str(tag).replace("-"," ").lower()))) for tag in job_details]
web_frameworks = [list(set(re.findall(pattern_web_frameworks,str(tag).replace("-"," ").lower()))) for tag in job_details]
libraries = [list(set(re.findall(pattern_libraries,str(tag).replace("-"," ").lower()))) for tag in job_details]
databases = [list(set(re.findall(pattern_databases,str(tag).replace("-"," ").lower()))) for tag in job_details]
platforms = [list(set(re.findall(pattern_platforms,str(tag).replace("-"," ").lower()))) for tag in job_details]
other = [list(set(re.findall(pattern_other,str(tag).replace("-"," ").lower()))) for tag in job_details]

## Find experience level

In [28]:
level = pd.read_json('golden-source-classified.json')
pattern = r'\b|\b'.join(level['remote_project']['level'])

pattern


'\\bexperienced\\b\\b|\\b\\bmanager\\b\\b|\\b\\bsenior\\b\\b|\\b\\bjunior\\b\\b|\\b\\bsr.\\b\\b|\\b\\bjr.\\b\\b|\\b\\bjr\\b\\b|\\b\\bsr\\b'

In [29]:
entry_level_body = [list(set(re.findall(pattern, str(job).lower()))) for job in job_details]
# entry_level

entry_level_titles =  [list(set(re.findall(pattern, str(job).lower()))) for job in titles]
entry_levels = [(set(a+b)) for a,b in zip(entry_level_titles,entry_level_body)]

entry_levels_clean = []
for level in entry_levels:
    if level.intersection({'senior','experienced','sr','sr.','manager'}):
        entry_levels_clean.append('senior')
    elif level.intersection({'junior','jr','jr.'}):
        entry_levels_clean.append('junior')
    else:
        entry_levels_clean.append('other')
    


## DataFrame 🙃

In [40]:
titles_links = [[title, link] for title,link in zip(titles,links)]
df = pd.DataFrame(titles_links, columns = ['Title', 'Link'])

df["Languages"] = [",".join(techs) for techs in languages]
df["Web Frameworks"] = [",".join(techs) for techs in web_frameworks]
df["Libraries"] = [",".join(techs) for techs in libraries]
df["Databases"] = [",".join(techs) for techs in databases]
df["Platforms"] = [",".join(techs) for techs in platforms]
df["Other"] = [",".join(techs) for techs in other]

df['Experience Level'] = [level for level in entry_levels_clean]
df

Unnamed: 0,Title,Link,Languages,Web Frameworks,Libraries,Databases,Platforms,Other,Experience Level
0,superstar full stack developer your work will ...,https://weworkremotely.com/remote-jobs/country...,"html,javascript,css,php",jquery,,mysql,"shopify,wordpress","ui,seo,ux",senior
1,source developer devops python django aws open...,https://weworkremotely.com/remote-jobs/opencra...,"html,python,javascript,css","react,django","ansible,git","mongodb,redis,elasticsearch,postgresql,mysql","linux,ios,android,docker,aws,rabbitmq",,senior
2,senior software engineer node js 1,https://weworkremotely.com/remote-jobs/float-c...,"graphql,sql",node,,"mysql,nosql,mongodb","aws,docker,kubernetes",,senior
3,inc senior full stack engineer,https://weworkremotely.com/remote-jobs/sofia-f...,"python,typescript,javascript,go,ruby,scala","angular,django,react,ruby,node",,redis,"windows,linux,ios,android,macos",,senior
4,boss senior full stack engineer 1,https://weworkremotely.com/remote-jobs/follow-...,"css,javascript,sql,php",react,,"mysql,redis",linux,,senior
...,...,...,...,...,...,...,...,...,...
173,shopify developers needed,https://weworkremotely.com/remote-jobs/storeta...,,,,,shopify,,senior
174,stack engineer for medical research,https://weworkremotely.com/remote-jobs/curebas...,"graphql,javascript","react,express",,,"aws,apollo",,other
175,llc react developer,https://weworkremotely.com/remote-jobs/skapa-t...,"javascript,scala,sql,asp",react,"git,react native,.net",nosql,,,other
176,react apollo developer 1,https://weworkremotely.com/remote-jobs/onthego...,"graphql,typescript",react,git,,"aws,apollo",machine learning,other


In [42]:
df.to_csv('dataframes/we-work-remotely.csv', index = False)

## Create a dictionary to count the times a technology is required

In [41]:
def create_tech_dict(col_name,df):
    technologies_set = df[col_name]
    technologies_dict= {}
    for technologies in technologies_set:
        for technology in technologies.split(','):
            if technology in technologies_dict.keys():
                technologies_dict[technology] += 1
            else:
                technologies_dict[technology] =  1
    return {key: value for key, value in sorted(technologies_dict.items(), key=lambda item: item[1])}

In [36]:
create_tech_dict('Libraries',df)
create_tech_dict('Web Frameworks',df)
create_tech_dict('Languages',df)
# create_tech_dict('Databases',df)
# create_tech_dict('Platforms',df)
# create_tech_dict('Other',df)

{'f#': 1,
 'powershell': 1,
 'elixir': 2,
 'clojure': 2,
 'bash': 3,
 'objective c': 3,
 'r': 3,
 'kotlin': 4,
 'asp': 5,
 'golang': 6,
 'shell': 6,
 'swift': 9,
 'c': 12,
 'sass': 12,
 '': 13,
 'php': 18,
 'java': 22,
 'typescript': 25,
 'graphql': 27,
 'go': 29,
 'sql': 34,
 'scala': 37,
 'python': 38,
 'rust': 38,
 'html': 43,
 'css': 52,
 'ruby': 53,
 'javascript': 86}