In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Remote.io request

## Find pagination length - Does this make sense? 
I'm able to find the number of element in each page but not the number of pages

In [2]:
url = 'https://www.remote.io/remote-jobs'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

job_offers_length = len(soup.find_all('h3', attrs ={'class':'job-listing-title'}))
job_offers_length

20

Chose 85 for the pagination lenght because it is around the 85th page that the jobs have more 4 months

In [3]:
# pagination_length = 50
pagination_length = 85


urls = ['https://www.remote.io/remote-jobs?p=' + str(i) for i in range(1,pagination_length)]

responses =[requests.get(url) for url in urls]

In [4]:
soups = [BeautifulSoup(response.content,'lxml') for response in responses]

## Find all the job titles 

In [5]:
all_h3 = [soup.find_all('h3', attrs ={'class':'job-listing-title'}) for soup in soups]
titles = [sublist.text for h3 in all_h3 for sublist in h3]


## Find all the links to job description & request

In [6]:
all_links = [soup.find_all('a', attrs ={'class':'list-apply-button'}) for soup in soups]
links = ['https://www.remote.io' + sublist['href'] for a in all_links for sublist in a]


In [7]:
responses =[requests.get(url) for url in links]

In [8]:
soups = [BeautifulSoup(response.content,'lxml') for response in responses]


In [9]:
single_pages = [soup.find('div', attrs= {'class':'single-page-section'}).text for soup in soups]

## Find Experience Level

In [18]:
level = pd.read_json('golden-source-classified.json')
pattern = r'\b|\b'.join(level['remote_project']['level'])

entry_levels = [(set(re.findall(pattern, str(job).lower()))) for job in single_pages]
entry_levels

entry_levels_clean = []

for level in entry_levels:
    if level.intersection({'senior','experienced','sr','sr.','manager'}):
        entry_levels_clean.append('senior')
    elif level.intersection({'junior','jr','jr.'}):
        entry_levels_clean.append('junior')
    else:
        entry_levels_clean.append('other')

## Find technologies

In [19]:
technologies = pd.read_json('golden-source-classified.json')

pattern_languages = '|'.join(technologies['remote_project']['technologies']['languages'])
pattern_web_frameworks = '|'.join(technologies['remote_project']['technologies']['web_frameworks'])
pattern_libraries = '|'.join(technologies['remote_project']['technologies']['other_frameworks_libraries_tools'])
pattern_databases = '|'.join(technologies['remote_project']['technologies']['databases'])
pattern_platforms = '|'.join(technologies['remote_project']['technologies']['platforms'])
pattern_other = '|'.join(technologies['remote_project']['technologies']['other_habilities'])

pattern_all_technologies = pattern_languages + '|' + pattern_web_frameworks + '|' + pattern_libraries + '|' + pattern_databases + '|' +pattern_platforms + '|' +pattern_other

In [20]:
languages = [list(set(re.findall(pattern_languages,str(tag).replace("-"," ").lower()))) for tag in single_pages]
web_frameworks = [list(set(re.findall(pattern_web_frameworks,str(tag).replace("-"," ").lower()))) for tag in single_pages]
libraries = [list(set(re.findall(pattern_libraries,str(tag).replace("-"," ").lower()))) for tag in single_pages]
databases = [list(set(re.findall(pattern_databases,str(tag).replace("-"," ").lower()))) for tag in single_pages]
platforms = [list(set(re.findall(pattern_platforms,str(tag).replace("-"," ").lower()))) for tag in single_pages]
other = [list(set(re.findall(pattern_other,str(tag).replace("-"," ").lower()))) for tag in single_pages]

## Data frame 🙃

In [21]:
df = pd.DataFrame(titles, columns=['Title'])

df['Link'] = links
df["Languages"] = [",".join(techs) for techs in languages]
df["Web Frameworks"] = [",".join(techs) for techs in web_frameworks]
df["Libraries"] = [",".join(techs) for techs in libraries]
df["Databases"] = [",".join(techs) for techs in databases]
df["Platforms"] = [",".join(techs) for techs in platforms]
df["Other"] = [",".join(techs) for techs in other]
df['Experience Level'] = [level for level in entry_levels_clean]
df

Unnamed: 0,Title,Link,Languages,Web Frameworks,Libraries,Databases,Platforms,Other,Experience Level
0,"Senior Systems Administrator (Washington, D.C...",https://www.remote.io/job/4048/senior-systems-...,"java,sass,c,shell",,apache,mysql,"aws,ios,amazon web services,linux",,senior
1,Front-End Developer at Wallethub,https://www.remote.io/job/4047/front-end-devel...,"php,html,javascript,c,css",angular,,,,,other
2,Part-Time Full Stack Web Developer with React...,https://www.remote.io/job/4046/part-time-full-...,"bash,sass,html,javascript,shell,css,ruby","react,ruby",git,postgresql,"docker,aws",,senior
3,Full Stack Developer (Remote United States) a...,https://www.remote.io/job/4045/full-stack-deve...,ruby,"react,ruby",,,,,other
4,Medical Biller,https://www.remote.io/job/4044/medical-biller,,,,,,,other
...,...,...,...,...,...,...,...,...,...
1675,.NET Engineer at AdHawk and FloorForce,https://www.remote.io/job/2360/net-engineer-at...,"asp,sql,php,html,javascript,c#,graphql,c,css,ruby","react,jquery,express,ruby","git,.net",microsoft sql server,microsoft azure,,other
1676,Senior SRE - Master of None at Packet Fabric,https://www.remote.io/job/2357/senior-sre-mast...,python,,ansible,"elasticsearch,postgresql","aws,amazon web services,linux",,other
1677,Senior Big Data Scalability Engineer (Remote ...,https://www.remote.io/job/2358/senior-big-data...,"sql,java,sass,c,golang,scala,go",,"git,apache","redis,dynamodb,mysql","aws,shopify,amazon web services,linux",,senior
1678,Customer Service and Sales for Cpap Supplies ...,https://www.remote.io/job/2356/customer-servic...,,,,,,,other


## Save DataFrame

In [22]:
df.to_csv('dataframes/remote-io.csv', index = False)

## React not invited to the party!
Here I found out that it's not repeating react when we have react native 🎊

In [15]:
# react_native_rows = df[df['Technologies'].str.contains('react native')]
# react_native_rows



Unnamed: 0,Title,Link,Technologies,Experience Level
58,UI Lead / Front-End Developer (ReactJS / Reac...,https://www.remote.io/job/3990/ui-lead-front-e...,"react native,amazon web services,python,react,...",senior
66,Mobile Developer - Android/iOS - Freelance at...,https://www.remote.io/job/3982/mobile-develope...,"react native,kotlin,cordova,swift,flutter",other
80,React Native Developer [100% REMOTE] at Cleve...,https://www.remote.io/job/3968/react-native-de...,"react native,react,rust",senior
84,Sr. Ruby on Rails and React Native Engineer [...,https://www.remote.io/job/3964/sr-ruby-on-rail...,"react native,sql,redis,heroku,react,postgresql...",other
90,Senior React-Native Developer at Avanoo (San ...,https://www.remote.io/job/3958/senior-react-na...,react native,other
145,"Senior Front End Engineer at Unanet, Inc. (St...",https://www.remote.io/job/3903/senior-front-en...,"react native,vue,java,react,angular,scala",senior
168,"Senior Mobile Engineer, Flutter at Team Go, Inc",https://www.remote.io/job/3880/senior-mobile-e...,"react native,graphql,mongodb,tensorflow,kotlin...",senior
170,Hardcore React Native and NodeJs application ...,https://www.remote.io/job/3878/hardcore-react-...,"react native,sql,javascript,node,express",other
187,"Frontend Engineer, React.js at productboard",https://www.remote.io/job/3861/frontend-engine...,"react native,redis,rust,python,react,node,type...",senior
207,React Native Developer [100% REMOTE] at Cleve...,https://www.remote.io/job/3841/react-native-de...,"react native,react,rust",senior


## Create a dictionary to count the times a technology is required

In [16]:
technologies_set = df['Technologies']

technologies_dict= {}

for technologies in technologies_set:
    for technology in technologies.split(','):
        if technology in technologies_dict.keys():
            technologies_dict[technology] += 1
        else:
            technologies_dict[technology] =  1

technologies_dict

{key: value for key, value in sorted(technologies_dict.items(), key=lambda item: item[1])}

{'unreal engine': 1,
 'matlab': 1,
 'react bootstrap': 2,
 'chromium': 2,
 'assembly': 2,
 'couchbase': 2,
 'cordova': 4,
 'clojure': 4,
 'flutter': 5,
 'dart': 5,
 'f#': 5,
 'firebase': 6,
 'pytorch': 7,
 'mariadb': 7,
 'xamarin': 8,
 'drupal': 9,
 'canvas': 9,
 'microsoft sql server': 10,
 'shopify': 11,
 'pandas': 12,
 'elixir': 14,
 'tensorflow': 16,
 'apollo': 17,
 'golang': 18,
 'objective c': 19,
 'cassandra': 20,
 'hadoop': 24,
 'powershell': 25,
 'rabbitmq': 25,
 'kotlin': 26,
 'laravel': 27,
 'puppet': 28,
 'flask': 30,
 'oracle': 31,
 'heroku': 34,
 'swift': 35,
 'google cloud platform': 37,
 'bash': 38,
 'wordpress': 38,
 'dynamodb': 41,
 'c++': 43,
 'jquery': 46,
 'react native': 47,
 'graphql': 50,
 'mongodb': 53,
 'django': 54,
 'shell': 60,
 'elasticsearch': 64,
 'apache': 66,
 'ansible': 67,
 'vue': 69,
 'nosql': 75,
 'machine learning': 80,
 'express': 80,
 'spring': 83,
 'typescript': 85,
 'redis': 96,
 'sass': 101,
 'php': 102,
 'postgresql': 114,
 'ruby': 126,
 'ku