In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

## List of technologies


In [13]:
technologies = pd.read_csv('technologies.csv')
pattern = "|".join(technologies['Technologies'])
pattern

'sql|docker|aws|rabbitmq|apollo|nosql| git|react native|react|vue|javascript| ux| ui| node|ruby|python|java|typescript|graphql|mongodb|redis|rabbitmq|orchardcms|postgresql|machine-learning|umbraco|seo|canvas|angular|C#|react-bootstrap|django|google-cloud-platform'

In [50]:
# technologies

- It would be nice to have the level the job is asking for: Senior / Junior

## WeWorkRemotely request

In [14]:
url = 'https://weworkremotely.com/categories/remote-programming-jobs'

response = requests.get(url)
soup = BeautifulSoup(response.content, 'lxml')

links = soup.select('article > ul > li > a',recursive=False)
links = ['https://weworkremotely.com' + a['href'] for a in links[1:-1]]
titles = [' '.join(link.split('-')[3:]) for link in links]

## DataFrame 🙃

In [15]:
titles_links = [[title, link] for title,link in zip(titles,links)]
df = pd.DataFrame(titles_links, columns = ['Title', 'Link'])
df

Unnamed: 0,Title,Link
0,experienced full stack rails and react develop...,https://weworkremotely.com/remote-jobs/tanooki...
1,developer ruby on rails backbone js vue js,https://weworkremotely.com/remote-jobs/inputhe...
2,wordpress engineer americas,https://weworkremotely.com/remote-jobs/xwp-lea...
3,haskell developer,https://weworkremotely.com/remote-jobs/holmusk...
4,stack web engineer,https://weworkremotely.com/remote-jobs/otus-fu...
...,...,...
188,inc experienced typescript developer,https://weworkremotely.com/remote-jobs/monumen...
189,software engineer,https://weworkremotely.com/remote-jobs/salesfo...
190,solutions sr devops engineer,https://weworkremotely.com/remote-jobs/titan-s...
191,full stack developer remote or on site 5,https://weworkremotely.com/remote-jobs/ifit-ja...


## Find technologies

#### Trying to do it with the findall regex

In [16]:
responses = [requests.get(link) for link in links]

In [17]:
soups = [BeautifulSoup(response.content, 'lxml') for response in responses]

In [24]:
job_details = [soup.find('div',attrs = {'class':'listing-container'}) for soup in soups] 


In [45]:
techs = [list(set(re.findall(pattern, str(job).lower()))) for job in job_details]

## Add techologies to df

In [48]:
df['Technologies'] = [",".join(tech) for tech in techs]
df


Unnamed: 0,Title,Link,Technologies
0,experienced full stack rails and react develop...,https://weworkremotely.com/remote-jobs/tanooki...,"javascript,react,ruby"
1,developer ruby on rails backbone js vue js,https://weworkremotely.com/remote-jobs/inputhe...,"node,javascript,vue,ruby"
2,wordpress engineer americas,https://weworkremotely.com/remote-jobs/xwp-lea...,"git,docker,javascript,react, ux,sql,nosql"
3,haskell developer,https://weworkremotely.com/remote-jobs/holmusk...,"postgresql, git,aws"
4,stack web engineer,https://weworkremotely.com/remote-jobs/otus-fu...,"node, ui,aws,docker,javascript,react,angular,..."
...,...,...,...
188,inc experienced typescript developer,https://weworkremotely.com/remote-jobs/monumen...,"react,typescript,sql"
189,software engineer,https://weworkremotely.com/remote-jobs/salesfo...,"redis,postgresql,ruby,sql"
190,solutions sr devops engineer,https://weworkremotely.com/remote-jobs/titan-s...,"redis,aws,sql"
191,full stack developer remote or on site 5,https://weworkremotely.com/remote-jobs/ifit-ja...,"aws,postgresql,javascript,react,typescript,nosql"


## Create a dictionary to count the times a technology is required

In [49]:
technologies_set = df['Technologies']

technologies_dict= {}

for technologies in technologies_set:
    for technology in technologies.split(','):
        if technology in technologies_dict.keys():
            technologies_dict[technology] += 1
        else:
            technologies_dict[technology] =  1

{k: v for k, v in sorted(technologies_dict.items(), key=lambda item: item[1])}


{'seo': 5,
 'rabbitmq': 5,
 'apollo': 9,
 ' ux': 13,
 '': 14,
 'nosql': 15,
 'mongodb': 15,
 'django': 17,
 ' ui': 18,
 'react native': 18,
 'angular': 21,
 'typescript': 22,
 'postgresql': 23,
 'redis': 23,
 ' node': 24,
 'java': 26,
 'graphql': 27,
 'docker': 31,
 'vue': 32,
 'python': 39,
 ' git': 54,
 'ruby': 56,
 'sql': 58,
 'aws': 61,
 'react': 77,
 'javascript': 95}

## Previous working code with the technologies logic. IGNORE
It could be good to compare with a dictionary the results in one of the cases and the other one!

In [6]:
#Add React Native in this logic

#QUES - How can I do this without appending? 
#QUES - What's the best way of saving in a df the technologies
# url_techs = []
# for link in links:
#     job_tech = []
#     response = requests.get(link)
#     soup = BeautifulSoup(response.content, 'lxml')
#     text = soup.find('div',attrs = {'class':'listing-container'}).text.lower()
#     words = re.split('[\\s,;.]+', text)
#     url_techs.append(list(set([word for word in words if word in technologies])))
# url_techs

# [[word for in words if word in technologies] for link in links]


[['nosql', 'java'],
 ['python'],
 ['aws'],
 ['redis',
  'apollo',
  'node',
  'mongodb',
  'graphql',
  'react',
  'rabbitmq',
  'ruby',
  'python',
  'javascript',
  'typescript'],
 ['javascript', 'ruby'],
 [],
 ['node', 'nosql', 'ux/ui', 'sql', 'react', 'javascript'],
 ['python', 'sql'],
 ['ux/ui', 'node', 'vue'],
 ['node', 'git', 'aws', 'sql', 'react'],
 ['redis', 'javascript', 'ruby'],
 ['git', 'docker', 'aws', 'graphql', 'rabbitmq', 'python', 'javascript'],
 ['python', 'docker', 'aws'],
 ['javascript', 'git', 'react'],
 ['ruby'],
 ['javascript', 'react'],
 ['javascript'],
 [],
 ['javascript'],
 ['javascript', 'react', 'ruby'],
 ['react', 'ruby'],
 ['mongodb', 'javascript', 'react'],
 ['sql', 'java'],
 ['mongodb', 'javascript'],
 ['javascript', 'react', 'graphql'],
 ['javascript', 'vue'],
 ['ruby'],
 ['ruby'],
 ['git', 'ruby'],
 [],
 ['node', 'aws', 'sql', 'python', 'typescript'],
 ['ruby'],
 ['react', 'graphql'],
 ['sql', 'react'],
 [],
 ['javascript', 'git', 'vue', 'ruby'],
 ['ja