# Webscrapping data information from Indeed about Data Scientists jobs in São Paulo

In [1]:
#Importing the libraries and adapting the results the way we want it! :) 
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import collections
pd.set_option('max_rows', 99999) #That
pd.set_option('max_colwidth', 400)
pd.describe_option('max_colwidth')

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 400]


In [None]:
URL = 'https://www.indeed.com.br/empregos?q=data+scientist&l=Brasil'
#conducting a request of the stated URL above:
page = requests.get(URL)
#specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
soup = BeautifulSoup(page.text, 'html.parser')
#printing soup in a more structured tree format that makes for easier reading
print(soup.prettify())

Now It's time to get **all** the data you want!

In [None]:
#Selecting the main information of jobs
jobs=soup.find(class_='result') #First of all, you need to get the big area of the desired topic, which is the class ''result''
print(jobs)

In [27]:
titles=jobs.h2.text.strip()
print(titles)

Applied Data Scientist


In [28]:
#Selecting the company 
company=jobs.span.text.strip()
print(company)

dunnhumby


In [29]:
#Selecting the location
location=jobs.find('span',class_='location').text.strip()
print(location)

São Paulo, SP


In [30]:
#Finally selecting the summary!
summary=jobs.find('div',class_='summary').text.strip()
print(summary)

Some experience with programming, and the ability to quickly pick up handling large data volumes with modern data processing tools, e.g. by using Hadoop / Spark…


Well done!You managed to get the single information from the website. But now we need to face the goal of the project: get all the information about data scientist jobs from the page!

In [31]:
#Function to find all the job titles in the page 
def extract_job_title_from_result(soup): 
  jobs_list = []
  for div in soup.find_all(name='div', attrs={'class':'row'}):
    for a in div.find_all(name='a', attrs={'data-tn-element':'jobTitle'}):
      jobs_list.append(a['title'])
  return(jobs_list)
extract_job_title_from_result(soup)

['Applied Data Scientist',
 'Data Scientist en Vie Brésil (H/F)',
 'Data Scientist',
 'Data Scientist',
 'Data Scientist',
 'Researcher Specialist (Data Scientist)',
 'Data Scientist',
 'Data Scientist - Plataform',
 'Data Scientist - Portfolio Strategy',
 'Data Scientist/Big Data']

In [85]:
#Function to find the companies
def extract_company_title_from_result(soup): 
  companys=soup.find_all('span', class_='company')
  for company in companys:
    print(companys)

In [49]:
#Function to find the location 
def extract_location_from_result(soup): 
  locations = []
  spans = soup.findAll('span', attrs={'class': 'location'})
  for span in spans:
    locations.append(span.text)
  return(locations)
extract_location_from_result(soup)

['São Paulo, SP',
 'São Paulo, SP',
 'São Paulo, SP',
 'São Paulo, SP',
 'Florianópolis, SC',
 'Campinas, SP',
 'Barueri, SP',
 'São Paulo, SP',
 'São Paulo, SP',
 'São Carlos, SP']

In [104]:
#Summary
def extract_summary_from_result(soup): 
  summary = []
  divs = soup.findAll('div', attrs={'class': 'summary'})
  for div in divs:
    summary.append(div.text.strip().replace("\n", "")) #This is good to remove the '\n' of the page
  return(summary)
extract_summary_from_result(soup)

['Some experience with programming, and the ability to quickly pick up handling large data volumes with modern data processing tools, e.g. by using Hadoop / Spark…',
 'Accompagner nos clients dans la formalisation de leurs besoins métier sous forme de problématiques Data Science claires et réalisables.Durée : 12 à 24 mois.',
 'O Zé está construindo todos os dias a maior e melhor plataforma de entrega de bebidas do Brasil, com o serviço mais rápido e confiável do mercado.',
 'Desire to develop as a data scientist.Working in teams, together with senior data scientists, data analytics specialists, designers and medical doctors, you…',
 'Aplicar algoritmos de data mining em conjunto de dados para resolver diferentes problemas ou análises exploratórias.Como será seu dia a dia:',
 'Organize data in order to guarantee high quality data for AI algorithms.Experience with data analysis for a high volume of data;',
 'Cada vez mais data-driven.Estamos buscando profissionais de Data Science com exp

Now lets put all that information in lists! 

In [None]:
title_list=list(extract_job_title_from_result(soup))
location_list=list(extract_location_from_result(soup))
company_list=list(extract_company_title_from_result(soup))
summary_list=list(extract_summary_from_result(soup))

In [96]:
indeed_jobs = pd.DataFrame({'job title':title_list,'location':location_list,'company':company,'summary':summary_list})
print(indeed_jobs)

                                job title  ...                                                                                                                                                            summary
0                  Applied Data Scientist  ...  Some experience with programming, and the ability to quickly pick up handling large data volumes with modern data processing tools, e.g. by using Hadoop / Spark…
1      Data Scientist en Vie Brésil (H/F)  ...       Accompagner nos clients dans la formalisation de leurs besoins métier sous forme de problématiques Data Science claires et réalisables.Durée : 12 à 24 mois.
2                          Data Scientist  ...                 O Zé está construindo todos os dias a maior e melhor plataforma de entrega de bebidas do Brasil, com o serviço mais rápido e confiável do mercado.
3                          Data Scientist  ...      Desire to develop as a data scientist.Working in teams, together with senior data scientists, data analytics

In [107]:
summary_complete=extract_summary_from_result(soup)
print(summary_complete)

['Some experience with programming, and the ability to quickly pick up handling large data volumes with modern data processing tools, e.g. by using Hadoop / Spark…', 'Accompagner nos clients dans la formalisation de leurs besoins métier sous forme de problématiques Data Science claires et réalisables.Durée : 12 à 24 mois.', 'O Zé está construindo todos os dias a maior e melhor plataforma de entrega de bebidas do Brasil, com o serviço mais rápido e confiável do mercado.', 'Desire to develop as a data scientist.Working in teams, together with senior data scientists, data analytics specialists, designers and medical doctors, you…', 'Aplicar algoritmos de data mining em conjunto de dados para resolver diferentes problemas ou análises exploratórias.Como será seu dia a dia:', 'Organize data in order to guarantee high quality data for AI algorithms.Experience with data analysis for a high volume of data;', 'Cada vez mais data-driven.Estamos buscando profissionais de Data Science com experiênc