<a href="https://colab.research.google.com/github/saigirishgilly98/Major_Project/blob/main/WebScraping2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Links to Scrape

1) List of Jobs in India : https://www.payscale.com/research/IN/Job

In [1]:
! pip install scrapy



In [2]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [3]:
import pandas as pd
import csv

In [4]:
class Payscale_Spider(scrapy.Spider):
    """Crawl PayScale's India job index down to individual salary pages.

    Crawl order: job index -> industry page -> job salary page. The fields
    scraped from a salary page are stored in the module-level ``salary_dict``
    (which must exist before the crawl starts) and printed.

    Attributes:
        name: Spider name used by Scrapy.
        industry_limit: Number of industry links followed from the index
            page; ``None`` follows every link.
        job_limit: Number of job links followed from each industry page;
            ``None`` follows every link.
    """

    name = 'payscale_spider'

    # The crawl originally followed only the first link at each level; these
    # defaults keep that behaviour while making the limits adjustable.
    industry_limit = 1
    job_limit = 1

    def start_requests(self):
        """Yield the request for the India job index page."""
        url = 'https://www.payscale.com/research/IN/Job'
        yield scrapy.Request(url=url, callback=self.parse_industries)

    def parse_industries(self, response):
        """Follow industry links found on the job index page.

        Args:
            response: Scrapy response for the job index page.

        Yields:
            Requests for industry pages, handled by ``parse_jobs``.
        """
        industry_links = response.xpath('//a[@class = "related-content-card"]/@href').extract()

        # response.follow resolves the href against the current page URL, so
        # the site prefix does not have to be concatenated by hand.
        for industry_link in industry_links[:self.industry_limit]:
            yield response.follow(url=industry_link, callback=self.parse_jobs)

    def parse_jobs(self, response):
        """Follow job links found on an industry page.

        Args:
            response: Scrapy response for an industry page.

        Yields:
            Requests for job salary pages, handled by ``parse_salary``.
        """
        job_links = response.xpath('//a[@class = "subcats__links__item"]/@href').extract()

        for link in job_links[:self.job_limit]:
            yield response.follow(url=link, callback=self.parse_salary)

    @staticmethod
    def _apply_signs(sign_classes, magnitudes):
        """Prefix '-' to each magnitude whose paired sign class is 'negative'."""
        return [
            '-' + magnitude if sign == 'negative' else magnitude
            for sign, magnitude in zip(sign_classes, magnitudes)
        ]

    def parse_salary(self, response):
        """Scrape one job salary page into ``salary_dict`` and print it.

        Every section is stored twice: the list of its labels under a section
        key (for example ``'salary_component'``) and each label as its own key
        mapped to the scraped value. Sections missing from the page produce
        empty lists instead of raising.

        Args:
            response: Scrapy response for a job salary page.
        """
        # Job title is the URL segment between 'Job=' and the next '/'.
        job_title = response.url.partition('Job=')[2].partition('/')[0]

        avg_salary = response.css('span.paycharts__value::text').extract_first()

        # Base salary, bonus, profit sharing, commission and total pay.
        salary_titles = response.css('div.tablerow__title::text').extract()
        salary_values = response.css('div.tablerow__value::text').extract()

        # Pay by experience level, as a signed percentage.
        percent_titles = response.css('div.delta-table.pay-by-experience__delta-chart > div.entry > div.name > a::text').extract()
        percent_signs = response.css('div.delta-table.pay-by-experience__delta-chart > div.entry > div.arrow').xpath('./span/@class').extract()
        percent_magnitudes = response.css('div.delta-table.pay-by-experience__delta-chart > div.entry').xpath('./div[@class="arrow"]/text()').extract()
        percent_values = self._apply_signs(percent_signs, percent_magnitudes)

        # Pay by years of experience, taken from the descriptive paragraph.
        # The paragraph may be absent, in which case extract_first() is None.
        years_text = response.xpath('//p[@class="expandable__paragraph"]/text()').extract_first() or ''
        numeric_tokens = [
            token for token in years_text.split()
            if any(char.isdigit() for char in token)
        ]
        # Digit-bearing tokens are consumed in complete groups of three; the
        # first two of each group become (label, value), the third is skipped.
        group_starts = range(0, len(numeric_tokens) - 2, 3)
        years_titles = [numeric_tokens[i] for i in group_starts]
        years_values = [numeric_tokens[i + 1] for i in group_starts]

        # Gender breakdown: only every second extracted text node is kept.
        gender_titles = response.css('div.gender__label::text').extract()
        gender_values = response.css('div.gender__value::text').extract()[::2]

        # Health benefits: only every second extracted text node is kept.
        health_titles = response.css('div.healthbenefits__item-name::text').extract()
        health_values = response.css('h2.healthbenefits__item-value::text').extract()[::2]

        # Popular skills and the pay associated with each.
        popular_titles = response.css('a.popular-skill__name::text').extract()
        popular_values = response.css('div.popular-skill__pay::text').extract()

        # Skills that affect salary, as a signed value.
        affect_titles = response.css('div.pay-diff-by-dimension div.name > a::text').extract()
        affect_magnitudes = response.css('div.pay-diff-by-dimension div.arrow::text').extract()
        affect_signs = response.css('div.pay-diff-by-dimension div.arrow').xpath('./span/@class').extract()
        affect_values = self._apply_signs(affect_signs, affect_magnitudes)

        # Career paths (labels only, no values).
        career_titles = response.css('div.tree-chart__leaf_container').xpath('./*[contains(@class,"tree-chart__leaf")]/text()').extract()

        # Pay difference by location. A leading '▼' marks a negative value;
        # the first two characters (marker + separator) are dropped.
        # startswith() keeps an empty subtitle from raising IndexError.
        location_titles = response.css('div.carousel.related-carousel.pay-diff-related-carousel div.related-content-card__title::text').extract()
        location_raw = response.css('div.carousel.related-carousel.pay-diff-related-carousel div.related-content-card__subtitle::text').extract()
        location_values = [
            ('-' if raw.startswith('▼') else '') + raw[2:]
            for raw in location_raw
        ]

        # Popular employers.
        employer_titles = response.css('div.carousel.related-carousel.popular-employers-related-carousel div.related-content-card__title::text').extract()
        employer_values = response.css('div.carousel.related-carousel.popular-employers-related-carousel div.related-content-card__subtitle::text').extract()

        # Related job salaries.
        related_titles = response.css('span.relatedentity__card-title::text').extract()
        related_values = response.css('p.relatedentity__card-detail::text').extract()

        # (section key, labels, values) in output order.
        # NOTE: 'health_benefits_componentt' is misspelt; it is kept as-is so
        # existing consumers of the output keep working.
        sections = [
            ('salary_component', salary_titles, salary_values),
            ('pay_by_experience_percent', percent_titles, percent_values),
            ('pay_by_experience_years', years_titles, years_values),
            ('gender_component', gender_titles, gender_values),
            ('health_benefits_componentt', health_titles, health_values),
            ('popular_skills_component', popular_titles, popular_values),
            ('affect_skills_component', affect_titles, affect_values),
            ('career_path_component', career_titles, ()),
            ('location_component', location_titles, location_values),
            ('employers_component', employer_titles, employer_values),
            ('related_job_component', related_titles, related_values),
        ]

        record = {'job_title': job_title, 'avg_salary': avg_salary}
        for section_key, titles, values in sections:
            record[section_key] = titles
            record.update(zip(titles, values))

        # Reset the shared dict first so keys scraped for a previous job do
        # not leak into this job's result.
        salary_dict.clear()
        salary_dict.update(record)

        # NOTE: the result is not written to CSV. It mixes scalars with lists
        # of different lengths, so it would need reshaping before it could be
        # loaded into a pandas DataFrame.
        print(salary_dict)

# Shared result store: Payscale_Spider.parse_salary writes the scraped
# fields into this dict.
salary_dict = {}

# Register the spider with a crawler process and start the crawl.
process = CrawlerProcess()
process.crawl(Payscale_Spider)
process.start()

2021-01-22 14:45:08 [scrapy.utils.log] INFO: Scrapy 2.4.1 started (bot: scrapybot)
2021-01-22 14:45:08 [scrapy.utils.log] INFO: Versions: lxml 4.2.6.0, libxml2 2.9.8, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.6.9 (default, Oct  8 2020, 12:12:24) - [GCC 8.4.0], pyOpenSSL 20.0.1 (OpenSSL 1.1.1i  8 Dec 2020), cryptography 3.3.1, Platform Linux-4.19.112+-x86_64-with-Ubuntu-18.04-bionic
2021-01-22 14:45:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-01-22 14:45:08 [scrapy.crawler] INFO: Overridden settings:
{}
2021-01-22 14:45:08 [scrapy.extensions.telnet] INFO: Telnet Password: e069fdc326e91dbe
2021-01-22 14:45:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2021-01-22 14:45:08 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.download

{'job_title': 'Accountant', 'avg_salary': '₹249,369', 'salary_component': ['Base Salary', 'Bonus', 'Profit Sharing', 'Commission', 'Total Pay'], 'Base Salary': '₹123k - ₹507k', 'Bonus': '₹5k - ₹59k', 'Profit Sharing': '₹36 - ₹197k', 'Commission': '₹3k - ₹102k', 'Total Pay': '₹129k - ₹584k', 'pay_by_experience_percent': ['Entry Level', 'Early Career', 'Mid Career', 'Late Career', 'Experienced'], 'Entry Level': '-20%', 'Early Career': '-7%', 'Mid Career': '19%', 'Late Career': '43%', 'Experienced': '62%', 'pay_by_experience_years': ['1', '1-4'], '1': '₹198,831', '1-4': '₹230,933', 'gender_component': ['Male', 'Female', 'Prefer to self-define'], 'Male': '78.7', 'Female': '21.2', 'Prefer to self-define': '0.1', 'health_benefits_componentt': ['Medical', 'Dental', 'Vision', 'None'], 'Medical': '34', 'Dental': '3', 'Vision': '3', 'None': '66', 'popular_skills_component': [], 'affect_skills_component': [], 'career_path_component': [], 'location_component': [], 'employers_component': [], 'relat

In [5]:
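# ValueError: arrays must all be same length
# (presumably raised by pd.DataFrame(salary_dict): the dict below mixes scalar values with lists of different lengths)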
# {'job_title': 'Accountant', 'avg_salary': '₹249,369', 'salary_component': ['Base Salary', 'Bonus', 'Profit Sharing', 'Commission', 'Total Pay'], 'Base Salary': '₹123k - ₹507k', 'Bonus': '₹5k - ₹59k', 'Profit Sharing': '₹36 - ₹197k', 'Commission': '₹3k - ₹102k', 'Total Pay': '₹129k - ₹584k', 'pay_by_experience_percent': ['Entry Level', 'Early Career', 'Mid Career', 'Late Career', 'Experienced'], 'Entry Level': '-20%', 'Early Career': '-7%', 'Mid Career': '19%', 'Late Career': '43%', 'Experienced': '62%', 'pay_by_experience_years': ['1', '1-4'], '1': '₹198,831', '1-4': '₹230,933', 'gender_component': ['Male', 'Female', 'Prefer to self-define'], 'Male': '78.7', 'Female': '21.2', 'Prefer to self-define': '0.1', 'health_benefits_componentt': ['Medical', 'Dental', 'Vision', 'None'], 'Medical': '34', 'Dental': '3', 'Vision': '3', 'None': '66', 'popular_skills_component': [], 'affect_skills_component': [], 'career_path_component': [], 'location_component': [], 'employers_component': [], 'related_job_component': []}
# <a class="popular-skill__name" href="/research/IN/Job=Accountant/Salary/ebd17157/General-Ledger-Accounting">General Ledger Accounting</a>

In [6]:
from scrapy import Selector

In [7]:
# Static copy of PayScale "popular skills" markup, used to check the
# 'a.popular-skill__name::text' selector without fetching a page.
# The markup is written as two adjacent string literals inside parentheses:
# a single-quoted literal cannot span physical lines, and adjacent literals
# concatenate to exactly the same string.
html = (
    '<div class="popular-skill__container"><div class="popular-skill"><div class="popular-skill__col"><a class="popular-skill__name" href="/research/IN/Job=Software_Engineer/Salary/e46947ad/Java">Java</a><div class="popular-skill__pay">₹580,974</div></div><div class="popular-skill__col"><div class="dot-chart " style="width: 150px;"><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div></div><div class="popular-skill__count">6,259 responses</div></div></div><div class="popular-skill"><div class="popular-skill__col"><a class="popular-skill__name" href="/research/IN/Job=Software_Engineer/Salary/7bc0c77a/SQL">SQL</a><div class="popular-skill__pay">₹500,373</div></div><div class="popular-skill__col"><div class="dot-chart " style="width: 150px;"><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div></div><div class="popular-skill__count">5,157 responses</div></div></div><div class="popular-skill"><div class="popular-skill__col"><a class="popular-skill__name" href="/research/IN/Job=Software_Engineer/Salary/b04ea629/JavaScript">JavaScript</a><div class="popular-skill__pay">₹554,193</div></div><div class="popular-skill__col"><div class="dot-chart " style="width: 150px;"><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot "></div></div><div class="popular-skill__count">4,697 responses</div></div></div><div '
    'class="popular-skill"><div class="popular-skill__col"><a class="popular-skill__name" href="/research/IN/Job=Software_Engineer/Salary/70dd6d8e/C%23-Programming-Language">C# Programming Language</a><div class="popular-skill__pay">₹508,819</div></div><div class="popular-skill__col"><div class="dot-chart " style="width: 150px;"><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot "></div><div class="dot-chart__dot "></div></div><div class="popular-skill__count">3,057 responses</div></div></div><div class="popular-skill"><div class="popular-skill__col"><a class="popular-skill__name" href="/research/IN/Job=Software_Engineer/Salary/53eb09e3/Python">Python</a><div class="popular-skill__pay">₹636,503</div></div><div class="popular-skill__col"><div class="dot-chart " style="width: 150px;"><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot dot-chart__dot--filled"></div><div class="dot-chart__dot "></div><div class="dot-chart__dot "></div></div><div class="popular-skill__count">2,811 responses</div></div></div></div>'
)
sel = Selector(text=html)

# Print the text of every skill-name anchor found in the sample markup.
print(sel.css('a.popular-skill__name::text').extract())

['Java', 'SQL', 'JavaScript', 'C# Programming Language', 'Python']
