# Scraping AmbitionBox website for company details

## Importing libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [56]:
import numpy as np

In [2]:
# First page of the paginated company listing.
url = "https://www.ambitionbox.com/list-of-companies?page={}".format(1)

In [3]:
# Some websites refuse requests that look like bots, so we send a browser's
# User-Agent string to make the request look like it comes from a browser.
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.169 Safari/537.36"
    )
}

In [4]:
# Fetch the listing page with the browser-like headers. The explicit timeout
# keeps the cell from hanging forever if the server never answers (requests
# waits indefinitely by default).
response = requests.get(url, headers=header, timeout=30)
# Displayed as the cell output; 200 means the page was served successfully.
response.status_code

200

In [5]:
# Keep the raw response body; type() confirms it is bytes rather than decoded text.
html = response.content
type(html)

bytes

### making soup object

In [6]:
# Parse the downloaded page using the lxml parser backend.
soup = BeautifulSoup(html, 'lxml')

### saving soup object to an HTML file

In [13]:
# Save a pretty-printed copy of the parsed page so its markup can be inspected offline.
with open('ambition_box.html','wb') as file:
    # Binary mode ('wb') because prettify is asked for utf-8 encoded output.
    file.write(soup.prettify('utf-8'))

### Finding element containing all data

In [7]:
# Collect every "company-content-wrapper" div; the name, rating and tag chips
# are read from these below. len() shows how many this page contains.
cinfo = soup.find_all('div', class_="company-content-wrapper")
len(cinfo)

30

In [8]:
# Spot-check: company name taken from the <h2> heading of the first entry.
cinfo[0].find('h2', class_="company-name bold-title-l").text.strip()

'TCS'

In [9]:
# finding names: stripped text of the name heading (<h2>) of every entry in cinfo
name = [n.find('h2', class_="company-name bold-title-l").text.strip() for n in cinfo]

In [10]:
# finding ratings: spot-check one entry; the rating is the text of the <p>
# inside its "rating-wrapper" div
cinfo[10].find('div', class_="rating-wrapper").find('p').text.strip()

'4.0'

In [11]:
# ratings
# Entries returned by find_all are never None, so the guard belongs on the
# lookups that can actually miss: an entry without a "rating-wrapper" div (or
# without a <p> inside it) gets None instead of raising AttributeError.
rating = []
for card in cinfo:
    wrapper = card.find('div', class_="rating-wrapper")
    value = wrapper.find('p') if wrapper is not None else None
    rating.append(None if value is None else value.text.strip())

In [13]:
# "company-basic-info" divs carry the <p> lines (type, location, age, employee
# count) that are extracted by position in the following cells.
company_info = soup.find_all('div', class_="company-basic-info")
len(company_info)

30

In [20]:
# Inspect the <p> entries of the first block to learn which position holds which field.
company_info[0].find_all('p')

[<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
 						Public
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-pin-drop"></i>
 						Mumbai,Maharashtra + 216 more
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-access-time"></i>
 						54 years old
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-supervisor-account"></i>
 						1 Lakh+ employees (India)
 					</p>]

In [24]:
# company type: text of the 1st <p> in each basic-info block (e.g. "Public")
company_type = [t.find_all('p')[0].text.strip() for t in company_info]

In [26]:
# company headquarters: text of the 2nd <p> (e.g. "Mumbai,Maharashtra + 216 more")
c_headquarters = [t.find_all('p')[1].text.strip() for t in company_info]

In [28]:
# company age: text of the 3rd <p> (e.g. "54 years old")
c_age = [t.find_all('p')[2].text.strip() for t in company_info]

In [32]:
# number of employees: text of the 4th <p> (e.g. "1 Lakh+ employees (India)")
c_employees = [t.find_all('p')[3].text.strip() for t in company_info]


In [33]:
# scraping other details

In [65]:
# Inspect the tag "chips" (<a class="ab_chip body-medium">) attached to the first entry.
cinfo[0].find_all('a', class_= "ab_chip body-medium")

[<a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_consumer-goods" href="/consumer-goods-companies-in-india" title="Consumer goods companies in India">
 						Consumer goods
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_financial-services" href="/financial-services-companies-in-india" title="Financial Services companies in India">
 						Financial Services
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_manufacturing" href="/manufacturing-companies-in-india" title="Manufacturing companies in India">
 						Manufacturing
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_insurance" href="/insurance-companies-in-india" title="Insurance companies in India">
 						Insurance
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_gurgaon" href="/companies-in-gurgaon" title="Companies in Gurgaon/Gurugram,Haryana">
 						Gurgaon/Gurugram,Haryana
 					

In [40]:
# One list of chip <a> tags per entry in cinfo, so the outer length matches len(cinfo).
a_tag = [o.find_all('a', class_= "ab_chip body-medium") for o in cinfo]
len(a_tag)

30

In [44]:
# For every company, join the stripped text of its chips into a single
# comma-separated string (an empty string when it has no chips).
other_details = [
    ", ".join(chip.text.strip() for chip in chips)
    for chips in a_tag
]

In [46]:
# Preview the joined chip text for the first three companies.
other_details[:3]

['BPO / Call Centre, IT Services & Consulting, Fortune500, Forbes Global 2000, Public, Mumbai,Maharashtra',
 'IT Services & Consulting, Software Product, Forbes Global 2000, Public',
 'IT Services & Consulting, Forbes Global 2000']

### converting data to dictionary format

In [47]:
# Map each output column name to its list of scraped values. Despite the name
# `df`, this is a plain dict; the DataFrame is built from it in the next cell.
df = {'company_name': name, 
      'rating': rating, 
      'company_type': company_type, 
      'company_headquarters':c_headquarters, 
      'company_age':c_age, 
      'no_of_employees': c_employees, 
      'other_details': other_details}

In [52]:
# Build the table for this page from the column dict and preview the first rows.
data = pd.DataFrame(df)
data.head()

Unnamed: 0,company_name,rating,company_type,company_headquarters,company_age,no_of_employees,other_details
0,TCS,3.9,Public,"Mumbai,Maharashtra + 216 more",54 years old,1 Lakh+ employees (India),"BPO / Call Centre, IT Services & Consulting, F..."
1,Accenture,4.0,Public,"Dublin,Dublin + 124 more",33 years old,1 Lakh+ employees (India),"IT Services & Consulting, Software Product, Fo..."
2,Cognizant,3.9,Private,"Teaneck,New Jersey + 76 more",28 years old,1 Lakh+ employees (India),"IT Services & Consulting, Forbes Global 2000"
3,ICICI Bank,4.0,Public,"Mumbai,Maharashtra + 918 more",28 years old,50k-1 Lakh employees (India),"Chemicals / Agri Inputs, Financial Services, M..."
4,HDFC Bank,4.0,Public,"Mumbai,Maharashtra + 1002 more",28 years old,1 Lakh+ employees (India),"Financial Services, Banking, Insurance, Fortun..."


## Now automating the code and fetching data from the first 4 pages (pages 1-4) for the top 120 companies by popularity

In [57]:
def _paragraph_text(block, index):
    """Return the stripped text of the ``index``-th <p> inside ``block``.

    Returns ``np.nan`` when the block has fewer than ``index + 1`` paragraphs,
    so a company with missing details yields a missing value instead of an
    IndexError.

    NOTE(review): fields are identified purely by position; if an earlier <p>
    is absent the remaining ones shift left - confirm against the page markup.
    """
    paragraphs = block.find_all('p')
    if index >= len(paragraphs):
        return np.nan
    return paragraphs[index].text.strip()


def _rating_text(card):
    """Return the rating text of ``card``, or None when it has no rating."""
    wrapper = card.find('div', class_="rating-wrapper")
    value = wrapper.find('p') if wrapper is not None else None
    return None if value is None else value.text.strip()


# Browser-like User-Agent; built once instead of on every iteration.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}

# One DataFrame per page, concatenated once after the loop. DataFrame.append
# was removed in pandas 2.0 and re-copies the accumulated frame on every call.
page_frames = []

# iterate through listing pages 1 to 4
for i in range(1, 5):

    url = f'https://www.ambitionbox.com/list-of-companies?page={i}'

    # The timeout prevents an indefinite hang on an unresponsive server.
    response = requests.get(url, headers=header, timeout=30)
    # Fail loudly on an HTTP error instead of silently scraping an error page.
    response.raise_for_status()
    html = response.content
    soup = BeautifulSoup(html, 'lxml')
    cinfo = soup.find_all('div', class_="company-content-wrapper")
    company_info = soup.find_all('div', class_="company-basic-info")

    # finding names
    name = [n.find('h2', class_="company-name bold-title-l").text.strip() for n in cinfo]
    # ratings (None when an entry has no rating element)
    rating = [_rating_text(r) for r in cinfo]
    # company type
    company_type = [_paragraph_text(t, 0) for t in company_info]
    # company headquarters
    c_headquarters = [_paragraph_text(t, 1) for t in company_info]
    # company age
    c_age = [_paragraph_text(t, 2) for t in company_info]
    # number of employees: the 4th <p> (index 3). Index 2 is the company age,
    # so reading it here would duplicate the age column.
    c_employees = [_paragraph_text(t, 3) for t in company_info]

    # for other details: join each company's chip texts into one string
    a_tag = [o.find_all('a', class_= "ab_chip body-medium") for o in cinfo]
    other_details = [", ".join(s.text.strip() for s in a) for a in a_tag]

    # creating dictionary for dataframe
    df = {'company_name': name,
          'rating': rating,
          'company_type': company_type,
          'company_headquarters': c_headquarters,
          'company_age': c_age,
          'no_of_employees': c_employees,
          'other_details': other_details}

    data = pd.DataFrame(df)
    page_frames.append(data)

# Combined table for all scraped pages with a fresh 0..n-1 index.
final = pd.concat(page_frames, ignore_index=True)
    
    
    

## Saving dataframe to a csv file

In [66]:
# Write the combined table to disk; index=False leaves the row index out of the CSV.
final.to_csv('company_info.csv',index= False)