### A) Import library

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from datetime import timedelta

### B) Request and Fetch the Webpage

In [2]:
# hit "www.ambitionbox.com/list-of-companies"
requests.get("https://www.ambitionbox.com/list-of-companies?page=1")

<Response [200]>

In [3]:
requests.get("https://www.ambitionbox.com/list-of-companies?page=1").text

'<!doctype html>\n<html data-n-head-ssr lang="en" data-n-head="%7B%22lang%22:%7B%22ssr%22:%22en%22%7D%7D">\n  <head >\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"> \n    <title>List of companies - 594.3k companies | AmbitionBox</title><meta data-n-head="ssr" name="copyright" content="2021 AmbitionBox"><meta data-n-head="ssr" name="revisit-after" content="1 day"><meta data-n-head="ssr" name="application-name" content="AmbitionBox"><meta data-n-head="ssr" name="content-language" content="EN"><meta data-n-head="ssr" name="google-signin-client_id" content="462822053404-hphug4pkahqljh2tc96g35at47o4isv2.apps.googleusercontent.com"><meta data-n-head="ssr" property="fb:app_id" content="712617688793459"><meta data-n-head="ssr" name="theme-color" content="#ffffff"><meta data-n-head="ssr" name="msapplication-navbutton-color" content="#ffffff"><meta data-n-head="ssr" name="apple-mobile

In [4]:
# google chrome browser's request header (to make it look like, we are making this request from a browser)
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

In [5]:
# hit using the browser-like header
# - timeout: without it, requests waits forever on a stalled connection
# - raise_for_status: stop here on a 4xx/5xx answer instead of parsing an error page further down
response = requests.get("https://www.ambitionbox.com/list-of-companies?page=1", headers=header, timeout=30)
response.raise_for_status()

In [6]:
# peek at the first 500 characters of the received page source
response.text[0:500]

'<!doctype html>\n<html data-n-head-ssr lang="en" data-n-head="%7B%22lang%22:%7B%22ssr%22:%22en%22%7D%7D">\n  <head >\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge"> \n    <title>List of companies - 594.3k companies | AmbitionBox</title><meta data-n-head="ssr" name="copyright" content="2021 AmbitionBox"><meta data-n-head="ssr" name="revisit-after" content="1 day"><meta data-n-head="ssr" name='

### C) Pass the fetched webpage response to Beautiful Soup

In [7]:
# give the webpage to Beautiful Soup using parsers: "html.parser" or "lxml"
soup = BeautifulSoup(response.text, 'lxml')

In [8]:
# we see the whole webpage is made of cards and each card has the company info
# on inspecting, we see the cards are HTML: "div"s with class-name = "company-content-wrapper"
# Let us extract the first card and see how we can extract data from it...

first_company_card = soup.find("div", class_="company-content-wrapper")

In [9]:
first_company_card

<div class="company-content-wrapper"><div class="company-content"><div class="company-logo"><img alt="Tata Consultancy Services logo" class="lazy" data-src="https://static.ambitionbox.com/alpha/company/photos/logos/tcs.jpg" onerror="this.onerror=null;this.src='/static/icons/company-placeholder.svg';" src="https://static.ambitionbox.com/static/icons/company-placeholder.svg"/></div> <div class="company-info-wrapper"><div class="company-info"><div class="left"><a href="/overview/tcs-overview"><h2 class="company-name bold-title-l" title="TCS">
								TCS
							</h2></a> <div class="rating-wrapper"><p class="rating badge-large rating-35"><i class="icon icon-star"></i>
								3.9
							</p> <a class="review-count sbold-Labels" href="https://www.ambitionbox.com/reviews/tcs-reviews">
								(30.2k Reviews)
							</a></div></div> <button class="ab_btn follow-btn invert round"><span class="ctas-btn-medium">Follow</span></button></div> <div class="leaf-list-wrapper noscrollbars-sm mg-btm-16

In [10]:
# let's see what we got here...
print(first_company_card.prettify())

<div class="company-content-wrapper">
 <div class="company-content">
  <div class="company-logo">
   <img alt="Tata Consultancy Services logo" class="lazy" data-src="https://static.ambitionbox.com/alpha/company/photos/logos/tcs.jpg" onerror="this.onerror=null;this.src='/static/icons/company-placeholder.svg';" src="https://static.ambitionbox.com/static/icons/company-placeholder.svg"/>
  </div>
  <div class="company-info-wrapper">
   <div class="company-info">
    <div class="left">
     <a href="/overview/tcs-overview">
      <h2 class="company-name bold-title-l" title="TCS">
       TCS
      </h2>
     </a>
     <div class="rating-wrapper">
      <p class="rating badge-large rating-35">
       <i class="icon icon-star">
       </i>
       3.9
      </p>
      <a class="review-count sbold-Labels" href="https://www.ambitionbox.com/reviews/tcs-reviews">
       (30.2k Reviews)
      </a>
     </div>
    </div>
    <button class="ab_btn follow-btn invert round">
     <span class="ctas-btn-m

### D) Extract the data from a single company card with its different attributes

### D-1: Company Name

In [11]:
# 1. company name is inside h2 tag: <h2 class="company-name bold-title-l" title="TCS">
first_company_card.find("h2")

<h2 class="company-name bold-title-l" title="TCS">
								TCS
							</h2>

In [12]:
# extract company name
first_company_card.find("h2").text.strip()

'TCS'

### D-2: Company Rating

In [13]:
# 2. company rating lies inside p tag: <p class="rating badge-large rating-35">
first_company_card.find("p", class_="rating")

<p class="rating badge-large rating-35"><i class="icon icon-star"></i>
								3.9
							</p>

In [14]:
# extract company rating
first_company_card.find("p", class_="rating").text.strip()

'3.9'

### D-3: Number of Company Reviews

In [15]:
# 3. number of company reviews lies inside a tag: <a class="review-count sbold-Labels">
first_company_card.find("a", class_="review-count sbold-Labels")

<a class="review-count sbold-Labels" href="https://www.ambitionbox.com/reviews/tcs-reviews">
								(30.2k Reviews)
							</a>

In [16]:
# extract number of company reviews
first_company_card.find("a", class_="review-count sbold-Labels").text.strip()

'(30.2k Reviews)'

In [17]:
first_company_card.find("a", class_="review-count sbold-Labels").text.strip().replace(" Reviews", "")

'(30.2k)'

### D-4: Domain | D-5: Location | D-6: Years Old | D-7: Employee Strength

Extract "__infoEntity__" containing: 'domain', 'location', 'years old', 'employee strength'

In [18]:
first_company_card.find_all("p", class_="infoEntity sbold-list-header")

[<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
 						Public
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-pin-drop"></i>
 						Mumbai,Maharashtra 
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-access-time"></i>
 						54 years old
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-supervisor-account"></i>
 						1 Lakh+ employees (India)
 					</p>]

In [19]:
first_company_card.find_all("p", class_="infoEntity sbold-list-header")[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
						Public
					</p>

In [20]:
inner_company_info_list = first_company_card.find_all("p", class_="infoEntity sbold-list-header")
inner_company_info_list

[<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
 						Public
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-pin-drop"></i>
 						Mumbai,Maharashtra 
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-access-time"></i>
 						54 years old
 					</p>,
 <p class="infoEntity sbold-list-header"><i class="icon-supervisor-account"></i>
 						1 Lakh+ employees (India)
 					</p>]

In [21]:
inner_company_info_list[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
						Public
					</p>

In [22]:
inner_company_info_list[3].findChildren("i")[0]["class"][0]

'icon-supervisor-account'

In [23]:
inner_company_info_list[3].text.strip()

'1 Lakh+ employees (India)'

In [24]:
# let's try using it's parent tag: div
inner_company_info_card = first_company_card.find("div", class_="company-basic-info")
print(inner_company_info_card.prettify())

<div class="company-basic-info">
 <p class="infoEntity sbold-list-header">
  <i class="icon-domain">
  </i>
  Public
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-pin-drop">
  </i>
  Mumbai,Maharashtra
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-access-time">
  </i>
  54 years old
 </p>
 <p class="infoEntity sbold-list-header">
  <i class="icon-supervisor-account">
  </i>
  1 Lakh+ employees (India)
 </p>
</div>



In [25]:
inner_company_info_card.findChildren("i")

[<i class="icon-domain"></i>,
 <i class="icon-pin-drop"></i>,
 <i class="icon-access-time"></i>,
 <i class="icon-supervisor-account"></i>]

In [26]:
inner_company_info_card.findChildren("i")[0]["class"][0]

'icon-domain'

In [27]:
inner_company_info_card.find_all("p")[0].text.strip()

'Public'

In [28]:
# ignore
first_company_card.find_all("p", class_="infoEntity sbold-list-header")[0]

<p class="infoEntity sbold-list-header"><i class="icon-domain"></i>
						Public
					</p>

In [29]:
# Extract "infoEntity" containing 'domain', 'location', 'years old' & 'employee strength'

info_list = first_company_card.find_all("p", class_="infoEntity sbold-list-header")

# Index every info row by the CSS class of its leading <i> icon.
# Iterating over the rows that are actually present (instead of a fixed range(4))
# means a card with fewer rows simply leaves the missing fields as None.
info_by_icon = {}
for entry in info_list:
    icon = entry.find("i")
    icon_classes = icon.get("class") if icon is not None else None
    if icon_classes:
        info_by_icon[icon_classes[0]] = entry.text.strip()

dom = info_by_icon.get('icon-domain')
loc = info_by_icon.get('icon-pin-drop')
old = info_by_icon.get('icon-access-time')
emp = info_by_icon.get('icon-supervisor-account')

print("domain:", dom)
print("location:", loc)
print("years old:", old)
print("employee strength:", emp)

domain: Public
location: Mumbai,Maharashtra
years old: 54 years old
employee strength: 1 Lakh+ employees (India)


### D-8: Company Tags

In [30]:
# 8. company tags are inside a tags: <a class="ab_chip">
first_company_card.find_all("a", class_="ab_chip")

[<a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_bpo-or-call-centre" href="/bpo-or-call-centre-companies-in-india" title="BPO / Call Centre companies in India">
 						BPO / Call Centre
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_it-services-and-consulting" href="/it-services-and-consulting-companies-in-india" title="IT Services &amp; Consulting companies in India">
 						IT Services &amp; Consulting
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_fortune500" href="/fortune500-companies-in-india" title="Fortune500 companies in India">
 						Fortune500
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_forbesglobal2000" href="/forbesglobal2000-companies-in-india" title="Forbes Global 2000 companies in India">
 						Forbes Global 2000
 					</a>,
 <a class="ab_chip body-medium" data-filter-name="chips_Company-Tags_public" href="/public-companies-in-india" title="Publ

In [31]:
# collect the visible label of every "ab_chip" link on the card
tags = [chip.text.strip() for chip in first_company_card.find_all("a", class_="ab_chip")]
tags

['BPO / Call Centre',
 'IT Services & Consulting',
 'Fortune500',
 'Forbes Global 2000',
 'Public',
 'Mumbai,Maharashtra']

In [32]:
tags = ', '.join(tags)
tags

'BPO / Call Centre, IT Services & Consulting, Fortune500, Forbes Global 2000, Public, Mumbai,Maharashtra'

### D-9: Company Description

In [33]:
# 9. company description is inside p tag: <p class="description">
first_company_card.find("p", class_="description")

<p class="description body-small" itemprop="description">
				We ensure the highest levels of certainty and satisfaction through a deep-set commitment to our clients, comprehensive industry expertise and a global network of innovation and delivery centers.

We function as a full stakeholder to business, offering a consulting-led approach with an integrated portfolio of technology led solutions that encompass the entire Enterprise value chain. Our Customer-centric Engagement Model defines how we do engage with you, offering specialized services and solutions that meet the distinct needs of your business.

We build bespoke teams around your domain and technology requirements drawn from our talent pool of over 488,649 global professionals including 36.5% women from 154 nationalities. Our domain expertise has been built upon decades of experience working across industries and this knowledge underpins our suite of solutions.

Our organization structure is domain led and empowered to help pr

In [34]:
# extract company description
first_company_card.find("p", class_="description").text.strip()

'We ensure the highest levels of certainty and satisfaction through a deep-set commitment to our clients, comprehensive industry expertise and a global network of innovation and delivery centers.\n\nWe function as a full stakeholder to business, offering a consulting-led approach with an integrated portfolio of technology led solutions that encompass the entire Enterprise value chain. Our Customer-centric Engagement Model defines how we do engage with you, offering specialized services and solutions that meet the distinct needs of your business.\n\nWe build bespoke teams around your domain and technology requirements drawn from our talent pool of over 488,649 global professionals including 36.5% women from 154 nationalities. Our domain expertise has been built upon decades of experience working across industries and this knowledge underpins our suite of solutions.\n\nOur organization structure is domain led and empowered to help provide Customers a single window to industry specific so

### E) Scraping a Single WebPage

In [35]:
# find all the company cards in the webpage (HTML divs that encloses data about each company)
company_cards = soup.find_all("div", class_="company-content-wrapper")
len(company_cards)

30

In [36]:
%%time

# One list per output column; the loop appends exactly one entry to each list per
# company card, so all lists stay the same length.
name = []
rating = []
reviews = []
domain = []
location = []
years_old = []
employee_strength = []
tags = []
about = []

for card in company_cards:
    # 1. name, 2. rating, 3. reviews, 9. about
    # find() returns None when the element is absent; store None for that field
    # instead of failing on `.text`.
    name_tag = card.find("h2")
    rating_tag = card.find("p", class_="rating")
    reviews_tag = card.find("a", class_="review-count sbold-Labels")
    about_tag = card.find("p", class_="description")

    name.append(name_tag.text.strip() if name_tag is not None else None)
    rating.append(rating_tag.text.strip() if rating_tag is not None else None)
    reviews.append(
        reviews_tag.text.strip().replace(" Reviews", "") if reviews_tag is not None else None
    )
    about.append(about_tag.text.strip() if about_tag is not None else None)

    # 4. domain, 5. location, 6. years old & 7. employee strength
    # Each info row is identified by the CSS class of its leading <i> icon. Walk the
    # rows that are actually present (not a fixed range(4)) so that no exception has
    # to be swallowed when a card carries fewer rows.
    info_by_icon = {}
    for entry in card.find_all("p", class_="infoEntity sbold-list-header"):
        icon = entry.find("i")
        icon_classes = icon.get("class") if icon is not None else None
        if icon_classes:
            info_by_icon[icon_classes[0]] = entry.text.strip()

    domain.append(info_by_icon.get('icon-domain'))
    location.append(info_by_icon.get('icon-pin-drop'))
    years_old.append(info_by_icon.get('icon-access-time'))
    employee_strength.append(info_by_icon.get('icon-supervisor-account'))

    # 8. tags: all chip labels of the card joined into one comma-separated string
    tags.append(', '.join(chip.text.strip() for chip in card.find_all("a", class_="ab_chip")))

# column name -> list of values, in the column order of the resulting table
col_dic = {
    "name": name,
    "rating": rating,
    "reviews": reviews,
    "domain": domain,
    "location": location,
    "years_old": years_old,
    "employee_strength": employee_strength,
    "tags": tags,
    "about": about
}

df = pd.DataFrame(col_dic)

Wall time: 57.5 ms


In [37]:
df

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
0,TCS,3.9,(30.2k),Public,"Mumbai,Maharashtra",54 years old,1 Lakh+ employees (India),"BPO / Call Centre, IT Services & Consulting, F...",We ensure the highest levels of certainty and ...
1,Accenture,4.0,(21.7k),Public,"Dublin,Dublin",33 years old,1 Lakh+ employees (India),"IT Services & Consulting, Software Product, Fo...",Accenture is a global professional services co...
2,Cognizant,3.9,(19.9k),Private,"Teaneck,New Jersey",28 years old,1 Lakh+ employees (India),"IT Services & Consulting, Forbes Global 2000","At Cognizant, we give organizations the insigh..."
3,ICICI Bank,4.0,(26.2k),Public,"Mumbai,Maharashtra",28 years old,50k-1 Lakh employees (India),"Chemicals / Agri Inputs, Financial Services, M...",ICICI Bank is a leading private sector bank in...
4,HDFC Bank,4.0,(27.5k),Public,"Mumbai,Maharashtra",28 years old,1 Lakh+ employees (India),"Financial Services, Banking, Insurance, Fortun...",The Housing Development Finance Corporation Li...
5,Wipro,3.8,(18.4k),Public,"Bangalore/Bengaluru,Karnataka",77 years old,1 Lakh+ employees (India),"IT Services & Consulting, Conglomerate, Fortun...","Wipro Limited (NYSE: WIT, BSE: 507685, NSE: WI..."
6,Capgemini,3.8,(15.2k),Public,Paris,55 years old,1 Lakh+ employees (India),"Internet, Analytics / KPO / Research, IT Servi...","Capgemini is a global leader in consulting, di..."
7,HCL Technologies,3.8,(15.5k),Public,"Noida,Uttar Pradesh",46 years old,1 Lakh+ employees (India),"Financial Services, Consumer Electronics & App...",HCL Technologies is a next-generation global t...
8,Infosys,3.9,(19k),Public,"Bangalore/Bengaluru,Karnataka",41 years old,1 Lakh+ employees (India),"IT Services & Consulting, Fortune500, Forbes G...",Infosys is a global leader in next-generation ...
9,Tech Mahindra,3.7,(15k),Public,"Pune,Maharashtra",36 years old,50k-1 Lakh employees (India),"Consulting, Analytics / KPO / Research, IT Ser...",Tech Mahindra offers innovative and customer-c...


### F) Scraping the Whole Website

In [38]:
# Reference point for the elapsed-time messages printed while scraping.
start_time = time.time()
# Start from an empty result table.
dataframe_final = pd.DataFrame()

# 486,333 unique companies found / 30 per page = 16,211.1 pages, so a full run
# presumably needs 16,212 pages to include the last partial page (TODO confirm).
# For a full run set: total_number_of_webpages = 16_212
# (do not write it as 16,211 - with a comma Python builds the tuple (16, 211)).
# Only the first 10 pages are scraped here.
total_number_of_webpages = 10

In [39]:
# Column order of the final table; also used when no company was scraped at all.
COLUMN_NAMES = [
    "name", "rating", "reviews", "domain", "location",
    "years_old", "employee_strength", "tags", "about",
]

# CSS class of the icon inside each "infoEntity" row -> the column that row fills.
INFO_ICON_TO_COLUMN = {
    "icon-domain": "domain",
    "icon-pin-drop": "location",
    "icon-access-time": "years_old",
    "icon-supervisor-account": "employee_strength",
}

# Give up on a page that does not answer within this many seconds instead of hanging forever.
REQUEST_TIMEOUT_SECONDS = 30

# Browser-like request header; it is the same for every page, so build it once.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"}


def _stripped_text(node):
    """Return the whitespace-stripped text of a BeautifulSoup node, or None if the node is missing."""
    return None if node is None else node.text.strip()


def _parse_company_card(card):
    """Extract one company's fields from its "company-content-wrapper" card.

    Parameters
    ----------
    card : the BeautifulSoup element of a single company card.

    Returns
    -------
    dict keyed by COLUMN_NAMES. A field whose HTML element is absent from the card
    is None; "tags" is an empty string when the card has no tag chips.
    """
    record = dict.fromkeys(COLUMN_NAMES)

    # 1. name, 2. rating
    record["name"] = _stripped_text(card.find("h2"))
    record["rating"] = _stripped_text(card.find("p", class_="rating"))

    # 3. reviews, e.g. "(30.2k Reviews)" -> "(30.2k)"
    reviews_text = _stripped_text(card.find("a", class_="review-count sbold-Labels"))
    if reviews_text is not None:
        record["reviews"] = reviews_text.replace(" Reviews", "")

    # 4. domain, 5. location, 6. years old & 7. employee strength
    # Rows are recognised by their icon class, so neither their order nor their
    # number matters and rows with an unknown or missing icon are skipped.
    for entry in card.find_all("p", class_="infoEntity sbold-list-header"):
        icon = entry.find("i")
        icon_classes = icon.get("class") if icon is not None else None
        column = INFO_ICON_TO_COLUMN.get(icon_classes[0]) if icon_classes else None
        if column is not None:
            record[column] = entry.text.strip()

    # 8. tags: all chip labels joined into one comma-separated string
    record["tags"] = ', '.join(chip.text.strip() for chip in card.find_all("a", class_="ab_chip"))

    # 9. about
    record["about"] = _stripped_text(card.find("p", class_="description"))
    return record


# One dict per company, collected across all pages. The table is built once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies all previous rows on every page.
records = []

for page in range(1, total_number_of_webpages+1):
    print("scraping webpage number: {page} of {total}".format(page=page, total=total_number_of_webpages))
    loop_time = time.time()

    # set page url
    url = "https://www.ambitionbox.com/list-of-companies?page={}".format(page)

    # get page response from the website; a failed page is reported and skipped so that
    # one bad response neither aborts the run nor gets counted as a success
    try:
        response = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print("failed, skipping this page:", error)
        print()
        continue
    # NOTE(review): for a full run consider a short time.sleep() here to go easy on the server.

    # pass the page to BeautifulSoup
    soup = BeautifulSoup(response.text, 'lxml')

    # find all the company cards from the webpage
    company_cards = soup.find_all("div", class_="company-content-wrapper")

    # scrap scrap scrap!
    records.extend(_parse_company_card(card) for card in company_cards)

    # success
    print("success!")
    print("time taken:", round((time.time()-loop_time)*1000, 2), "ms")
    print("total time elapsed:", str(timedelta(seconds=(time.time()-start_time))))
    print()

# build the final dataframe (the whole website) in one go
dataframe_final = pd.DataFrame(records, columns=COLUMN_NAMES)

end_time = time.time()
print("full website scraped successfully!")
print("total time taken:", str(timedelta(seconds=(end_time - start_time))))
print()

scraping webpage number: 1 of 10
success!
time taken: 590.39 ms
total time elapsed: 0:00:04.598708

scraping webpage number: 2 of 10
success!
time taken: 749.94 ms
total time elapsed: 0:00:05.349651

scraping webpage number: 3 of 10
success!
time taken: 1241.58 ms
total time elapsed: 0:00:06.591232

scraping webpage number: 4 of 10
success!
time taken: 1273.45 ms
total time elapsed: 0:00:07.864686

scraping webpage number: 5 of 10
success!
time taken: 1266.6 ms
total time elapsed: 0:00:09.131507

scraping webpage number: 6 of 10
success!
time taken: 1160.19 ms
total time elapsed: 0:00:10.291692

scraping webpage number: 7 of 10
success!
time taken: 1188.41 ms
total time elapsed: 0:00:11.480097

scraping webpage number: 8 of 10
success!
time taken: 1086.04 ms
total time elapsed: 0:00:12.566141

scraping webpage number: 9 of 10
success!
time taken: 1106.96 ms
total time elapsed: 0:00:13.673098

scraping webpage number: 10 of 10
success!
time taken: 1190.85 ms
total time elapsed: 0:00:14.

In [40]:
dataframe_final

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
0,TCS,3.9,(30.2k),Public,"Mumbai,Maharashtra",54 years old,1 Lakh+ employees (India),"BPO / Call Centre, IT Services & Consulting, F...",We ensure the highest levels of certainty and ...
1,Accenture,4.0,(21.7k),Public,"Dublin,Dublin",33 years old,1 Lakh+ employees (India),"IT Services & Consulting, Software Product, Fo...",Accenture is a global professional services co...
2,Cognizant,3.9,(19.9k),Private,"Teaneck,New Jersey",28 years old,1 Lakh+ employees (India),"IT Services & Consulting, Forbes Global 2000","At Cognizant, we give organizations the insigh..."
3,ICICI Bank,4.0,(26.2k),Public,"Mumbai,Maharashtra",28 years old,50k-1 Lakh employees (India),"Chemicals / Agri Inputs, Financial Services, M...",ICICI Bank is a leading private sector bank in...
4,HDFC Bank,4.0,(27.5k),Public,"Mumbai,Maharashtra",28 years old,1 Lakh+ employees (India),"Financial Services, Banking, Insurance, Fortun...",The Housing Development Finance Corporation Li...
...,...,...,...,...,...,...,...,...,...
294,Honda,4.1,(2.6k),Private,"Greater Noida,Uttar Pradesh",37 years old,5k-10k employees (India),"Industrial Equipment / Machinery, Manufacturin...","From commuter models loved all over the world,..."
295,Cogent E Services,3.3,(1.1k),Private,"Noida,Uttar Pradesh + 20 more",18 years old,10k-50k employees (India),"BPO/KPO, BPO / Call Centre, IT Services & Cons...",Cogent is a multinational business process sol...
296,Mahindra Holidays ...,3.7,(1.1k),Private,"Chennai,Tamil Nadu + 82 more",26 years old,1k-5k employees (India),"Travel & Tourism / Hospitality, Travel & Touri...","Mahindra Holidays & Resorts India Ltd., (MHRIL..."
297,Indiabulls,3.7,(3.3k),Public,"Gurgaon/Gurugram,Haryana",22 years old,10k-50k employees (India),"Financial Services, NBFC, Conglomerate, Public...",Indiabulls Housing Finance Limited is a mortga...


In [41]:
# Summarise the scraped table: its dimensions first, then the number of
# missing values in each column.
result_shape = dataframe_final.shape
nulls_per_column = dataframe_final.isna().sum()

print("dataframe shape", result_shape)
print()
print("column-wise null count")
print(nulls_per_column)
print()

dataframe shape (299, 9)

column-wise null count
name                  0
rating                0
reviews               0
domain               19
location              0
years_old             1
employee_strength     0
tags                  0
about                 0
dtype: int64



In [42]:
dataframe_final.describe()

Unnamed: 0,name,rating,reviews,domain,location,years_old,employee_strength,tags,about
count,299,299.0,299,280,299,298,299,299,299
unique,298,17.0,69,4,213,103,10,275,299
top,Mahindra & Mahindr...,4.1,(1.2k),Public,"Mumbai,Maharashtra",22 years old,10k-50k employees (India),"Financial Services, Insurance, Mumbai,Maharashtra",Conduent delivers mission-critical services an...
freq,2,58.0,23,137,32,20,110,5,1


In [43]:
# export the data to an external csv file in the current working directory
# NOTE(review): to_csv also writes the row index as a first, unnamed column by default
# (it comes back as "Unnamed: 0" when the file is read again); pass index=False if that
# column is not wanted - confirm that nothing downstream relies on it first.
dataframe_final.to_csv("AMBITION_BOX_COMPANY_DATA.csv", encoding="utf-8")