## Scraping data using bs4

Required packages:
1. BeautifulSoup (`bs4`)
2. Requests
3. Pandas
4. lxml (the parser passed to BeautifulSoup in the cells below)

### Tabular data

In [1]:
import requests
from bs4 import BeautifulSoup

# Replace with the URL you want to scrape the data from.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_banks_in_India', timeout=30)
response.raise_for_status()

# NOTE: despite its name, wiki_url holds the page's HTML text, not the URL.
wiki_url = response.text

In [2]:
# Parse the downloaded HTML text with the lxml parser.
soup = BeautifulSoup(wiki_url, features='lxml')

Using the browser's "Inspect" (HTML) feature, find the tag type and class name of the element that holds the data. 
For this link, the data is displayed in tables with the class name 'wikitable sortable'.

In [3]:

# Adjust the tag name ('table') and the class value to match the page being scraped.
# The class is matched against the exact attribute string 'wikitable sortable'.
My_table = soup.find_all('table', class_='wikitable sortable')

# Print the HTML of every matched table.
print(My_table)

[<table class="wikitable sortable">
<tbody><tr>
<th>Anchor Bank</th>
<th>Trademark</th>
<th>Merged Banks</th>
<th><a href="/wiki/Asset" title="Asset">Total Assets</a></th>
<th><a href="/wiki/Revenue" title="Revenue">Revenues</a></th>
<th>Branches</th>
<th>Established</th>
<th>Headquarter</th>
<th>Refs
</th></tr>
<tr>
<td><a href="/wiki/Bank_of_Baroda" title="Bank of Baroda">Bank of Baroda</a> (63.74%)</td>
<td><a class="image" href="/wiki/File:BankOfBarodaLogo.svg"><img alt="BankOfBarodaLogo.svg" data-file-height="31" data-file-width="89" decoding="async" height="28" src="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/BankOfBarodaLogo.svg/80px-BankOfBarodaLogo.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/BankOfBarodaLogo.svg/120px-BankOfBarodaLogo.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/f/f2/BankOfBarodaLogo.svg/160px-BankOfBarodaLogo.svg.png 2x" width="80"/></a></td>
<td><div><ul><li><a href="/wiki/Vijaya_Bank" title="Vijaya Bank">Vijaya Bank</a></li

In [4]:
# For each matched table, collect all of its hyperlink (<a>) tags.
# The result is a list with one inner list of anchors per table.
links = [matched_table.find_all('a') for matched_table in My_table]
links

[[<a href="/wiki/Asset" title="Asset">Total Assets</a>,
  <a href="/wiki/Revenue" title="Revenue">Revenues</a>,
  <a href="/wiki/Bank_of_Baroda" title="Bank of Baroda">Bank of Baroda</a>,
  <a class="image" href="/wiki/File:BankOfBarodaLogo.svg"><img alt="BankOfBarodaLogo.svg" data-file-height="31" data-file-width="89" decoding="async" height="28" src="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/BankOfBarodaLogo.svg/80px-BankOfBarodaLogo.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/f/f2/BankOfBarodaLogo.svg/120px-BankOfBarodaLogo.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/f/f2/BankOfBarodaLogo.svg/160px-BankOfBarodaLogo.svg.png 2x" width="80"/></a>,
  <a href="/wiki/Vijaya_Bank" title="Vijaya Bank">Vijaya Bank</a>,
  <a href="/wiki/Dena_Bank" title="Dena Bank">Dena Bank</a>,
  <a href="/wiki/Vadodara" title="Vadodara">Vadodara</a>,
  <a href="/wiki/Gujarat" title="Gujarat">Gujarat</a>,
  <a href="#cite_note-BoB-1">[1]</a>,
  <a href="#cite_note-Offices_&am

In [5]:
# Clean the data so that only bank names remain.

# Flatten the per-table anchor lists and read each anchor's 'title' attribute
# (None when an anchor has no title).
Names = [anchor.get('title') for anchor_group in links for anchor in anchor_group]

print(Names)

# Drop missing/empty titles and keep only those that mention "Bank".
final = [title for title in Names if title and "Bank" in title]
        
    

['Asset', 'Revenue', 'Bank of Baroda', None, 'Vijaya Bank', 'Dena Bank', 'Vadodara', 'Gujarat', None, None, 'Bank of India', None, 'Mumbai', 'Maharashtra', None, 'Bank of Maharashtra', None, 'Pune', 'Maharashtra', 'Canara Bank', None, 'Syndicate Bank', 'Bengaluru', 'Karnataka', None, 'Central Bank of India', None, 'Mumbai', 'Maharashtra', 'Indian Bank', None, 'Allahabad Bank', 'Chennai', 'Tamil Nadu', None, 'Indian Overseas Bank', None, 'Chennai', 'Tamil Nadu', None, 'Punjab and Sind Bank', None, 'New Delhi', 'Delhi', 'Punjab National Bank', None, 'Oriental Bank of Commerce', 'United Bank of India', 'New Delhi', 'Delhi', None, 'State Bank of India', None, 'State Bank of Bikaner & Jaipur', 'State Bank of Hyderabad', 'State Bank of Indore', 'State Bank of Mysore', 'State Bank of Patiala', 'State Bank of Saurashtra', 'State Bank of Travancore', 'Bhartiya Mahila Bank', 'Mumbai', 'Maharashtra', None, 'UCO Bank', None, 'Kolkata', 'West Bengal', None, 'Union Bank of India', None, 'Andhra Bank

In [6]:
# Show the filtered titles (those containing "Bank") built in the previous cell.
print(final)

['Bank of Baroda', 'Vijaya Bank', 'Dena Bank', 'Bank of India', 'Bank of Maharashtra', 'Canara Bank', 'Syndicate Bank', 'Central Bank of India', 'Indian Bank', 'Allahabad Bank', 'Indian Overseas Bank', 'Punjab and Sind Bank', 'Punjab National Bank', 'Oriental Bank of Commerce', 'United Bank of India', 'State Bank of India', 'State Bank of Bikaner & Jaipur', 'State Bank of Hyderabad', 'State Bank of Indore', 'State Bank of Mysore', 'State Bank of Patiala', 'State Bank of Saurashtra', 'State Bank of Travancore', 'Bhartiya Mahila Bank', 'UCO Bank', 'Union Bank of India', 'Andhra Bank', 'Corporation Bank', 'Axis Bank', 'Bandhan Bank', 'Catholic Syrian Bank', 'City Union Bank', 'DCB Bank', 'Dhanlaxmi Bank', 'Federal Bank', 'HDFC Bank', 'ICICI Bank', 'IDBI Bank', 'IDFC First Bank', 'IndusInd Bank', 'Jammu & Kashmir Bank', 'Karnataka Bank', 'Karur Vysya Bank', 'Kotak Mahindra Bank', 'Lakshmi Vilas Bank', 'Nainital Bank', 'RBL Bank', 'South Indian Bank', 'Tamilnad Mercantile Bank Limited', 'Ye

In [9]:
# Export the bank names to a CSV file with pandas.

import pandas as pd

# Single-column frame; the column header is written, the row index is not.
data = pd.DataFrame({'Bank name': final})
data.to_csv('Banks.csv', header=True, index=False)


### Lists  

Wikipedia often uses lists to display data, so the same process applies when the data is presented as lists.

In [27]:
# Download the category page. The timeout prevents an indefinite hang, and
# raise_for_status() fails fast on an HTTP error instead of parsing an error page.
response2 = requests.get('https://en.wikipedia.org/wiki/Category:Hotel_chains_in_India', timeout=30)
response2.raise_for_status()

# NOTE: despite its name, wiki_url2 holds the page's HTML text, not the URL.
wiki_url2 = response2.text
soup2 = BeautifulSoup(wiki_url2,'lxml')

# Every unordered list (<ul>) on the page; the links are filtered by title in later cells.
My_table2 = soup2.find_all('ul')

In [32]:
# Collect the <a> tags found inside each <ul>, one inner list per <ul>.
links = [ul_tag.find_all('a') for ul_tag in My_table2]

In [33]:
# Display the per-<ul> lists of anchor tags collected in the previous cell.
links

[[<a href="/wiki/Category:Oberoi_Hotels_%26_Resorts" title="Category:Oberoi Hotels &amp; Resorts">Oberoi Hotels &amp; Resorts</a>],
 [<a href="/wiki/Category:Taj_Hotels_Resorts_and_Palaces" title="Category:Taj Hotels Resorts and Palaces">Taj Hotels Resorts and Palaces</a>],
 [<a href="/wiki/Ashok_Group" title="Ashok Group">Ashok Group</a>],
 [<a href="/wiki/FabHotels" title="FabHotels">FabHotels</a>],
 [<a href="/wiki/Ginger_Hotels" title="Ginger Hotels">Ginger Hotels</a>],
 [<a href="/wiki/ITC_Hotels" title="ITC Hotels">ITC Hotels</a>],
 [<a href="/wiki/Kamat_Hotels" title="Kamat Hotels">Kamat Hotels</a>,
  <a href="/wiki/Keys_Hotels" title="Keys Hotels">Keys Hotels</a>],
 [<a href="/wiki/The_Leela_Palaces,_Hotels_and_Resorts" title="The Leela Palaces, Hotels and Resorts">The Leela Palaces, Hotels and Resorts</a>,
  <a href="/wiki/Lemon_Tree_Hotels" title="Lemon Tree Hotels">Lemon Tree Hotels</a>],
 [<a href="/wiki/Neemrana_Hotels" title="Neemrana Hotels">Neemrana Hotels</a>],
 [<a hr

In [34]:
# Flatten the nested anchor lists and read each anchor's 'title' attribute
# (None when an anchor has no title).
Names2 = [anchor.get('title') for anchor_group in links for anchor in anchor_group]

In [35]:
# Drop missing/empty titles and keep only those that mention "Hotel".
final = [title for title in Names2 if title and "Hotel" in title]

In [36]:
# Display the titles that contain "Hotel", as filtered in the previous cell.
final

['Category:Oberoi Hotels & Resorts',
 'Category:Taj Hotels Resorts and Palaces',
 'FabHotels',
 'Ginger Hotels',
 'ITC Hotels',
 'Kamat Hotels',
 'Keys Hotels',
 'The Leela Palaces, Hotels and Resorts',
 'Lemon Tree Hotels',
 'Neemrana Hotels',
 'The Park Hotels',
 'Peppermint Hotels',
 'The Residency Group of Hotels',
 'Royal Orchid Hotels',
 'Sarovar Hotels & Resorts',
 'Sinclairs Hotels Limited',
 'Taj Hotels',
 'Treebo Hotels',
 'Category:Hotels in India',
 'Category:Hotel chains by country']

In [None]:
# Export the hotel names to a CSV file with pandas.

import pandas as pd

# Single-column frame; the column header is written, the row index is not.
data = pd.DataFrame({'Hotel name': final})
data.to_csv('Hotels.csv', header=True, index=False)

