# Web Scraping Project using Python

## Web scraping of The World's Billionaires - 2025 data from Wikipedia

### Importing Packages

In [None]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup

### Fetching website data

In [None]:
# Page that holds the billionaire ranking tables.
url = 'https://en.wikipedia.org/wiki/The_World%27s_Billionaires'

# Browser-like User-Agent sent along with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

# requests has no default timeout; without one a stalled connection would
# block this cell indefinitely.
page = requests.get(url, headers = headers, timeout = 30)

print(page)

# Fail loudly on a 4xx/5xx answer instead of parsing an error page and
# breaking later cells with a confusing message.
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature resolves to whichever
# parser happens to be installed, so the resulting tree could differ between
# machines. 'html.parser' ships with Python.
soup = BeautifulSoup(page.text, 'html.parser')

print(soup.prettify())


<Response [200]>
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 skin-theme-clientpref-day vector-sticky-header-enabled wp25eastereggs-enable-clientpref-1 vector-toc-available skin-theme-clientpref-thumb-standard" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   The World's Billionaires - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature

### Taking the ranking table from the website

In [None]:
# Position of the ranking table among all <table> elements of the page.
# NOTE(review): this index depends on the current page layout - re-check it if
# the printed table below is not the ranking table.
RANKING_TABLE_INDEX = 2

all_tables = soup.find_all('table')

# Give a clear message instead of a bare "list index out of range" when the
# page contains fewer tables than expected.
if len(all_tables) <= RANKING_TABLE_INDEX:
    raise IndexError(
        f"Expected at least {RANKING_TABLE_INDEX + 1} tables on the page, "
        f"found {len(all_tables)}."
    )

table = all_tables[RANKING_TABLE_INDEX]

print(table.prettify())

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    No.
   </th>
   <th>
    Name
   </th>
   <th>
    <a href="/wiki/Net_worth" title="Net worth">
     Net worth
    </a>
    (
    <a href="/wiki/United_States_dollar" title="United States dollar">
     USD
    </a>
    )
   </th>
   <th>
    Age
   </th>
   <th>
    <a href="/wiki/Citizenship" title="Citizenship">
     Nationality
    </a>
   </th>
   <th>
    Primary source(s) of wealth
   </th>
  </tr>
  <tr>
   <td style="text-align:center;">
    <span data-sort-value="7000100000000000000♠">
     1
    </span>
    <span typeof="mw:File">
     <span title="Increase">
      <img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/20px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/40px-Increase2.svg.png 2x" width="11"/>
     </span>
    </span>
   

### Extracting the Column Titles

In [None]:
# Gather every <th> element of the selected table; these are still Tag
# objects, not plain strings.
Table_Headers = table.find_all(name='th')
print(Table_Headers)

[<th>No.
</th>, <th>Name
</th>, <th><a href="/wiki/Net_worth" title="Net worth">Net worth</a> (<a href="/wiki/United_States_dollar" title="United States dollar">USD</a>)
</th>, <th>Age
</th>, <th><a href="/wiki/Citizenship" title="Citizenship">Nationality</a>
</th>, <th>Primary source(s) of wealth
</th>]


### Cleaning the Column Titles

In [None]:
# Read the header cells straight from `table` instead of from Table_Headers
# itself: after the first run Table_Headers holds plain strings, so iterating
# over it again would fail on `.text`. This way the cell can be re-run safely.
Table_Headers = [header_cell.text.strip() for header_cell in table.find_all('th')]

print(Table_Headers)

['No.', 'Name', 'Net worth (USD)', 'Age', 'Nationality', 'Primary source(s) of wealth']


### Importing Pandas & assigning Table_Headers as column Titles

In [None]:
import pandas as pd

# Start from an empty frame (no rows yet) whose columns are the cleaned
# header titles.
df = pd.DataFrame(data = None, columns = Table_Headers)

df

Unnamed: 0,No.,Name,Net worth (USD),Age,Nationality,Primary source(s) of wealth


### Fetching all the table rows

In [None]:
# Every <tr> of the table, header row included (it is the first element).
Column_Data = table.find_all(name='tr')

print(Column_Data)

[<tr>
<th>No.
</th>
<th>Name
</th>
<th><a href="/wiki/Net_worth" title="Net worth">Net worth</a> (<a href="/wiki/United_States_dollar" title="United States dollar">USD</a>)
</th>
<th>Age
</th>
<th><a href="/wiki/Citizenship" title="Citizenship">Nationality</a>
</th>
<th>Primary source(s) of wealth
</th></tr>, <tr>
<td style="text-align:center;"><span data-sort-value="7000100000000000000♠">1</span> <span typeof="mw:File"><span title="Increase"><img alt="Increase" class="mw-file-element" data-file-height="300" data-file-width="300" decoding="async" height="11" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/20px-Increase2.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/b0/Increase2.svg/40px-Increase2.svg.png 2x" width="11"/></span></span></td>
<td><span data-sort-value="Musk, Elon"><span class="vcard"><span class="fn"><a href="/wiki/Elon_Musk" title="Elon Musk">Elon Musk</a></span></span></span></td>
<td style="text-align:center;">$342 billion <sp

### Adding Individual rows into df dataframe

In [None]:
# Collect the cleaned cell texts of every data row first and build the frame
# once at the end. Appending with df.loc[len(df)] grows the frame row by row
# and duplicates every row each time the cell is re-run; rebuilding df from
# scratch keeps the cell idempotent.
expected_width = len(df.columns)
scraped_rows = []

for row in Column_Data[1:]:  # element 0 is the header row
    cell_texts = [cell.text.strip() for cell in row.find_all('td')]

    # Rows without any <td> carry no data.
    if not cell_texts:
        continue

    # A row whose width differs from the header cannot be placed in the frame;
    # report which row is affected instead of failing with a generic error.
    if len(cell_texts) != expected_width:
        raise ValueError(
            f"Row {len(scraped_rows) + 1} has {len(cell_texts)} cells, "
            f"expected {expected_width}: {cell_texts}"
        )

    scraped_rows.append(cell_texts)

df = pd.DataFrame(scraped_rows, columns = df.columns)

df

Unnamed: 0,No.,Name,Net worth (USD),Age,Nationality,Primary source(s) of wealth
0,1,Elon Musk,$342 billion,53,South Africa Canada United States,Tesla and SpaceX
1,2,Mark Zuckerberg[nb 1],$216 billion,40,United States,Meta Platforms
2,3,Jeff Bezos,$215 billion,61,United States,Amazon
3,4,Larry Ellison,$192 billion,80,United States,Oracle Corporation
4,5,Bernard Arnault & family,$178 billion,76,France,LVMH
5,6,Warren Buffett,$154 billion,94,United States,Berkshire Hathaway
6,7,Larry Page,$144 billion,52,United States,Google
7,8,Sergey Brin,$138 billion,51,United States,Google
8,9,Amancio Ortega,$124 billion,89,Spain,"Inditex, Zara"
9,10,Steve Ballmer,$118 billion,69,United States,Microsoft


### Saving the DataFrame in CSV format in a specified folder path

In [None]:
# Preferred destination of the export.
csv_path = Path(r'C:\Users\sasikanthchowdhary.n\Desktop\SASI\DATA ANALYST BOOTCAMP PORTFOLIO PROJECTS\Web Scrapping Project using Python.csv')

# The folder above only exists on one particular machine. When it is missing,
# write the file (same name) into the current working directory instead of
# failing, and say so.
if not csv_path.parent.is_dir():
    csv_path = Path(csv_path.name)
    print(f'Export folder not found; writing to {csv_path.resolve()} instead.')

# index = False leaves the DataFrame index out of the file.
df.to_csv(csv_path, index = False)

print('DataFrame is exported to CSV successfully.')



DataFrame is exported to CSV successfully.
