<a href="https://colab.research.google.com/github/rahulbhoyar1995/web-scraping-projects/blob/main/project_1_scraping_website_tables_data_using_bs4/1_basic_scraping_using_bs4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Author : Rahul Bhoyar

We will do the basic scraping and explore what are the different tags that are present.

For scraping purpose we will take the link :


https://en.wikipedia.org/wiki/List_of_countries_by_past_and_projected_GDP_(nominal)"

In [2]:
URL  = "https://en.wikipedia.org/wiki/List_of_countries_by_past_and_projected_GDP_(nominal)"

In [3]:
import requests
from bs4 import BeautifulSoup
import re

def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


url = "https://en.wikipedia.org/wiki/List_of_countries_by_past_and_projected_GDP_(nominal)"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

     # Extracting paragraphs
    titles = [clean_text(title.text) for title in soup.find_all('title')]
    print(f"Titles: {titles}")
    print("-"*200)

    # Extracting paragraphs
    paragraphs = [clean_text(paragraph.text) for paragraph in soup.find_all('p')]
    print(f"Paragraphs: {paragraphs}")
    print("Total number of paragraphs :", len(paragraphs))
    print("-"*200)

    # Extracting categories
    categories = [clean_text(category.text) for category in soup.find_all('span', {'class': 'mw-headline'})]
    print(f"Categories: {categories}")
    print("-"*200)

    # Extracting references
    references = [clean_text(reference.text) for reference in soup.find_all('span', {'class': 'reference-text'})]
    print(f"References: {references}")
    print("-"*200)

    # Extracting images
    images = [clean_text(image['src']) for image in soup.find_all('img')]
    print(f"Images: {images}")
    print("-"*200)

    # Extracting table data (if available)
    tables = soup.find_all('table')
    for table in tables:
        table_data = [[clean_text(td.text.strip()) for td in row.find_all(['th', 'td'])] for row in table.find_all('tr')]
        print(f"Table Data: {table_data}")
    print("-"*200)

    # Extracting external links
    external_links = [clean_text(link['href']) for link in soup.find_all('a', {'class': 'external text'})]
    print(f"External Links: {external_links}")
    print("-"*200)

    # Extracting see also section
    see_also = [clean_text(link.text) for link in soup.find_all('div', {'class': 'div-col columns column-width'})]
    print(f"See Also: {see_also}")
    print("-"*200)

    # Extracting revision history
    revision_history = [clean_text(revision.text) for revision in soup.find_all('li', {'class': 'history-changed'})]
    print(f"Revision History: {revision_history}")
    print("-"*200)
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Titles: ['List of countries by past and projected GDP nominal Wikipedia']
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Paragraphs: ['', 'This is an alphabetical list of countries by past and projected gross domestic product nominal as ranked by the IMF Figures are based on official exchange rates not on the purchasing power parity PPP methodology Values are given in millions of United States dollars USD and have not been adjusted for inflation These figures have been taken from the International Monetary Funds World Economic Outlook WEO Database October 2023 edition andor other sources1', 'For older GDP trends see List of regions by past GDP PPP', 'The following Table is based on UN GDP data23', 'indicates GDP of country or territory or Economy of country or territory links', 'The following list contains the various countries projec

We have displayed all the data.

Let's store the data from tables into Pandas Dataframe.

In [4]:
print("Total number of tables are :", len(tables))


Total number of tables are : 12


Table 1: UN estimates between 1970 and 1979

In [None]:
import pandas as pd

table_1 = tables[0]
table_data = [[clean_text(td.text.strip()) for td in row.find_all(['th', 'td'])] for row in table_1.find_all('tr')]

column_names = table_data[0]
df = pd.DataFrame(table_data[1:], columns=column_names)
df

Unnamed: 0,Country or area,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979
0,Afghanistan,1749,1831,1596,1733,2156,2367,2556,2953,3300,3698
1,Albania,2266,2331,2398,2467,2537,2610,2686,2761,2842,2372
2,Algeria,5167,5376,7193,9250,13290,15591,17790,21038,26433,33276
3,Andorra,99,113,144,191,236,279,288,321,390,521
4,Angola,3807,4007,4102,5016,5627,4147,3981,4344,4845,5380
...,...,...,...,...,...,...,...,...,...,...,...
213,South Yemen,154,142,159,130,138,141,183,226,272,293
214,Yugoslavia,14554,15802,16485,21472,29706,33279,37563,45673,54338,68198
215,Zambia,1544,1574,1852,2350,2895,2658,2814,2767,3098,3827
216,Zanzibar,,,,,,,,,,


Table_2 : IMF estimates between 1970 and 1979

In [6]:
import pandas as pd

table_2 = tables[1]
table_data = [[clean_text(td.text.strip()) for td in row.find_all(['th', 'td'])] for row in table_2.find_all('tr')]

column_names = table_data[0]
df = pd.DataFrame(table_data[1:], columns=column_names)
df

Unnamed: 0,Country or dependent territory,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989
0,Afghanistan,,,,,,,,,,
1,Albania,1946,2229,2296,2319,2290,2339,2587,2566,2530,2779
2,Algeria,42346,44372,44780,47529,51513,61132,61535,63300,51664,52558
3,Andorra,,,,,,,,,,
4,Angola,6639,6214,6214,6476,6864,8457,7918,9050,9818,11421
...,...,...,...,...,...,...,...,...,...,...,...
191,Vietnam,35357,17617,23369,35204,61171,19045,43009,53385,29501,7991
192,Palestine,,,,,,,,,,
193,Yemen,,,,,,,,,,
194,Zambia,4246,4385,4232,3653,3003,2848,1962,2431,4095,4365


Table_3 : IMF estimates between 1990 and 1999

In [5]:
import pandas as pd

table_3 = tables[2]
table_data = [[clean_text(td.text.strip()) for td in row.find_all(['th', 'td'])] for row in table_3.find_all('tr')]

column_names = table_data[0]
df = pd.DataFrame(table_data[1:], columns=column_names)
df

Unnamed: 0,Country or dependent territory,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,Afghanistan,,,,,,,,,,
1,Albania,2221,1333,843,1461,2361,2882,3200,2259,2560,3209
2,Algeria,61892,46670,49217,50963,42426,42066,46941,48178,48188,48845
3,Andorra,,,,,,,,,,
4,Angola,12571,12186,9395,6819,4965,6197,7994,9388,7958,7526
...,...,...,...,...,...,...,...,...,...,...,...
191,Vietnam,8217,9704,12528,16736,20712,26407,31352,34146,34580,36444
192,Palestine,,,,,2843,3283,3410,3760,4068,4271
193,Yemen,12644,14665,17959,21737,28019,12796,6496,6838,6322,7639
194,Zambia,4085,3690,3614,3549,3657,3799,3599,4303,3538,3405


Table_4 : IMF estimates between 2000 and 2009

In [None]:
import pandas as pd

table_4 = tables[3]
table_data = [[clean_text(td.text.strip()) for td in row.find_all(['th', 'td'])] for row in table_4.find_all('tr')]

column_names = table_data[0]
df = pd.DataFrame(table_data[1:], columns=column_names)
df

Unnamed: 0,Country or dependent territory,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009
0,Afghanistan,,,4367,4553,5146,6167,6925,8556,10297,12066
1,Albania,3483,3928,4348,5611,7185,8052,8896,10677,12881,12044
2,Algeria,54749,54745,56761,67864,85332,103198,117027,134977,171001,137211
3,Andorra,1429,1547,1758,2362,2896,3158,3456,3952,4082,3675
4,Angola,11166,10930,15286,17813,23552,36971,52381,65266,88539,70307
...,...,...,...,...,...,...,...,...,...,...,...
191,Vietnam,39585,41297,44563,50233,62877,73197,84301,98426,124756,129022
192,Palestine,4314,4004,3556,3968,4603,5126,5348,5816,7310,8086
193,Yemen,9679,9853,10693,11778,13868,16732,19063,21651,26911,25130
194,Zambia,3601,3870,4194,4902,6221,8329,12762,14060,17914,15332


Table 5 : : IMF estimates between 1980 and 1989

In [8]:
import pandas as pd

table_ = tables[4]
table_data = [[clean_text(td.text.strip()) for td in row.find_all(['th', 'td'])] for row in table_2.find_all('tr')]

column_names = table_data[0]
df = pd.DataFrame(table_data[1:], columns=column_names)
df

Unnamed: 0,Country or dependent territory,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989
0,Afghanistan,,,,,,,,,,
1,Albania,1946,2229,2296,2319,2290,2339,2587,2566,2530,2779
2,Algeria,42346,44372,44780,47529,51513,61132,61535,63300,51664,52558
3,Andorra,,,,,,,,,,
4,Angola,6639,6214,6214,6476,6864,8457,7918,9050,9818,11421
...,...,...,...,...,...,...,...,...,...,...,...
191,Vietnam,35357,17617,23369,35204,61171,19045,43009,53385,29501,7991
192,Palestine,,,,,,,,,,
193,Yemen,,,,,,,,,,
194,Zambia,4246,4385,4232,3653,3003,2848,1962,2431,4095,4365
