# Extract information with web scraping


In [1]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

1. International Body Mass Index Data

In [2]:
# Website URL to scrape
url = "https://en.wikipedia.org/wiki/List_of_sovereign_states_by_body_mass_index"

# Get the content of the web page. The timeout keeps the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() stops here on an
# HTTP error instead of going on to parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find the first table on the page that carries the "wikitable" class
table = soup.find("table", {"class": "wikitable"})
if table is None:
    # Fail with a message that names the page rather than a generic parser error
    raise ValueError(f"No 'wikitable' table found at {url}")

# Read the table into a DataFrame. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas.
df_bmi = pd.read_html(StringIO(str(table)))[0]

# Sort the DataFrame by the "Country" column in alphabetical order
df_bmi = df_bmi.sort_values(by='Country')


df_bmi

Unnamed: 0,Country,Both,Male,Female
183,Afghanistan *,21.6,21.5,21.8
88,Albania *,26.1,26.6,25.6
87,Algeria *,26.2,25.5,27.0
33,Andorra *,27.5,27.8,27.1
138,Angola *,24.1,23.5,24.6
...,...,...,...,...
46,Venezuela *,27.2,27.4,27.1
182,Vietnam *,21.6,21.5,21.6
102,Yemen *,25.8,25.2,26.5
165,Zambia *,22.6,21.5,23.8


In [3]:
# Remove the "*" footnote marker that follows each country name.
# regex=False treats the asterisk as a literal character regardless of the
# installed pandas version's default; strip() removes the space that was left
# between the name and the marker.
df_bmi['Country'] = df_bmi['Country'].str.replace('*', '', regex=False).str.strip()

# Clean data: treat the "—" placeholder as missing, then drop incomplete rows.
# dropna() returns a new DataFrame, so its result must be assigned back --
# calling it without assignment leaves the rows in place.
df_bmi = df_bmi.replace('—', np.nan)
df_bmi = df_bmi.dropna()

# Reset the index after sorting and row removal so it runs 0..n-1 without gaps
df_bmi = df_bmi.reset_index(drop=True)

# Remove the "Male" and "Female" columns; only the combined value is kept
df_bmi = df_bmi.drop(columns=['Male', 'Female'])

# Rename "Both" to "BMI"
df_bmi = df_bmi.rename(columns={'Both': 'BMI'})


# Display the sorted and cleaned DataFrame
df_bmi

Unnamed: 0,Country,BMI
0,Afghanistan,21.6
1,Albania,26.1
2,Algeria,26.2
3,Andorra,27.5
4,Angola,24.1
...,...,...
191,Venezuela,27.2
192,Vietnam,21.6
193,Yemen,25.8
194,Zambia,22.6


In [4]:
# Write the cleaned country table to "States_BMI_Data.csv" in the working
# directory; index=False leaves the row index out of the file.
df_bmi.to_csv(path_or_buf='States_BMI_Data.csv', index=False)


2. Body mass index

In [5]:

# Make a GET request to the Wikipedia page. The timeout keeps the cell from
# hanging indefinitely on a stalled connection, and raise_for_status() stops
# here on an HTTP error instead of going on to parse an error page.
url = "https://en.wikipedia.org/wiki/Body_mass_index"
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, "html.parser")

# Find the table containing the data. A multi-word class string is matched
# against the exact value of the class attribute, so this depends on the page
# keeping that attribute as "wikitable plainrowheaders".
table = soup.find("table", {"class": "wikitable plainrowheaders"})
if table is None:
    # Fail with a message that names the page rather than a generic parser error
    raise ValueError(f"No 'wikitable plainrowheaders' table found at {url}")

# Read the table into a DataFrame. The HTML is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas.
bmi = pd.read_html(StringIO(str(table)))[0]


In [6]:
# Shorten the verbose category labels from the scraped table. Values in the
# "Category" column that exactly match a key are replaced by the paired label;
# any other value is left unchanged.
bmi['Category'] = bmi['Category'].replace({
    'Underweight (Severe thinness)': 'Severe thinness',
    'Underweight (Moderate thinness)': 'Moderate thinness',
    'Underweight (Mild thinness)': 'Underweight',
    'Normal range': 'Normal',
    'Overweight (Pre-obese)': 'Overweight',
    'Obese (Class I)': 'Obese I',
    'Obese (Class II)': 'Obese II',
    'Obese (Class III)': 'Obese III',
})
bmi

Unnamed: 0,Category,BMI (kg/m2)[c],BMI Prime[c]
0,Severe thinness,< 16.0,< 0.64
1,Moderate thinness,16.0 – 16.9,0.64 – 0.67
2,Underweight,17.0 – 18.4,0.68 – 0.73
3,Normal,18.5 – 24.9,0.74 – 0.99
4,Overweight,25.0 – 29.9,1.00 – 1.19
5,Obese I,30.0 – 34.9,1.20 – 1.39
6,Obese II,35.0 – 39.9,1.40 – 1.59
7,Obese III,≥ 40.0,≥ 1.60


In [7]:
# Normalise the column names in one pass: lowercase them, trim leading and
# trailing whitespace, then turn each remaining space into an underscore
# (regex=False keeps the space a literal character).
bmi.columns = (
    bmi.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_", regex=False)
)
bmi

Unnamed: 0,category,bmi_(kg/m2)[c],bmi_prime[c]
0,Severe thinness,< 16.0,< 0.64
1,Moderate thinness,16.0 – 16.9,0.64 – 0.67
2,Underweight,17.0 – 18.4,0.68 – 0.73
3,Normal,18.5 – 24.9,0.74 – 0.99
4,Overweight,25.0 – 29.9,1.00 – 1.19
5,Obese I,30.0 – 34.9,1.20 – 1.39
6,Obese II,35.0 – 39.9,1.40 – 1.59
7,Obese III,≥ 40.0,≥ 1.60


In [8]:
# Add a 1-based identifier sized from the table itself rather than a hard-coded
# row count, so a change in the number of scraped categories does not raise a
# length-mismatch error.
bmi['id_bmi'] = range(1, len(bmi) + 1)

# Move the identifier to the front and keep the other columns in their existing
# order. Selecting the columns that are actually present avoids the all-NaN
# columns a reindex on hard-coded names produces if a scraped header changes.
bmi = bmi[['id_bmi'] + [col for col in bmi.columns if col != 'id_bmi']]
bmi

Unnamed: 0,id_bmi,category,bmi_(kg/m2)[c],bmi_prime[c]
0,1,Severe thinness,< 16.0,< 0.64
1,2,Moderate thinness,16.0 – 16.9,0.64 – 0.67
2,3,Underweight,17.0 – 18.4,0.68 – 0.73
3,4,Normal,18.5 – 24.9,0.74 – 0.99
4,5,Overweight,25.0 – 29.9,1.00 – 1.19
5,6,Obese I,30.0 – 34.9,1.20 – 1.39
6,7,Obese II,35.0 – 39.9,1.40 – 1.59
7,8,Obese III,≥ 40.0,≥ 1.60


In [9]:
# Write the BMI category table to "Body_Mass_Index_Data.csv" in the working
# directory; index=False leaves the row index out of the file.
bmi.to_csv(path_or_buf='Body_Mass_Index_Data.csv', index=False)
