In [None]:
#import necessary packages
import requests 
from bs4 import BeautifulSoup

In [None]:
#Retrieve the html through request
url = "https://en.wikipedia.org/wiki/World_Happiness_Report"
#A timeout keeps the cell from hanging forever if the server never answers
page = requests.get(url, timeout=30)
#Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error page in later cells
page.raise_for_status()

In [None]:
#Display the raw response body (bytes) of the page fetched with requests above.
#Note: this shows the whole document, so the cell output can be very long.
page.content

In [None]:
#Parse the downloaded bytes with Python's built-in 'html.parser' backend
soup = BeautifulSoup(page.content, features='html.parser')
#.prettify() returns the parsed document as an indented, human-readable string
print(soup.prettify())

In [None]:
#Since we are scraping from a table, collect every <table> tag on the page.
#find_all is the current Beautiful Soup 4 method name; findAll is only the legacy spelling.
All_table = soup.find_all('table')
All_table

In [None]:
#Look at the raw html and count the <table> tags until you find the one we want.
#Indexing is zero-based, so All_table[9] is the tenth <table> in document order.
All_table[9]

In [None]:
#The table that we want has a class of 'wikitable sortable'
#use .find() to get the first <table> whose class attribute matches
#NOTE(review): this is the first such table on the page - confirm it is the same one as All_table[9]
table = soup.find('table', class_='wikitable sortable')
#.find() returns None when nothing matches; fail here with a clear message
#rather than with an AttributeError in the row-extraction cell below
if table is None:
    raise ValueError("No <table class='wikitable sortable'> found on the page")
table

In [None]:
#Creating lists for columns in the table
N_COLUMNS = 9        #A data row of the table has exactly 9 <td> cells
COUNTRY_COLUMN = 1   #Position of the country-name cell inside a row


def cell_text(cell, whole_cell=False):
    """Return the text of one <td> cell with trailing newlines removed.

    cell       -- a BeautifulSoup tag for a table cell.
    whole_cell -- when True, use .get_text(strip=True), which joins all text
                  in the cell with surrounding whitespace stripped. When False
                  (default), take only the first text node of the cell.

    A cell without any text node yields '' instead of raising AttributeError.
    """
    if whole_cell:
        return cell.get_text(strip=True).rstrip("\n")
    first_node = cell.find(string=True)   #'string=' is the current name of the legacy 'text=' argument
    if first_node is None:
        return ""
    return first_node.rstrip("\n")


#One list per table column, filled row by row
columns = [[] for _ in range(N_COLUMNS)]

for row in table.find_all('tr'):   #For each table row, run through each table data cell and get the text
    cells = row.find_all('td')
    if len(cells) != N_COLUMNS:    #Rows without exactly 9 <td> cells (e.g. presumably the header row) are skipped
        continue
    for position, (column, cell) in enumerate(zip(columns, cells)):
        #Taking the first text node of the country cell printed '\xa0' (suspected cause: the flag image
        #that comes first), so that one column reads the whole cell text to get only the country name.
        column.append(cell_text(cell, whole_cell=(position == COUNTRY_COLUMN)))

#Keep the original names, which the dataframe cell reads
L1, L2, L3, L4, L5, L6, L7, L8, L9 = columns



In [None]:
#Creating dataframe
import pandas as pd

#Pair every column heading with the list that was scraped for it
column_data = {
    'Overall rank': L1,
    'Country or region': L2,
    'Score': L3,
    'GDP per capita': L4,
    'Social support': L5,
    'Healthy life expectancy': L6,
    'Freedom to make life choices': L7,
    'Generosity': L8,
    'Perceptions of corruption': L9,
}

#Build the frame in one step, then make the overall rank the index,
#replacing the 0-based index pandas assigns automatically, which we don't need.
df = pd.DataFrame(column_data).set_index('Overall rank')
df

Unnamed: 0_level_0,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
Overall rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Finland,7.809,1.285,1.500,0.961,0.662,0.160,0.478
2,Denmark,7.646,1.327,1.503,0.979,0.665,0.243,0.495
3,Switzerland,7.560,1.391,1.472,1.041,0.629,0.269,0.408
4,Iceland,7.504,1.327,1.548,1.001,0.662,0.362,0.145
5,Norway,7.488,1.424,1.495,1.008,0.670,0.288,0.434
...,...,...,...,...,...,...,...,...
149,Central African Republic,3.476,0.041,0.000,0.000,0.293,0.254,0.028
150,Rwanda,3.312,0.343,0.523,0.572,0.604,0.236,0.486
151,Zimbabwe,3.299,0.426,1.048,0.375,0.377,0.151,0.081
152,South Sudan,2.817,0.289,0.553,0.209,0.066,0.210,0.111


In [None]:
#Show the dtype of every column. All of them are 'object' at this point
#because the scraped values were stored as strings.
df.dtypes


Country or region               object
Score                           object
GDP per capita                  object
Social support                  object
Healthy life expectancy         object
Freedom to make life choices    object
Generosity                      object
Perceptions of corruption       object
dtype: object

In [None]:
#Convert each numerical column to float64
#Every column except the country name holds numbers stored as text
numeric_cols = [
    'Score',
    'GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
]
#pd.to_numeric is applied to each selected column in turn
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)
df.dtypes
#Note we now have the numerical columns as float

Country or region                object
Score                           float64
GDP per capita                  float64
Social support                  float64
Healthy life expectancy         float64
Freedom to make life choices    float64
Generosity                      float64
Perceptions of corruption       float64
dtype: object

In [None]:
#Now that the numeric columns are floats, .describe() summarises them
#(count, mean, std, min, quartiles, max); the text column is left out.
df.describe()

Unnamed: 0,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
count,153.0,153.0,153.0,153.0,153.0,153.0,153.0
mean,5.473255,0.868778,1.155562,0.692882,0.463542,0.189366,0.130719
std,1.112288,0.372418,0.286862,0.254111,0.141139,0.100433,0.113087
min,2.567,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.724,0.576,0.987,0.495,0.381,0.115,0.056
50%,5.515,0.919,1.204,0.76,0.483,0.177,0.098
75%,6.228,1.169,1.387,0.867,0.577,0.256,0.163
max,7.809,1.537,1.548,1.138,0.693,0.57,0.533


In [None]:
#Export as csv: write the dataframe, including its 'Overall rank' index, to a file in the working directory
output_csv = 'World Happiness Report.csv'
df.to_csv(output_csv)