#### Use requests to access a website and extract information

In [2]:
import requests

In [4]:
url = 'https://en.wikipedia.org/wiki/Global_Peace_Index'

#### Use request.get() method to establish connection to the webpage

In [6]:
page = requests.get(url)
page

<Response [200]>

#### Get the HTML content and convert it to a string for easier data extraction

In [8]:
p = str(page.content)

In [10]:
print(p)

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>Global Peace Index - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled 

#### Regular expression package in Python

In [25]:
import re

In [None]:
# According to HTML structure, the country name is in the tag
# <td><span class="flagicon"><img src="flag.png"></span><a href="/wiki/Iceland" title="Iceland">Iceland</a></td>

In [97]:
pattern_country = r'<td><span class="flagicon">.*?<a href="/wiki.*?">(.*?)</a>'

In [99]:
#r': keeps everything inside the string
# <td><span class="flagicon"> : because country begin from <td><span class="flagicon">
# .*? : lazy match 
# <a href="/wiki.*?"> : link Wikipedia to country 
# (.*?): Return only this part of the string, this is country name that we need and stop immediately once the </a> occured
# Based on slide 47 in the powerpoint 

In [101]:
match_country = re.findall(pattern_country, p, re.S | re.I)

In [103]:
for i in match_country:
    print(i)

Iceland
Ireland
Austria
New Zealand
Singapore
Switzerland
Portugal
Denmark
Slovenia
Malaysia
Canada
Czech Republic
Finland
Hungary
Croatia
Belgium
Japan
Netherlands
Australia
Germany
Bhutan
Mauritius
Spain
Estonia
Kuwait
Bulgaria
Slovakia
Norway
Qatar
Latvia
Lithuania
Poland
Italy
United Kingdom
Montenegro
Romania
Oman
North Macedonia
Sweden
Greece
Vietnam
Albania
Taiwan
Madagascar
Mongolia
South Korea
Argentina
Indonesia
Laos
Botswana
East Timor
Uruguay
United Arab Emirates
Serbia
Ghana
Kosovo
Zambia
Costa Rica
Kazakhstan
Uzbekistan
Bosnia and Herzegovina
Namibia
Moldova
Chile
Tanzania
Sierra Leone
Jordan
Bolivia
Liberia
Cambodia
Tajikistan
Angola
Paraguay
Tunisia
Thailand
Armenia
Kyrgyzstan
Morocco
Malawi
Nepal
Bahrain
The Gambia
Turkmenistan
Senegal
Guinea-Bissau
France
Trinidad and Tobago
China
Cyprus
Algeria
Jamaica
Rwanda
Bangladesh
Equatorial Guinea
Mauritania
Panama
Dominican Republic
Cuba
Peru
Georgia
Sri Lanka
Saudi Arabia
Eswatini
Philippines
Egypt
Azerbaijan
El Salvador
Moz

In [105]:
# According to HTML structure, the score name is in the tag
# <td style="background:#00847f; color:#ffffff;">1.303</td>

In [167]:
pattern_score = r'<td style="background:.*?">(.*?)</td>'

In [169]:
# r': keeps everything inside the string
# <td style="background: because score begin from <td style="background
# .*? : lazy match 
# (.*?): get the string inside , the score that we need 
# </td>: the end the tag 

In [171]:
match_score = re.findall(pattern_score, p, re.S | re.I)

In [193]:
for e in match_score:
    print(e)

1.112\n
1.303\n
1.313\n
1.323\n
1.339\n
1.35\n
1.372\n
1.382\n
1.395\n
1.427\n
1.449\n
1.459\n
1.474\n
1.502\n
1.504\n
1.51\n
1.525\n
1.527\n
1.536\n
1.542\n
1.564\n
1.577\n
1.597\n
1.615\n
1.622\n
1.629\n
1.634\n
1.638\n
1.656\n
1.661\n
1.672\n
1.678\n
1.692\n
1.703\n
1.746\n
1.755\n
1.761\n
1.764\n
1.782\n
1.793\n
1.802\n
1.809\n
1.818\n
1.838\n
1.845\n
1.848\n
1.855\n
1.857\n
1.861\n
1.863\n
1.882\n
1.893\n
1.897\n
1.93\n
1.938\n
1.945\n
1.948\n
1.95\n
1.954\n
1.957\n
1.961\n
1.972\n
1.976\n
1.978\n
1.987\n
1.993\n
1.998\n
2.009\n
2.025\n
2.028\n
2.035\n
2.043\n
2.044\n
2.044\n
2.048\n
2.052\n
2.053\n
2.054\n
2.063\n
2.069\n
2.072\n
2.079\n
2.079\n
2.084\n
2.085\n
2.088\n
2.092\n
2.101\n
2.101\n
2.11\n
2.119\n
2.12\n
2.126\n
2.132\n
2.136\n
2.14\n
2.157\n
2.16\n
2.179\n
2.195\n
2.195\n
2.206\n
2.209\n
2.21\n
2.212\n
2.248\n
2.25\n
2.25\n
2.255\n
2.261\n
2.286\n
2.291\n
2.295\n
2.306\n
2.315\n
2.319\n
2.332\n
2.372\n
2.374\n
2.381\n
2.396\n
2.409\n
2.415\n
2.423\n
2.461\n
2.477\n
2.5

#### Print dictionary with {country: score} structure.

In [223]:
file_dict = {}           # dictionary 
for i in range(len(match_country)):
    file_dict[match_country[i]] = match_score[i]  # store country as key and score as value
print(file_dict)

{'Iceland': '1.112\\n', 'Ireland': '1.303\\n', 'Austria': '1.313\\n', 'New Zealand': '1.323\\n', 'Singapore': '1.339\\n', 'Switzerland': '1.35\\n', 'Portugal': '1.372\\n', 'Denmark': '1.382\\n', 'Slovenia': '1.395\\n', 'Malaysia': '1.427\\n', 'Canada': '1.449\\n', 'Czech Republic': '1.459\\n', 'Finland': '1.474\\n', 'Hungary': '1.502\\n', 'Croatia': '1.504\\n', 'Belgium': '1.51\\n', 'Japan': '1.525\\n', 'Netherlands': '1.527\\n', 'Australia': '1.536\\n', 'Germany': '1.542\\n', 'Bhutan': '1.564\\n', 'Mauritius': '1.577\\n', 'Spain': '1.597\\n', 'Estonia': '1.615\\n', 'Kuwait': '1.622\\n', 'Bulgaria': '1.629\\n', 'Slovakia': '1.634\\n', 'Norway': '1.638\\n', 'Qatar': '1.656\\n', 'Latvia': '1.661\\n', 'Lithuania': '1.672\\n', 'Poland': '1.678\\n', 'Italy': '1.692\\n', 'United Kingdom': '1.703\\n', 'Montenegro': '1.746\\n', 'Romania': '1.755\\n', 'Oman': '1.761\\n', 'North Macedonia': '1.764\\n', 'Sweden': '1.782\\n', 'Greece': '1.793\\n', 'Vietnam': '1.802\\n', 'Albania': '1.809\\n', 'Tai

#### Create the CSV file 

In [235]:
file = open('global_peace_index.csv', 'w')    # open a file and write 
    
for i in range(len(match_country)):
    line = match_country[i] + ',' + match_score[i] + '\n'     # create a line with country and score
    file.writelines(line)  
    print(line.strip()) 
file.close()

Iceland,1.112\n
Ireland,1.303\n
Austria,1.313\n
New Zealand,1.323\n
Singapore,1.339\n
Switzerland,1.35\n
Portugal,1.372\n
Denmark,1.382\n
Slovenia,1.395\n
Malaysia,1.427\n
Canada,1.449\n
Czech Republic,1.459\n
Finland,1.474\n
Hungary,1.502\n
Croatia,1.504\n
Belgium,1.51\n
Japan,1.525\n
Netherlands,1.527\n
Australia,1.536\n
Germany,1.542\n
Bhutan,1.564\n
Mauritius,1.577\n
Spain,1.597\n
Estonia,1.615\n
Kuwait,1.622\n
Bulgaria,1.629\n
Slovakia,1.634\n
Norway,1.638\n
Qatar,1.656\n
Latvia,1.661\n
Lithuania,1.672\n
Poland,1.678\n
Italy,1.692\n
United Kingdom,1.703\n
Montenegro,1.746\n
Romania,1.755\n
Oman,1.761\n
North Macedonia,1.764\n
Sweden,1.782\n
Greece,1.793\n
Vietnam,1.802\n
Albania,1.809\n
Taiwan,1.818\n
Madagascar,1.838\n
Mongolia,1.845\n
South Korea,1.848\n
Argentina,1.855\n
Indonesia,1.857\n
Laos,1.861\n
Botswana,1.863\n
East Timor,1.882\n
Uruguay,1.893\n
United Arab Emirates,1.897\n
Serbia,1.93\n
Ghana,1.938\n
Kosovo,1.945\n
Zambia,1.948\n
Costa Rica,1.95\n
Kazakhstan,1.954\n
Uzb