In [1]:
import pandas as pd
import sqlite3
import re

##### Scrape the Wikipedia list of national capitals and load it into a pandas DataFrame

In [2]:
# Wikipedia page that is scraped for the list of national capitals.
url = 'https://en.wikipedia.org/wiki/List_of_national_capitals'
# Parse the HTML tables found on the page (requires network access).
list_of_tables = pd.read_html(url)
# Keep the second parsed table (index 1).
# NOTE(review): the selection is purely positional, so it depends on the
# current page layout -- confirm that index 1 is still the capitals table.
capitals_df = list_of_tables[1]

##### Clean the column names and split parenthetical annotations out of the city names

In [4]:
def _split_city_annotation(value):
    """Split a raw city cell into ``(city, extra_info)``.

    A cell such as ``'Aden (de facto, temporary)'`` becomes
    ``('Aden', 'de facto, temporary')``: the city is the stripped text before
    the first ``'('`` and the extra info is everything after that parenthesis
    with the final character (the closing ``')'``) dropped.

    Cells without a parenthesis -- or whose very first character is ``'('`` --
    are returned unchanged together with an empty extra-info string, so
    missing values pass through untouched.
    """
    text = str(value)
    paren_at = text.find('(')
    if paren_at <= 0:
        return value, ''
    # Plain slicing replaces the earlier regex: no escape-sequence pitfalls,
    # a single scan per cell, and no failure when the prefix contains a newline.
    return text[:paren_at].strip(), text[paren_at + 1:-1]


# Normalise the scraped headers in one pass; labels that are not present are
# ignored by rename, so the call is safe to repeat.
capitals_df = capitals_df.rename(columns={
    'City/Town': 'city',
    'Country/Territory': 'country',
    'Notes': 'notes',
})

# Split every city cell once, then write both derived columns from that result.
city_parts = [_split_city_annotation(value) for value in capitals_df['city']]
capitals_df['city_extra_info'] = [extra for _, extra in city_parts]
capitals_df['city'] = [name for name, _ in city_parts]


In [5]:
# Display the cleaned frame to check the renamed columns and the new
# city_extra_info column.
capitals_df

Unnamed: 0,city,country,notes,city_extra_info
0,Abidjan,Ivory Coast (Côte d'Ivoire),Abidjan is the largest city in Ivory Coast and...,former capital; still hosts some government of...
1,Yamoussoukro,Ivory Coast (Côte d'Ivoire),Abidjan is the largest city in Ivory Coast and...,de jure
2,Abu Dhabi,United Arab Emirates,,
3,Abuja,Nigeria,Lagos was the capital from 1914 to 1991.,
4,Accra,Ghana,,
...,...,...,...,...
255,Windhoek,Namibia,,
256,Yaoundé,Cameroon,,
257,Yaren,Nauru,"Nauru has no official capital; however, the go...",de facto
258,Yerevan,Armenia,,


In [6]:
# Persist the cleaned frame as a UTF-16 CSV file (header row, no index column).
capitals_df.to_csv(
    'capitals.csv',
    encoding='utf-16',
    index=False,
    header=True,
)

# Load the file straight back; this re-read frame is what later cells use.
capitals_csv = pd.read_csv('capitals.csv', encoding='utf-16')

##### Create a SQLite database

In [7]:
# Open (or create) the on-disk SQLite database file used by the cells below.
conn = sqlite3.connect('capitals.db')
# Ask the connection for its cursor instead of instantiating sqlite3.Cursor
# directly: this is the documented factory and it respects any custom cursor
# class configured on the connection.
cc_cursor = conn.cursor()


In [8]:
# SQL that removes any existing `capitals` table so the notebook can be re-run
# from a clean state.
drop_table = 'DROP TABLE IF EXISTS capitals'

In [9]:
# Run the DROP statement; the cursor object returned by execute() is what the
# cell displays.
cc_cursor.execute(drop_table)

<sqlite3.Cursor at 0x1fd55534bc0>

In [10]:
# DDL for the `capitals` table: four character columns whose names match the
# columns of the cleaned DataFrame (city, city_extra_info, country, notes).
create_table = '''CREATE TABLE capitals (
                    city VARCHAR(250),
                    city_extra_info VARCHAR(250),
                    country VARCHAR(250),
                    notes VARCHAR(500)
)'''

In [11]:
# Create the empty `capitals` table from the DDL string defined above in this
# notebook.
cc_cursor.execute(create_table)

<sqlite3.Cursor at 0x1fd55534bc0>

In [12]:
# Append the CSV-backed rows to the existing `capitals` table; the DataFrame
# index is not written as a column.
capitals_csv.to_sql(
    name='capitals',
    con=conn,
    if_exists='append',
    index=False,
)

260

In [13]:
# Read the first 20 rows back from SQLite as a sanity check of the load.
view_snapshot = pd.read_sql(
    sql='SELECT * FROM capitals LIMIT 20',
    con=conn,
)

In [14]:
# Display the 20-row snapshot read back from the database.
view_snapshot

Unnamed: 0,city,city_extra_info,country,notes
0,Abidjan,former capital; still hosts some government of...,Ivory Coast (Côte d'Ivoire),Abidjan is the largest city in Ivory Coast and...
1,Yamoussoukro,de jure,Ivory Coast (Côte d'Ivoire),Abidjan is the largest city in Ivory Coast and...
2,Abu Dhabi,,United Arab Emirates,
3,Abuja,,Nigeria,Lagos was the capital from 1914 to 1991.
4,Accra,,Ghana,
5,Adamstown,,Pitcairn Islands,British Overseas Territory.
6,Addis Ababa,,Ethiopia,
7,Aden,"de facto, temporary",Yemen,"Due to the Yemeni civil war (2014–present), Sa..."
8,Sanaa,de jure,Yemen,"Due to the Yemeni civil war (2014–present), Sa..."
9,Algiers,,Algeria,


In [23]:
# List every capital belonging to a country that appears on more than one row
# of the `capitals` table. The subquery groups by country and keeps the groups
# with count(*) > 1; the outer query returns all rows for those countries.
# NOTE(review): despite the variable name ("more than 2"), the HAVING clause
# selects countries with two or more capitals.
more_than_2_capitals = pd.read_sql( '''SELECT country,city,city_extra_info FROM capitals
                                    WHERE country IN (SELECT country
                                                    FROM capitals 
                                                    GROUP BY country
                                                    HAVING count(*) > 1)
                          ''',conn)

In [24]:
# Display the countries that have several capitals, one row per capital.
more_than_2_capitals

Unnamed: 0,country,city,city_extra_info
0,Ivory Coast (Côte d'Ivoire),Abidjan,former capital; still hosts some government of...
1,Ivory Coast (Côte d'Ivoire),Yamoussoukro,de jure
2,Yemen,Aden,"de facto, temporary"
3,Yemen,Sanaa,de jure
4,Netherlands,Amsterdam,official
5,Netherlands,The Hague,de facto
6,South Africa,Bloemfontein,judicial
7,South Africa,Cape Town,legislative
8,South Africa,Pretoria,executive
9,Montserrat,Brades,de facto


In [None]:
# Close the SQLite connection opened earlier in the notebook; `conn` and
# `cc_cursor` cannot be used for queries after this point.
conn.close()