In [1]:
import pandas as pd

# Source page: CIA World Factbook field listing (field 274).
link1="https://www.cia.gov/library/publications/resources/the-world-factbook/fields/274.html"

# read_html fetches the page and returns a list of DataFrames, one per matching <table>.
# 'attrs' restricts the match to the table whose id is 'fieldListing';
# header=0 takes the first table row as the column names.
dataco2=pd.read_html(link1,header=0,attrs={'id': 'fieldListing'})

In [2]:
type(dataco2), len(dataco2) 

(list, 1)

In [3]:
cia=dataco2[0]

In [4]:
cia.head()

Unnamed: 0,Country,Carbon dioxide emissions from consumption of energy
0,Afghanistan,9.067 million Mt (2017 est.)
1,Albania,4.5 million Mt (2017 est.)
2,Algeria,135.9 million Mt (2017 est.)
3,American Samoa,"361,100 Mt (2017 est.)"
4,Angola,20.95 million Mt (2017 est.)


In [5]:
# current columns:
cia.columns

Index(['Country', 'Carbon dioxide emissions from consumption of energy'], dtype='object')

In [6]:
# mapping from the current (long) column labels to short names,
# built from (old, new) pairs:
OldToNew=dict([(cia.columns[0],'countries'),
               (cia.columns[1],'co2')])

In [12]:
# Apply the old->new mapping to the column labels.
# inplace=True mutates the existing object and returns None, so the frame
# stored in dataco2[0] (the same object that 'cia' refers to) is renamed too.
cia.rename(columns=OldToNew,inplace=True)

In [13]:
# current situation
cia.head()

Unnamed: 0,countries,co2
0,Afghanistan,9.067 million Mt (2017 est.)
1,Albania,4.5 million Mt (2017 est.)
2,Algeria,135.9 million Mt (2017 est.)
3,American Samoa,"361,100 Mt (2017 est.)"
4,Angola,20.95 million Mt (2017 est.)


In [14]:
cia.tail()

Unnamed: 0,countries,co2
211,Western Sahara,"268,400 Mt (2017 est.)"
212,World,33.62 billion Mt (2013 est.)
213,Yemen,13.68 million Mt (2017 est.)
214,Zambia,3.777 million Mt (2017 est.)
215,Zimbabwe,12.06 million Mt (2017 est.)


In [15]:
# remove leading/trailing whitespace from the country names
cia['countries']=cia['countries'].str.strip()

In [17]:
cia.countries

0         Afghanistan
1             Albania
2             Algeria
3      American Samoa
4              Angola
            ...      
211    Western Sahara
212             World
213             Yemen
214            Zambia
215          Zimbabwe
Name: countries, Length: 216, dtype: object

In [16]:
# first look (notice the blank space before Mt)

cia.co2.str.split(pat=' Mt')

0      [9.067 million,   (2017 est.)]
1        [4.5 million,   (2017 est.)]
2      [135.9 million,   (2017 est.)]
3            [361,100,   (2017 est.)]
4      [20.95 million,   (2017 est.)]
                    ...              
211          [268,400,   (2017 est.)]
212    [33.62 billion,   (2013 est.)]
213    [13.68 million,   (2017 est.)]
214    [3.777 million,   (2017 est.)]
215    [12.06 million,   (2017 est.)]
Name: co2, Length: 216, dtype: object

In [18]:
# improving first look: "expand" separates into columns

cia.co2.str.split(' Mt',expand=True)

Unnamed: 0,0,1
0,9.067 million,(2017 est.)
1,4.5 million,(2017 est.)
2,135.9 million,(2017 est.)
3,361100,(2017 est.)
4,20.95 million,(2017 est.)
...,...,...
211,268400,(2017 est.)
212,33.62 billion,(2013 est.)
213,13.68 million,(2017 est.)
214,3.777 million,(2017 est.)


In [19]:
# keeping the first element of the last result:

cia.co2.str.split(' Mt',expand=True)[0]

0      9.067 million
1        4.5 million
2      135.9 million
3            361,100
4      20.95 million
           ...      
211          268,400
212    33.62 billion
213    13.68 million
214    3.777 million
215    12.06 million
Name: 0, Length: 216, dtype: object

In [20]:
# The previous cells only displayed results; nothing in 'cia' was modified.
# Here the text before ' Mt' (column 0 of the expanded split) is stored for later use:

splitParts=cia.co2.str.split(' Mt',expand=True)
result1=splitParts.loc[:,0]

In [21]:
# assign can create or overwrite a column. Then, I use 'result1' here
cia=cia.assign(co2=result1)

In [24]:
# Current situation:
cia

Unnamed: 0,countries,co2
0,Afghanistan,9.067 million
1,Albania,4.5 million
2,Algeria,135.9 million
3,American Samoa,361100
4,Angola,20.95 million
...,...,...
211,Western Sahara,268400
212,World,33.62 billion
213,Yemen,13.68 million
214,Zambia,3.777 million


In [25]:
# The pattern, piece by piece (raw string, so the backslashes reach the
# regex engine untouched instead of being read as string escapes):
# \d+  one or more digits
# \,*  zero or more commas  (thousands separator, as in "361,100")
# \.*  zero or more dots    (decimal point, as in "9.067")
# \d*  zero or more digits

cia.co2.str.extract(r'(\d+\,*\.*\d*)') # the parentheses define the *group* that extract returns as a column

Unnamed: 0,0
0,9.067
1,4.5
2,135.9
3,361100
4,20.95
...,...
211,268400
212,33.62
213,13.68
214,3.777


In [26]:
# a run of non-digit characters that follows a whitespace character:
# \s   one whitespace character (outside the group, so it is not captured)
# \D+  one or more non-digits (the captured group)
# Raw string: keeps '\s' and '\D' as regex tokens rather than invalid string escapes.
cia.co2.str.extract(r'\s(\D+)')

Unnamed: 0,0
0,million
1,million
2,million
3,
4,million
...,...
211,
212,billion
213,million
214,million


In [34]:
## NOTE: Steps **2** and **3** can be done at once:

# Both groups in one pattern (raw string avoids invalid string escapes).
# Here the whitespace and the text group are mandatory, so a cell holding
# only a number does not match at all and both columns come back as NaN:
# see row indexes **3** and **211**.
cia.co2.str.extract(r'(\d+\,*\.*\d*)\s(\D+)')

Unnamed: 0,0,1
0,9.067,million
1,4.5,million
2,135.9,million
3,,
4,20.95,million
...,...,...
211,,
212,33.62,billion
213,13.68,million
214,3.777,million


In [35]:
# Solving the previous issue by making the separator and the second group optional:
# \s*     zero or more whitespace characters
# (\D+)*  the text group may be absent, so number-only cells still match
# Raw string: keeps the backslash sequences as regex tokens.

cia.co2.str.extract(r'(\d+\,?\.?\d*)\s*(\D+)*')

Unnamed: 0,0,1
0,9.067,million
1,4.5,million
2,135.9,million
3,361100,
4,20.95,million
...,...,...
211,268400,
212,33.62,billion
213,13.68,million
214,3.777,million


In [30]:
# A group can be given a **name** with (?P<name>...); extract uses the
# group names as the column labels of the result.
# Raw string: keeps the backslash sequences as regex tokens.

cia.co2.str.extract(r'(?P<number>\d+\,*\.*\d*)\s*(?P<text>\D+)*')

Unnamed: 0,number,text
0,9.067,million
1,4.5,million
2,135.9,million
3,361100,
4,20.95,million
...,...,...
211,268400,
212,33.62,billion
213,13.68,million
214,3.777,million


In [36]:
# extract returns a data frame (columns 'number' and 'text'); keep it for the next step.
# Raw string: keeps the backslash sequences as regex tokens instead of invalid string escapes.
result2=cia.co2.str.extract(r'(?P<number>\d+\,*\.*\d*)\s*(?P<text>\D+)*')

In [37]:
result2

Unnamed: 0,number,text
0,9.067,million
1,4.5,million
2,135.9,million
3,361100,
4,20.95,million
...,...,...
211,268400,
212,33.62,billion
213,13.68,million
214,3.777,million


In [39]:
# And let's use the columns of these new data frame:
cia=cia.assign(value=result2.number,
               unit=result2.text)

In [40]:
# Current situation:
cia.head()

Unnamed: 0,countries,co2,value,unit
0,Afghanistan,9.067 million,9.067,million
1,Albania,4.5 million,4.5,million
2,Algeria,135.9 million,135.9,million
3,American Samoa,361100,361100.0,
4,Angola,20.95 million,20.95,million


In [41]:
# some numbers contain commas (thousands separators); remove them:
cia['value']=cia['value'].str.replace(",","")

In [42]:
# Check what you have:
cia.unit.value_counts(dropna=False)

million    163
NaN         46
billion      7
Name: unit, dtype: int64

In [43]:
# multiplier for each unit label; None stands for "no unit text" (value already in Mt):
replacements={
    'million': 1_000_000,
    "billion": 1_000_000_000,
    None: 1,
}

In [44]:
# take a loook at the result:
cia.unit.replace(replacements)

0         1000000
1         1000000
2         1000000
3               1
4         1000000
          ...    
211             1
212    1000000000
213       1000000
214       1000000
215       1000000
Name: unit, Length: 216, dtype: int64

In [45]:
# make it happen: Series.replace returns a new Series, which is stored back
# as the 'unit' column. Calling replace(..., inplace=True) on cia.unit is a
# chained in-place update that pandas does not guarantee will reach 'cia'
# (it has no effect under Copy-on-Write), so assign the result explicitly.
cia=cia.assign(unit=cia.unit.replace(replacements))

In [46]:
#Current situation:
cia.head()

Unnamed: 0,countries,co2,value,unit
0,Afghanistan,9.067 million,9.067,1000000
1,Albania,4.5 million,4.5,1000000
2,Algeria,135.9 million,135.9,1000000
3,American Samoa,361100,361100.0,1
4,Angola,20.95 million,20.95,1000000


In [47]:
# drop() needs 'axis' only together with 'labels';
# the 'columns=' / 'index=' keywords already say which axis is meant
cia=cia.drop(columns='co2')

In [49]:
cia.dtypes

countries    object
value        object
unit          int64
dtype: object

In [50]:
cia.describe()

Unnamed: 0,unit
count,216.0
mean,33162040.0
std,177352900.0
min,1.0
25%,1000000.0
50%,1000000.0
75%,1000000.0
max,1000000000.0


In [51]:
pd.to_numeric(cia.value)

0           9.067
1           4.500
2         135.900
3      361100.000
4          20.950
          ...    
211    268400.000
212        33.620
213        13.680
214         3.777
215        12.060
Name: value, Length: 216, dtype: float64

In [52]:
cia=cia.assign(value=pd.to_numeric(cia.value))

In [53]:
cia.head()

Unnamed: 0,countries,value,unit
0,Afghanistan,9.067,1000000
1,Albania,4.5,1000000
2,Algeria,135.9,1000000
3,American Samoa,361100.0,1
4,Angola,20.95,1000000


In [54]:
cia.dtypes

countries     object
value        float64
unit           int64
dtype: object

In [55]:
cia.describe()

Unnamed: 0,value,unit
count,216.0,216.0
mean,83366.395898,33162040.0
std,208340.495123,177352900.0
min,1.03,1.0
25%,7.66725,1000000.0
50%,37.865,1000000.0
75%,442.85,1000000.0
max,985600.0,1000000000.0


In [56]:
#previous result:
cia.value*cia.unit

0      9.067000e+06
1      4.500000e+06
2      1.359000e+08
3      3.611000e+05
4      2.095000e+07
           ...     
211    2.684000e+05
212    3.362000e+10
213    1.368000e+07
214    3.777000e+06
215    1.206000e+07
Length: 216, dtype: float64

In [57]:
cia=cia.assign(co2_in_MT=cia.value*cia.unit)

In [58]:
# current situation:
cia.head()

Unnamed: 0,countries,value,unit,co2_in_MT
0,Afghanistan,9.067,1000000,9067000.0
1,Albania,4.5,1000000,4500000.0
2,Algeria,135.9,1000000,135900000.0
3,American Samoa,361100.0,1,361100.0
4,Angola,20.95,1000000,20950000.0


In [59]:
# you want this:
cia.drop(columns=['value','unit'])

Unnamed: 0,countries,co2_in_MT
0,Afghanistan,9.067000e+06
1,Albania,4.500000e+06
2,Algeria,1.359000e+08
3,American Samoa,3.611000e+05
4,Angola,2.095000e+07
...,...,...
211,Western Sahara,2.684000e+05
212,World,3.362000e+10
213,Yemen,1.368000e+07
214,Zambia,3.777000e+06


In [60]:
cia.drop(columns=['value','unit'],inplace=True)

In [61]:
# Source page: Wikipedia article on the Democracy Index.
demoLink = "https://en.wikipedia.org/wiki/Democracy_Index" 

# getting the data frame in one step:
# read_html returns a list of the tables matching 'attrs' (here: class 'wikitable sortable'),
# parsed with the 'bs4' flavor; [0] keeps the first match and header=0 uses its first row as column names.
demodex=pd.read_html(demoLink,header=0,flavor='bs4',attrs={'class': 'wikitable sortable'})[0]

In [62]:
demodex.head(10)

Unnamed: 0,Rank,Country,Score,Electoral processand pluralism,Functioning ofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
0,1,Norway,9.87,10.0,9.64,10.0,10.0,9.71,Full democracy,Europe
1,2,Iceland,9.58,10.0,9.29,8.89,10.0,9.71,Full democracy,Europe
2,3,Sweden,9.39,9.58,9.64,8.33,10.0,9.41,Full democracy,Europe
3,4,New Zealand,9.26,10.0,9.29,8.89,8.13,10.0,Full democracy,Oceania
4,5,Denmark,9.22,10.0,9.29,8.33,9.38,9.12,Full democracy,Europe
5,6,Ireland,9.15,9.58,7.86,8.33,10.0,10.0,Full democracy,Europe
6,6,Canada,9.15,9.58,9.64,7.78,8.75,10.0,Full democracy,North America
7,8,Finland,9.14,10.0,8.93,8.33,8.75,9.71,Full democracy,Europe
8,9,Australia,9.09,10.0,8.93,7.78,8.75,10.0,Full democracy,Oceania
9,10,Switzerland,9.03,9.58,9.29,7.78,9.38,9.12,Full democracy,Europe


In [63]:
demodex.tail(10)

Unnamed: 0,Rank,Country,Score,Electoral processand pluralism,Functioning ofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
158,159,Saudi Arabia,1.93,0.00,2.86,2.22,3.13,1.47,Authoritarian,Asia
159,159,Tajikistan,1.93,0.08,0.79,1.67,6.25,0.88,Authoritarian,Asia
160,161,Equatorial Guinea,1.92,0.00,0.43,3.33,4.38,1.47,Authoritarian,Africa
161,162,Turkmenistan,1.72,0.00,0.79,2.22,5.00,0.59,Authoritarian,Asia
162,163,Chad,1.61,0.00,0.00,1.67,3.75,2.65,Authoritarian,Africa
163,164,Central African Republic,1.52,2.25,0.00,1.11,1.88,2.35,Authoritarian,Africa
164,165,Democratic Republic of the Congo,1.49,0.50,0.71,2.22,3.13,0.88,Authoritarian,Africa
165,166,Syria,1.43,0.00,0.00,2.78,4.38,0.00,Authoritarian,Asia
166,167,North Korea,1.08,0.00,2.50,1.67,1.25,0.00,Authoritarian,Asia
167,Rank,Country,Score,Electoral processand pluralism,Functioning ofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent


In [64]:
# The scraped table repeats its header as a data row (Rank == 'Rank', the last row above).
# Select such rows by content instead of hard-coding the index label 167,
# and drop them together with the 'Rank' and 'Score' columns.
repeatedHeader=demodex.index[demodex.Rank=='Rank']
demodex=demodex.drop(index=repeatedHeader,columns=['Rank','Score'])

In [66]:
demodex.columns

Index(['Country', 'Electoral processand pluralism', 'Functioning ofgovernment',
       'Politicalparticipation', 'Politicalculture', 'Civilliberties',
       'Regimetype', 'Continent'],
      dtype='object')

In [67]:
# Remove every run of whitespace from the column labels.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the labels unchanged.
pattern=r'\s+'
replacement=""
demodex.columns=demodex.columns.str.replace(pattern,replacement,regex=True)

In [68]:
# current situation:
demodex

Unnamed: 0,Country,Electoralprocessandpluralism,Functioningofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
0,Norway,10.00,9.64,10.00,10.00,9.71,Full democracy,Europe
1,Iceland,10.00,9.29,8.89,10.00,9.71,Full democracy,Europe
2,Sweden,9.58,9.64,8.33,10.00,9.41,Full democracy,Europe
3,New Zealand,10.00,9.29,8.89,8.13,10.00,Full democracy,Oceania
4,Denmark,10.00,9.29,8.33,9.38,9.12,Full democracy,Europe
...,...,...,...,...,...,...,...,...
162,Chad,0.00,0.00,1.67,3.75,2.65,Authoritarian,Africa
163,Central African Republic,2.25,0.00,1.11,1.88,2.35,Authoritarian,Africa
164,Democratic Republic of the Congo,0.50,0.71,2.22,3.13,0.88,Authoritarian,Africa
165,Syria,0.00,0.00,2.78,4.38,0.00,Authoritarian,Asia


In [69]:
# this is a preventive step!!
# Collect every distinct cell value in the score columns (positions 1 to 5)
# that float() cannot parse, so those values can be replaced by missing values.
badSymbols=[]
NumericColNames=demodex.iloc[:,1:6].columns
for columnName in NumericColNames:
    for cell in demodex[columnName]:
        try:
            float(cell)
        except (TypeError, ValueError):
            # float() signals an unparseable value with ValueError (bad text)
            # or TypeError (unsupported object); any other exception should
            # surface instead of being silently swallowed.
            if cell not in badSymbols:
                badSymbols.append(cell)

In [71]:
import numpy as np
# demodex.loc[:,NumericColNames] builds a new object, so calling
# replace(..., inplace=True) on it only changes that temporary copy and
# leaves demodex untouched. Assign the replaced columns back instead.
# Nothing to do when the scan found no bad symbols.
if badSymbols:
    demodex[NumericColNames]=demodex[NumericColNames].replace(to_replace=badSymbols,value=np.nan)

In [74]:
demodex.iloc[:,-2::].apply(set).to_list()

[{'Authoritarian', 'Flawed democracy', 'Full democracy', 'Hybrid regime'},
 {'Africa',
  'Asia',
  'Europe',
  'Europe/Asia',
  'North America',
  'Oceania',
  'South America'}]

In [76]:
# checking data types:
demodex.dtypes

Country                         object
Electoralprocessandpluralism    object
Functioningofgovernment         object
Politicalparticipation          object
Politicalculture                object
Civilliberties                  object
Regimetype                      object
Continent                       object
dtype: object

In [77]:
# save column names of the columns to change:
colsToChange=demodex.iloc[:,1:6].columns

In [79]:
# make changes NOT using iloc:
demodex[colsToChange]=demodex[colsToChange].apply(pd.to_numeric)

In [82]:
# store Continent as an (unordered) categorical column
demodex['Continent']=pd.Categorical(demodex['Continent'])

In [84]:
# check the levels:
pd.unique(demodex.Regimetype).tolist()

['Full democracy', 'Flawed democracy', 'Hybrid regime', 'Authoritarian']

In [85]:
# the regime levels, listed from lowest to highest:
correctLevels=[
    'Authoritarian',
    'Hybrid regime',
    'Flawed democracy',
    'Full democracy',
]

In [86]:
# store Regimetype as an ordered categorical, using the level order given in correctLevels:
regimeOrdinal=pd.Categorical(demodex['Regimetype'],
                             categories=correctLevels,
                             ordered=True)
demodex['Regimetype']=regimeOrdinal

In [87]:
#then
demodex.dtypes

Country                           object
Electoralprocessandpluralism     float64
Functioningofgovernment          float64
Politicalparticipation           float64
Politicalculture                 float64
Civilliberties                   float64
Regimetype                      category
Continent                       category
dtype: object

In [88]:
demodex.Regimetype

0      Full democracy
1      Full democracy
2      Full democracy
3      Full democracy
4      Full democracy
            ...      
162     Authoritarian
163     Authoritarian
164     Authoritarian
165     Authoritarian
166     Authoritarian
Name: Regimetype, Length: 167, dtype: category
Categories (4, object): [Authoritarian < Hybrid regime < Flawed democracy < Full democracy]