### CIA 

In [2]:
import pandas as pd

link1="https://www.cia.gov/library/publications/resources/the-world-factbook/fields/274.html"

cia=pd.read_html(link1,header=0,attrs={'id': 'fieldListing'})[0]

In [4]:
# renaming both columns:
OldToNew={cia.columns[0]:'countries',cia.columns[1]:'co2'}
cia.rename(columns=OldToNew,inplace=True)

# cleaning Country (preventive)
cia.countries=cia.countries.str.strip()

# splitting second column, keeping main information. This overwrites co2.
result1=cia.co2.str.split(' Mt',expand=True)[0]
cia=cia.assign(co2=result1)

# extracting number and text from co2 into a dataframe, adding those to CIA
result2=cia.co2.str.extract('(?P<number>\d+\,*\.*\d*)\s*(?P<text>\D+)*')
cia=cia.assign(value=result2.number,
               unit=result2.text)

# deleting comma from newly created column'value'
cia.value=cia.value.str.replace(",","")

# recoding newly created column'unit':
replacements={'million': 10**6, "billion": 10**9,None:10**0}
cia.unit.replace(replacements,inplace=True)

# dropping unneeded column:
cia.drop(columns='co2',inplace=True)

In [145]:
# cia.countries.str.strip()

0         Afghanistan
1             Albania
2             Algeria
3      American Samoa
4              Angola
            ...      
211    Western Sahara
212             World
213             Yemen
214            Zambia
215          Zimbabwe
Name: countries, Length: 216, dtype: object

In [5]:
# Formatting numeric columns:

# value is text, now to numeric type:
cia=cia.assign(value=pd.to_numeric(cia.value))

# once value and unit are numeric, 
# new column 'co2_in_MT' is created from multiplying both.
cia=cia.assign(co2_in_MT=cia.value*cia.unit)

# value and unit were temporary columns, they can go now.
cia.drop(columns=['value','unit'],inplace=True)

In [6]:
cia.head()

Unnamed: 0,countries,co2_in_MT
0,Afghanistan,9.067
1,Albania,4.5
2,Algeria,135.9
3,American Samoa,361100.0
4,Angola,20.95


In [7]:
cia.dtypes

countries     object
co2_in_MT    float64
dtype: object

### Decmocracy

In [8]:
import pandas as pd
link2= "https://en.wikipedia.org/wiki/Democracy_Index" 

# getting the data frame in one step:
demodex=pd.read_html(link2,header=0,flavor='bs4',attrs={'class': 'wikitable sortable'})[0]

In [10]:
#bye row 167, and Rank
demodex=demodex.drop(index=167,columns=['Rank','Score'])

# Simplifying column names to facilitate further work:
demodex.columns=demodex.columns.str.replace('\s+',"")

# preventive cleaning of numeric cell values:
NumericColNames=demodex.iloc[:,1:6].columns
badSymbols=[]
for columnName in NumericColNames:
    for cell in demodex[columnName]:
        try:
            float(cell)
        except:
            if cell not in badSymbols:
                badSymbols.append(cell)
            
import numpy as np  
demodex.loc[:,NumericColNames].replace(to_replace=badSymbols,value=np.nan,inplace=True)

# see if we have some strange value in the categorical columns:
demodex.iloc[:,-2::].apply(set).to_list()

[{'Authoritarian', 'Flawed democracy', 'Full democracy', 'Hybrid regime'},
 {'Africa',
  'Asia',
  'Europe',
  'Europe/Asia',
  'North America',
  'Oceania',
  'South America'}]

In [11]:
# Formatting numeric columns

# save column names of the columns to change:
colsToChange=demodex.iloc[:,1:6].columns
# make changes NOT using iloc:
demodex[colsToChange]=demodex[colsToChange].apply(pd.to_numeric)


# Formatting categorical columns
# NOMINAL
demodex.Continent=pd.Categorical(demodex.Continent)
# ORDINAL
#rewrite the levels in order:
correctLevels=['Authoritarian', 'Hybrid regime', 'Flawed democracy','Full democracy']
demodex.Regimetype=pd.Categorical(demodex.Regimetype,categories=correctLevels,ordered=True)

In [12]:
demodex.dtypes

Country                           object
Electoralprocessandpluralism     float64
Functioningofgovernment          float64
Politicalparticipation           float64
Politicalculture                 float64
Civilliberties                   float64
Regimetype                      category
Continent                       category
dtype: object

In [13]:
demodex.Continent.cat.ordered

False

In [14]:
demodex.Regimetype.cat.ordered

True

### Human development

In [87]:
link1="https://docs.google.com/spreadsheets/d/e/2PACX-1vRojxNKqFZMSqPbZA0D6DYsuRqBuDmZuyK0DFbbatwTQIfClU2lwoCYsEKalaYv_IorAYSmMeTWmlKF/pub?gid=239626258&single=true&output=csv"

In [88]:
# need to call pandas:

import pandas as pd

hdi=pd.read_csv(link1)

In [89]:
# Is the header in the right place:
hdi.head(10)

Unnamed: 0.1,Unnamed: 0,Table 1. Human Development Index and its components,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,,,,,,,,,,,,,,,
1,,,,,SDG3,,SDG4.3,,SDG4.6,,SDG8.5,,,,
2,,,,,,,,,,,,,,,
3,,,Human development index (HDI),,Life expectancy at birth,,Expected years of schooling,,Mean years of schooling,,Gross national income (GNI) per capita,,GNI per capita rank minus HDI rank,,HDI rank
4,HDI rank,Country,(index value),,(years),,(years),,(years),,(2011 PPP $),,,,
5,,,2018,,2018,,2018,a,2018,a,2018,,2018,,2017
6,,VERY HIGH HUMAN DEVELOPMENT,,,,,,,,,,,,,
7,1,Norway,0.95,,82.27,,18.06,b,12.57,,68058.62,,5,,1
8,2,Switzerland,0.95,,83.63,,16.21,,13.38,,59374.73,,8,,2
9,3,Ireland,0.94,,82.1,,18.79,b,12.53,c,55659.68,,9,,3


In [90]:
# for the first two columns:
hdi.iloc[4,:2].tolist()

['HDI rank', 'Country']

In [91]:
# for the rest of columns columns:
hdi.iloc[3,2:].tolist()

['Human development index (HDI)',
 nan,
 'Life expectancy at birth',
 nan,
 'Expected years of schooling',
 nan,
 'Mean years of schooling',
 nan,
 'Gross national income (GNI) per capita',
 nan,
 'GNI per capita rank minus HDI rank',
 nan,
 'HDI rank']

In [92]:
# saving headers:
CurrentHeaders=hdi.iloc[4,:2].tolist()+hdi.iloc[3,2:].tolist()

# you saved this:
CurrentHeaders

['HDI rank',
 'Country',
 'Human development index (HDI)',
 nan,
 'Life expectancy at birth',
 nan,
 'Expected years of schooling',
 nan,
 'Mean years of schooling',
 nan,
 'Gross national income (GNI) per capita',
 nan,
 'GNI per capita rank minus HDI rank',
 nan,
 'HDI rank']

In [93]:
# PREVIEW: deleting rows, NORWAY should be the first row:
hdi.drop(index=range(0,7)) #7 in range() will not be erased.

Unnamed: 0.1,Unnamed: 0,Table 1. Human Development Index and its components,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
7,1,Norway,0.95,,82.27,,18.06,b,12.57,,68058.62,,5,,1
8,2,Switzerland,0.95,,83.63,,16.21,,13.38,,59374.73,,8,,2
9,3,Ireland,0.94,,82.1,,18.79,b,12.53,c,55659.68,,9,,3
10,4,Germany,0.94,,81.18,,17.1,,14.13,,46945.95,,15,,4
11,4,"Hong Kong, China (SAR)",0.94,,84.69,,16.51,,12.04,,60220.8,,5,,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,,Column 2: UNDESA (2019b).,,,,,,,,,,,,,
265,,Column 3: UNESCO Institute for Statistics (201...,,,,,,,,,,,,,
266,,Column 4: UNESCO Institute for Statistics (201...,,,,,,,,,,,,,
267,,"Column 5: World Bank (2019a), IMF (2019) and U...",,,,,,,,,,,,,


In [94]:
hdi.drop(index=range(0,7),inplace=True)

In [95]:
hdi.reset_index(drop=True,inplace=True)

In [96]:
# rename columns
hdi.columns=CurrentHeaders

In [97]:
hdi.head()

Unnamed: 0,HDI rank,Country,Human development index (HDI),NaN,Life expectancy at birth,NaN.1,Expected years of schooling,NaN.2,Mean years of schooling,NaN.3,Gross national income (GNI) per capita,NaN.4,GNI per capita rank minus HDI rank,NaN.5,HDI rank.1
0,1,Norway,0.95,,82.27,,18.06,b,12.57,,68058.62,,5,,1
1,2,Switzerland,0.95,,83.63,,16.21,,13.38,,59374.73,,8,,2
2,3,Ireland,0.94,,82.1,,18.79,b,12.53,c,55659.68,,9,,3
3,4,Germany,0.94,,81.18,,17.1,,14.13,,46945.95,,15,,4
4,4,"Hong Kong, China (SAR)",0.94,,84.69,,16.51,,12.04,,60220.8,,5,,6


In [98]:
hdi.tail(65) #change until you see last data row

Unnamed: 0,HDI rank,Country,Human development index (HDI),NaN,Life expectancy at birth,NaN.1,Expected years of schooling,NaN.2,Mean years of schooling,NaN.3,Gross national income (GNI) per capita,NaN.4,GNI per capita rank minus HDI rank,NaN.5,HDI rank.1
197,..,Somalia,..,,57.07,,..,,..,,..,,..,,..
198,..,Tuvalu,..,,..,,12.31,,..,,5408.95,,..,,..
199,,,,,,,,,,,,,,,
200,,Human development groups,,,,,,,,,,,,,
201,,Very high human development,0.89,,79.51,,16.36,,12.04,,40111.57,,—,,—
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
257,,Column 2: UNDESA (2019b).,,,,,,,,,,,,,
258,,Column 3: UNESCO Institute for Statistics (201...,,,,,,,,,,,,,
259,,Column 4: UNESCO Institute for Statistics (201...,,,,,,,,,,,,,
260,,"Column 5: World Bank (2019a), IMF (2019) and U...",,,,,,,,,,,,,


In [99]:
# deleting: preview
hdi.drop(index=range(199,262)) # rows starting from 199 will be erased

Unnamed: 0,HDI rank,Country,Human development index (HDI),NaN,Life expectancy at birth,NaN.1,Expected years of schooling,NaN.2,Mean years of schooling,NaN.3,Gross national income (GNI) per capita,NaN.4,GNI per capita rank minus HDI rank,NaN.5,HDI rank.1
0,1,Norway,0.95,,82.27,,18.06,b,12.57,,68058.62,,5,,1
1,2,Switzerland,0.95,,83.63,,16.21,,13.38,,59374.73,,8,,2
2,3,Ireland,0.94,,82.1,,18.79,b,12.53,c,55659.68,,9,,3
3,4,Germany,0.94,,81.18,,17.1,,14.13,,46945.95,,15,,4
4,4,"Hong Kong, China (SAR)",0.94,,84.69,,16.51,,12.04,,60220.8,,5,,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,..,Monaco,..,,..,,..,,..,,..,,..,,..
195,..,Nauru,..,,..,,11.26,e,..,,17312.59,,..,,..
196,..,San Marino,..,,..,,15.11,,..,,..,,..,,..
197,..,Somalia,..,,57.07,,..,,..,,..,,..,,..


In [100]:
hdi.drop(index=range(199,262),inplace=True) # should I reset indexes?

In [101]:
hdi

Unnamed: 0,HDI rank,Country,Human development index (HDI),NaN,Life expectancy at birth,NaN.1,Expected years of schooling,NaN.2,Mean years of schooling,NaN.3,Gross national income (GNI) per capita,NaN.4,GNI per capita rank minus HDI rank,NaN.5,HDI rank.1
0,1,Norway,0.95,,82.27,,18.06,b,12.57,,68058.62,,5,,1
1,2,Switzerland,0.95,,83.63,,16.21,,13.38,,59374.73,,8,,2
2,3,Ireland,0.94,,82.1,,18.79,b,12.53,c,55659.68,,9,,3
3,4,Germany,0.94,,81.18,,17.1,,14.13,,46945.95,,15,,4
4,4,"Hong Kong, China (SAR)",0.94,,84.69,,16.51,,12.04,,60220.8,,5,,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,..,Monaco,..,,..,,..,,..,,..,,..,,..
195,..,Nauru,..,,..,,11.26,e,..,,17312.59,,..,,..
196,..,San Marino,..,,..,,15.11,,..,,..,,..,,..
197,..,Somalia,..,,57.07,,..,,..,,..,,..,,..


In [102]:
# Get good columns

GoodHeaders=[header for header in CurrentHeaders if str(header) != 'nan']
#
GoodHeaders

['HDI rank',
 'Country',
 'Human development index (HDI)',
 'Life expectancy at birth',
 'Expected years of schooling',
 'Mean years of schooling',
 'Gross national income (GNI) per capita',
 'GNI per capita rank minus HDI rank',
 'HDI rank']

In [103]:
FinalHeaders=[header for header in GoodHeaders if 'rank' not in header]
# then
FinalHeaders

['Country',
 'Human development index (HDI)',
 'Life expectancy at birth',
 'Expected years of schooling',
 'Mean years of schooling',
 'Gross national income (GNI) per capita']

In [104]:
hdi.head()

Unnamed: 0,HDI rank,Country,Human development index (HDI),NaN,Life expectancy at birth,NaN.1,Expected years of schooling,NaN.2,Mean years of schooling,NaN.3,Gross national income (GNI) per capita,NaN.4,GNI per capita rank minus HDI rank,NaN.5,HDI rank.1
0,1,Norway,0.95,,82.27,,18.06,b,12.57,,68058.62,,5,,1
1,2,Switzerland,0.95,,83.63,,16.21,,13.38,,59374.73,,8,,2
2,3,Ireland,0.94,,82.1,,18.79,b,12.53,c,55659.68,,9,,3
3,4,Germany,0.94,,81.18,,17.1,,14.13,,46945.95,,15,,4
4,4,"Hong Kong, China (SAR)",0.94,,84.69,,16.51,,12.04,,60220.8,,5,,6


In [105]:
# remember loc works with names, not with positions:

hdi.loc[:,FinalHeaders]

Unnamed: 0,Country,Human development index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita
0,Norway,0.95,82.27,18.06,12.57,68058.62
1,Switzerland,0.95,83.63,16.21,13.38,59374.73
2,Ireland,0.94,82.1,18.79,12.53,55659.68
3,Germany,0.94,81.18,17.1,14.13,46945.95
4,"Hong Kong, China (SAR)",0.94,84.69,16.51,12.04,60220.8
...,...,...,...,...,...,...
194,Monaco,..,..,..,..,..
195,Nauru,..,..,11.26,..,17312.59
196,San Marino,..,..,15.11,..,..
197,Somalia,..,57.07,..,..,..


In [106]:
hdi=hdi.loc[:,FinalHeaders]

In [107]:
# you have:
hdi.head()

Unnamed: 0,Country,Human development index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita
0,Norway,0.95,82.27,18.06,12.57,68058.62
1,Switzerland,0.95,83.63,16.21,13.38,59374.73
2,Ireland,0.94,82.1,18.79,12.53,55659.68
3,Germany,0.94,81.18,17.1,14.13,46945.95
4,"Hong Kong, China (SAR)",0.94,84.69,16.51,12.04,60220.8


In [110]:
# replace with '' (empty) the "blanks":
hdi.columns.str.replace("\s","") 

Index(['Country', 'Humandevelopmentindex(HDI)', 'Lifeexpectancyatbirth',
       'Expectedyearsofschooling', 'Meanyearsofschooling',
       'Grossnationalincome(GNI)percapita'],
      dtype='object')

In [111]:
# replace with '' (empty) consecutive word characters in parenthesis:
hdi.columns.str.replace("\(\w+\)","")

Index(['Country', 'Human development index ', 'Life expectancy at birth',
       'Expected years of schooling', 'Mean years of schooling',
       'Gross national income  per capita'],
      dtype='object')

In [112]:
#or all combines
hdi.columns.str.replace("\s+|\(\w+\)","")

Index(['Country', 'Humandevelopmentindex', 'Lifeexpectancyatbirth',
       'Expectedyearsofschooling', 'Meanyearsofschooling',
       'Grossnationalincomepercapita'],
      dtype='object')

In [113]:
hdi.columns=hdi.columns.str.replace("\s+|\(\w+\)","")

In [114]:
hdi.head()

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita
0,Norway,0.95,82.27,18.06,12.57,68058.62
1,Switzerland,0.95,83.63,16.21,13.38,59374.73
2,Ireland,0.94,82.1,18.79,12.53,55659.68
3,Germany,0.94,81.18,17.1,14.13,46945.95
4,"Hong Kong, China (SAR)",0.94,84.69,16.51,12.04,60220.8


In [123]:
# check empty cells from second to last
hdi.iloc[:,1:].isnull().all(axis=1)
# hdi[hdi.iloc[:,1:].isnull().all(axis=1)] #ALL

0      False
1      False
2      False
3      False
4      False
       ...  
194    False
195    False
196    False
197    False
198    False
Length: 199, dtype: bool

In [124]:
# the opposite
hdi[hdi.iloc[:,1:].notnull().all(axis=1)]

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita
0,Norway,0.95,82.27,18.06,12.57,68058.62
1,Switzerland,0.95,83.63,16.21,13.38,59374.73
2,Ireland,0.94,82.1,18.79,12.53,55659.68
3,Germany,0.94,81.18,17.1,14.13,46945.95
4,"Hong Kong, China (SAR)",0.94,84.69,16.51,12.04,60220.8
...,...,...,...,...,...,...
194,Monaco,..,..,..,..,..
195,Nauru,..,..,11.26,..,17312.59
196,San Marino,..,..,15.11,..,..
197,Somalia,..,57.07,..,..,..


In [125]:
hdi=hdi[hdi.iloc[:,1:].notnull().all(axis=1)]

In [126]:
hdi.reset_index(drop=True, inplace=True)

In [128]:
hdi

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita
0,Norway,0.95,82.27,18.06,12.57,68058.62
1,Switzerland,0.95,83.63,16.21,13.38,59374.73
2,Ireland,0.94,82.1,18.79,12.53,55659.68
3,Germany,0.94,81.18,17.1,14.13,46945.95
4,"Hong Kong, China (SAR)",0.94,84.69,16.51,12.04,60220.8
...,...,...,...,...,...,...
190,Monaco,..,..,..,..,..
191,Nauru,..,..,11.26,..,17312.59
192,San Marino,..,..,15.11,..,..
193,Somalia,..,57.07,..,..,..


In [129]:
hdi[hdi.iloc[:,1:].isnull().any(axis=1)] #ANY

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita


In [131]:
badHDISymbols=[] # list for bad symbols

NumericColNames=hdi.iloc[:,1:].columns # save names of columns with numeric data

for columnName in NumericColNames:# visit every column name
    for cell in hdi[columnName]:# visit every cell for that column
        try:
            float(cell) # try this
        except: # if not possible:            
            if cell not in badHDISymbols:# if cell is not in the list                
                badHDISymbols.append(cell)# add it to the list

# you get:
badHDISymbols

['..']

In [132]:
import numpy as np

hdi.replace(to_replace=badHDISymbols,value=np.nan,inplace=True)

In [133]:
# you have:
hdi

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita
0,Norway,0.95,82.27,18.06,12.57,68058.62
1,Switzerland,0.95,83.63,16.21,13.38,59374.73
2,Ireland,0.94,82.1,18.79,12.53,55659.68
3,Germany,0.94,81.18,17.1,14.13,46945.95
4,"Hong Kong, China (SAR)",0.94,84.69,16.51,12.04,60220.8
...,...,...,...,...,...,...
190,Monaco,,,,,
191,Nauru,,,11.26,,17312.59
192,San Marino,,,15.11,,
193,Somalia,,57.07,,,


In [134]:
hdi.describe(include='all')

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita
count,195,189.0,191.0,193.0,189.0,191.0
unique,195,55.0,186.0,183.0,174.0,191.0
top,Marshall Islands,0.76,76.7,12.69,9.67,659.73
freq,1,10.0,2.0,3.0,3.0,1.0


In [135]:
hdi.dtypes

Country                         object
Humandevelopmentindex           object
Lifeexpectancyatbirth           object
Expectedyearsofschooling        object
Meanyearsofschooling            object
Grossnationalincomepercapita    object
dtype: object

In [136]:
# as easy as:

hdi[NumericColNames]=hdi.loc[:,NumericColNames].apply(pd.to_numeric)

In [137]:
#recheck
hdi.dtypes


Country                          object
Humandevelopmentindex           float64
Lifeexpectancyatbirth           float64
Expectedyearsofschooling        float64
Meanyearsofschooling            float64
Grossnationalincomepercapita    float64
dtype: object

In [138]:
# recheck
hdi.describe(include='all')

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita
count,195,189.0,191.0,193.0,189.0,191.0
unique,195,,,,,
top,Marshall Islands,,,,,
freq,1,,,,,
mean,,0.713598,72.41466,13.212073,8.613228,18368.072461
std,,0.150813,7.509694,2.935116,3.082296,19627.352304
min,,0.38,52.81,5.0,1.59,659.73
25%,,0.6,67.225,11.3,6.35,4009.435
50%,,0.73,73.75,13.07,9.02,11610.91
75%,,0.83,77.7,15.23,11.29,26535.42


In [139]:
hdi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 6 columns):
Country                         195 non-null object
Humandevelopmentindex           189 non-null float64
Lifeexpectancyatbirth           191 non-null float64
Expectedyearsofschooling        193 non-null float64
Meanyearsofschooling            189 non-null float64
Grossnationalincomepercapita    191 non-null float64
dtypes: float64(5), object(1)
memory usage: 9.3+ KB


### Integrating


In [140]:
cia.columns

Index(['countries', 'co2_in_MT'], dtype='object')

In [141]:
demodex.columns

Index(['Country', 'Electoralprocessandpluralism', 'Functioningofgovernment',
       'Politicalparticipation', 'Politicalculture', 'Civilliberties',
       'Regimetype', 'Continent'],
      dtype='object')

In [142]:
hdi.columns

Index(['Country', 'Humandevelopmentindex', 'Lifeexpectancyatbirth',
       'Expectedyearsofschooling', 'Meanyearsofschooling',
       'Grossnationalincomepercapita'],
      dtype='object')

In [143]:
#hdi and demodex have a common column: Country
hdi.merge(demodex)

Unnamed: 0,Country,Humandevelopmentindex,Lifeexpectancyatbirth,Expectedyearsofschooling,Meanyearsofschooling,Grossnationalincomepercapita,Electoralprocessandpluralism,Functioningofgovernment,Politicalparticipation,Politicalculture,Civilliberties,Regimetype,Continent
0,Norway,0.95,82.27,18.06,12.57,68058.62,10.00,9.64,10.00,10.00,9.71,Full democracy,Europe
1,Switzerland,0.95,83.63,16.21,13.38,59374.73,9.58,9.29,7.78,9.38,9.12,Full democracy,Europe
2,Ireland,0.94,82.10,18.79,12.53,55659.68,10.00,7.86,8.33,10.00,10.00,Full democracy,Europe
3,Germany,0.94,81.18,17.10,14.13,46945.95,9.58,8.57,8.33,7.50,9.41,Full democracy,Europe
4,"Hong Kong, China (SAR)",0.94,84.69,16.51,12.04,60220.80,3.58,4.36,6.11,7.50,8.53,Flawed democracy,Asia
...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,Burundi,0.42,61.25,11.30,3.12,659.73,0.00,0.07,3.33,5.00,2.35,Authoritarian,Africa
162,Chad,0.40,53.98,7.47,2.41,1715.57,0.00,0.00,1.67,3.75,2.65,Authoritarian,Africa
163,Central African Republic,0.38,52.81,7.57,4.28,776.68,1.25,0.00,1.11,1.88,2.35,Authoritarian,Africa
164,Niger,0.38,62.02,6.47,2.03,912.04,2.92,1.14,3.33,4.38,4.71,Authoritarian,Africa


In [73]:
len(hdi), len(demodex)

(0, 167)

In [75]:
# all hdi and the ones in common with demodex:
hdi.merge(demodex,how='left') 

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [76]:
hdi.merge(demodex,how='outer',indicator=True) # see last column

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [77]:
dirtyMerge1=hdi.merge(demodex,how='outer',indicator=True) 

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [78]:
dirtyMerge1.loc[dirtyMerge1['_merge']=='right_only',"Country"]

NameError: name 'dirtyMerge1' is not defined

In [79]:
dirtyMerge1.loc[dirtyMerge1['_merge']=='left_only',"Country"]

NameError: name 'dirtyMerge1' is not defined

In [80]:
#dictionary of replacements:
replacements1={'South Korea[n 1]': 'Korea (Republic of)', 
              'Cape Verde':'Cabo Verde',
              'Czech Republic':'Czechia',
              'Hong Kong':'Hong Kong, China (SAR)',
              'Moldova':'Moldova (Republic of)',
              'Bolivia':'Bolivia (Plurinational State of)',
              'Tanzania':'Tanzania (United Republic of)',
              'Palestine':'Palestine, State of',
              'Ivory Coast':"Côte d'Ivoire",
              'Republic of the Congo':'Congo',
              'Venezuela':'Venezuela (Bolivarian Republic of)',
              'Vietnam':'Viet Nam',
              'Eswatini':'Eswatini (Kingdom of)',              
              'Russia':'Russian Federation',
              'Iran':'Iran (Islamic Republic of)',
              'Laos':"Lao People's Democratic Republic",
              'Democratic Republic of the Congo':'Congo (Democratic Republic of the)',
              'Syria':'Syrian Arab Republic',
              'North Korea': "Korea (Democratic People's Rep. of)" #check ""
             }

# replacing
demodex.Country.replace(replacements1,inplace=True)

In [81]:
hdi.merge(demodex)

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False