In [21]:
# Importing the relevant dependencies
import pandas as pd

In [22]:
# Scrape the country/ISO-code table from countrycode.org.
# pd.read_html returns a list of parsed tables, so `countries` is a list here,
# not a DataFrame. NOTE: this is a live HTTP request and needs network access.
countries = pd.read_html('https://countrycode.org/')

In [23]:
# Viewing the raw scrape result (a list of scraped tables, not yet a single DataFrame)
countries

[               COUNTRY COUNTRY CODE ISO CODES  POPULATION  AREA KM2  \
 0          Afghanistan           93  AF / AFG    29121286    647500   
 1              Albania          355  AL / ALB     2986952     28748   
 2              Algeria          213  DZ / DZA    34586184   2381740   
 3       American Samoa        1-684  AS / ASM       57881       199   
 4              Andorra          376  AD / AND       84000       468   
 ..                 ...          ...       ...         ...       ...   
 235  Wallis and Futuna          681  WF / WLF       16025       274   
 236     Western Sahara          212  EH / ESH      273008    266000   
 237              Yemen          967  YE / YEM    23495361    527970   
 238             Zambia          260  ZM / ZMB    13460305    752614   
 239           Zimbabwe          263  ZW / ZWE    11651858    390580   
 
           GDP $USD  
 0    20.65 Billion  
 1     12.8 Billion  
 2    215.7 Billion  
 3    462.2 Million  
 4      4.8 Billion  
 .

In [24]:
# Determining the type of the `countries` variable (read_html hands back a list)
type(countries)

list

In [25]:
# Take the first element of the list: the DataFrame holding the country table.
# Despite its name, `countries_list` is a DataFrame, not a Python list.
countries_list = countries[0]
countries_list

Unnamed: 0,COUNTRY,COUNTRY CODE,ISO CODES,POPULATION,AREA KM2,GDP $USD
0,Afghanistan,93,AF / AFG,29121286,647500,20.65 Billion
1,Albania,355,AL / ALB,2986952,28748,12.8 Billion
2,Algeria,213,DZ / DZA,34586184,2381740,215.7 Billion
3,American Samoa,1-684,AS / ASM,57881,199,462.2 Million
4,Andorra,376,AD / AND,84000,468,4.8 Billion
...,...,...,...,...,...,...
235,Wallis and Futuna,681,WF / WLF,16025,274,
236,Western Sahara,212,EH / ESH,273008,266000,
237,Yemen,967,YE / YEM,23495361,527970,43.89 Billion
238,Zambia,260,ZM / ZMB,13460305,752614,22.24 Billion


In [26]:
# Break the "ISO CODES" column (e.g. "AF / AFG") into two columns at the first "/".
# Only the slash is the separator, so each piece keeps the space that surrounded
# it; that padding is stripped from the derived lists in later cells.
iso_code_split = (
    countries_list['ISO CODES']
    .str.split(pat="/", n=1, expand=True)
)
iso_code_split

Unnamed: 0,0,1
0,AF,AFG
1,AL,ALB
2,DZ,DZA
3,AS,ASM
4,AD,AND
...,...,...
235,WF,WLF
236,EH,ESH
237,YE,YEM
238,ZM,ZMB


In [27]:
# Attach the first split column (the two-letter code) to the country DataFrame.
# This adds the column to `countries_list` in place; the values still carry the
# trailing space left over from splitting on "/" alone.
countries_list['ISO CODE 2L'] = iso_code_split[0]
countries_list

Unnamed: 0,COUNTRY,COUNTRY CODE,ISO CODES,POPULATION,AREA KM2,GDP $USD,ISO CODE 2L
0,Afghanistan,93,AF / AFG,29121286,647500,20.65 Billion,AF
1,Albania,355,AL / ALB,2986952,28748,12.8 Billion,AL
2,Algeria,213,DZ / DZA,34586184,2381740,215.7 Billion,DZ
3,American Samoa,1-684,AS / ASM,57881,199,462.2 Million,AS
4,Andorra,376,AD / AND,84000,468,4.8 Billion,AD
...,...,...,...,...,...,...,...
235,Wallis and Futuna,681,WF / WLF,16025,274,,WF
236,Western Sahara,212,EH / ESH,273008,266000,,EH
237,Yemen,967,YE / YEM,23495361,527970,43.89 Billion,YE
238,Zambia,260,ZM / ZMB,13460305,752614,22.24 Billion,ZM


In [28]:
# Attach the second split column (the three-letter code) to the country DataFrame.
# This adds the column to `countries_list` in place; the values still carry the
# leading space left over from splitting on "/" alone.
countries_list['ISO CODE 3L'] = iso_code_split[1]
countries_list

Unnamed: 0,COUNTRY,COUNTRY CODE,ISO CODES,POPULATION,AREA KM2,GDP $USD,ISO CODE 2L,ISO CODE 3L
0,Afghanistan,93,AF / AFG,29121286,647500,20.65 Billion,AF,AFG
1,Albania,355,AL / ALB,2986952,28748,12.8 Billion,AL,ALB
2,Algeria,213,DZ / DZA,34586184,2381740,215.7 Billion,DZ,DZA
3,American Samoa,1-684,AS / ASM,57881,199,462.2 Million,AS,ASM
4,Andorra,376,AD / AND,84000,468,4.8 Billion,AD,AND
...,...,...,...,...,...,...,...,...
235,Wallis and Futuna,681,WF / WLF,16025,274,,WF,WLF
236,Western Sahara,212,EH / ESH,273008,266000,,EH,ESH
237,Yemen,967,YE / YEM,23495361,527970,43.89 Billion,YE,YEM
238,Zambia,260,ZM / ZMB,13460305,752614,22.24 Billion,ZM,ZMB


In [29]:
# Pull the country names out of the DataFrame as a plain Python list
list_of_countries = list(countries_list['COUNTRY'])
list_of_countries[:5]

['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra']

In [30]:
# Pull the two-letter ISO codes out of the DataFrame as a plain Python list
# (still padded with a trailing space at this point)
_2l_ISO_CODES = list(countries_list['ISO CODE 2L'])
_2l_ISO_CODES[:5]

['AF ', 'AL ', 'DZ ', 'AS ', 'AD ']

In [31]:
# Strip surrounding whitespace from every two-letter code.
# A bare .strip() is used rather than .strip(' ') so that tabs, newlines and
# non-breaking spaces (which scraped HTML can contain) are removed as well as
# plain spaces; for codes padded only with spaces the result is unchanged.
_2l_ISO_CODES = [code.strip() for code in _2l_ISO_CODES]
_2l_ISO_CODES[:5]

['AF', 'AL', 'DZ', 'AS', 'AD']

In [32]:
# Pull the three-letter ISO codes out of the DataFrame as a plain Python list
# (still padded with a leading space at this point)
_3l_ISO_CODES = list(countries_list['ISO CODE 3L'])
_3l_ISO_CODES[:5]

[' AFG', ' ALB', ' DZA', ' ASM', ' AND']

In [33]:
# Strip surrounding whitespace from every three-letter code.
# These codes become the LOCATION merge key, so any leftover padding would make
# rows silently fail to match. A bare .strip() is used rather than .strip(' ')
# so that tabs, newlines and non-breaking spaces are removed as well as plain
# spaces; for codes padded only with spaces the result is unchanged.
_3l_ISO_CODES = [code.strip() for code in _3l_ISO_CODES]
_3l_ISO_CODES[:5]

['AFG', 'ALB', 'DZA', 'ASM', 'AND']

In [34]:
# Bundle the three parallel lists into a column-name -> values mapping.
# The three-letter codes are stored under "LOCATION" so the key matches the
# column name used by the real-estate data.
countries_dict = dict(zip(
    ("Country", "LOCATION", "ISO CODE 2 letter"),
    (list_of_countries, _3l_ISO_CODES, _2l_ISO_CODES),
))

In [35]:
# Build the country lookup DataFrame: one column per dictionary key
country_df = pd.DataFrame(data=countries_dict)

In [36]:
# Load the real-estate dataset from the local raw_data folder.
# The path is relative, so it resolves against the notebook's working directory.
# Its LOCATION column holds three-letter codes (e.g. AUS) and is the merge key
# used when joining with the country table.
real_estate = pd.read_csv('raw_data/worldrealestate.csv')
real_estate

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,HOUSECOST,NOMINAL,IDX2015,A,1970,2.552380,
1,AUS,HOUSECOST,NOMINAL,IDX2015,A,1971,2.851352,
2,AUS,HOUSECOST,NOMINAL,IDX2015,A,1972,3.183667,
3,AUS,HOUSECOST,NOMINAL,IDX2015,A,1973,3.837183,
4,AUS,HOUSECOST,NOMINAL,IDX2015,A,1974,4.690199,
...,...,...,...,...,...,...,...,...
1556,SAU,HOUSECOST,NOMINAL,IDX2015,A,2015,100.000000,
1557,SAU,HOUSECOST,NOMINAL,IDX2015,A,2016,95.091000,
1558,SAU,HOUSECOST,NOMINAL,IDX2015,A,2017,88.967500,
1559,SAU,HOUSECOST,NOMINAL,IDX2015,A,2018,86.462500,


In [37]:
# Final real-estate data: join country names and two-letter codes onto the
# real-estate rows via the shared LOCATION (three-letter code) column.
# The inner join keeps only codes present in both frames, so real-estate rows
# whose LOCATION is missing from country_df are dropped from the result.
final_real_estate_data = country_df.merge(real_estate, on='LOCATION', how='inner')
final_real_estate_data

Unnamed: 0,Country,LOCATION,ISO CODE 2 letter,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,Australia,AUS,AU,HOUSECOST,NOMINAL,IDX2015,A,1970,2.552380,
1,Australia,AUS,AU,HOUSECOST,NOMINAL,IDX2015,A,1971,2.851352,
2,Australia,AUS,AU,HOUSECOST,NOMINAL,IDX2015,A,1972,3.183667,
3,Australia,AUS,AU,HOUSECOST,NOMINAL,IDX2015,A,1973,3.837183,
4,Australia,AUS,AU,HOUSECOST,NOMINAL,IDX2015,A,1974,4.690199,
...,...,...,...,...,...,...,...,...,...,...
1416,United States,USA,US,HOUSECOST,NOMINAL,IDX2015,A,2015,100.000000,
1417,United States,USA,US,HOUSECOST,NOMINAL,IDX2015,A,2016,105.730769,
1418,United States,USA,US,HOUSECOST,NOMINAL,IDX2015,A,2017,112.382036,
1419,United States,USA,US,HOUSECOST,NOMINAL,IDX2015,A,2018,119.534759,
