# Feature Selection

## Imports

In [21]:
import pandas as pd
import numpy as np

In [22]:
# Load the raw indicator table from a CSV path relative to the notebook's working directory.
df = pd.read_csv('./data/WDIData.csv')
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Unnamed: 65
0,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,83.120303,83.533457,83.897596,84.171599,84.510171,,,,,
1,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,87.512260,88.129881,87.275323,88.720097,89.308602,90.283638,89.286856,,,
2,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,77.251714,78.165706,75.512153,78.211000,79.065508,81.102134,79.248100,,,
3,Arab World,ARB,"Access to electricity, urban (% of urban popul...",EG.ELC.ACCS.UR.ZS,,,,,,,...,96.435957,96.772853,96.466705,96.936319,97.290083,97.467915,97.063959,,,
4,Arab World,ARB,Account ownership at a financial institution o...,FX.OWN.TOTL.ZS,,,,,,,...,,,30.277130,,,37.165211,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378571,Zimbabwe,ZWE,Women who believe a husband is justified in be...,SG.VAW.REFU.ZS,,,,,,,...,,,,14.500000,,,,,,
378572,Zimbabwe,ZWE,Women who were first married by age 15 (% of w...,SP.M15.2024.FE.ZS,,,,,,,...,,,,3.700000,,,,5.4,,
378573,Zimbabwe,ZWE,Women who were first married by age 18 (% of w...,SP.M18.2024.FE.ZS,,,,,,,...,,,33.500000,32.400000,,,,33.7,,
378574,Zimbabwe,ZWE,Women's share of population ages 15+ living wi...,SH.DYN.AIDS.FE.ZS,,,,,,,...,59.200000,59.300000,59.400000,59.400000,59.500000,59.500000,59.600000,59.6,,


In [23]:
# Print each distinct indicator name once, in order of first appearance.
for indicator_name in df['Indicator Name'].drop_duplicates():
    print(indicator_name)

Access to clean fuels and technologies for cooking (% of population)
Access to electricity (% of population)
Access to electricity, rural (% of rural population)
Access to electricity, urban (% of urban population)
Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+)
Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)
Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)
Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)
Account ownership at a financial institution or with a mobile-money-service provider, poorest 40% (% of population ages 15+)
Account ownership at a financial institution or with a mobile-money-service provider, primary education or less (% of population ages 15+)
Account ownership at a financial institution 

# Which indicators have the lowest number of NaNs?

In [24]:
# Keep only the identifying columns plus the yearly values for 1989-2019.
# Columns are selected by label rather than by position, so the selection
# cannot silently shift if the CSV gains, loses, or reorders columns; a
# missing column raises a KeyError instead.
year_columns = [str(year) for year in range(1989, 2020)]
correct_columns = df[['Country Name', 'Indicator Name'] + year_columns]
correct_columns.head()

Unnamed: 0,Country Name,Indicator Name,1989,1990,1991,1992,1993,1994,1995,1996,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Arab World,Access to clean fuels and technologies for coo...,,,,,,,,,...,82.368101,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,
1,Arab World,Access to electricity (% of population),,,,,,,,,...,86.959991,87.199474,87.51226,88.129881,87.275323,88.720097,89.308602,90.283638,89.286856,
2,Arab World,"Access to electricity, rural (% of rural popul...",,,,54.173305,56.103435,57.350189,57.353298,59.528339,...,75.81616,75.958878,77.251714,78.165706,75.512153,78.211,79.065508,81.102134,79.2481,
3,Arab World,"Access to electricity, urban (% of urban popul...",,,,,,,,,...,96.290866,96.466418,96.435957,96.772853,96.466705,96.936319,97.290083,97.467915,97.063959,
4,Arab World,Account ownership at a financial institution o...,,,,,,,,,...,,22.260538,,,30.27713,,,37.165211,,


In [25]:
# Count the missing values per indicator, summed over every row and every
# column of `correct_columns` that belongs to that indicator.
# A single row-wise NaN count followed by one groupby replaces a full-frame
# boolean filter per indicator. sort=False keeps the indicators in order of
# first appearance; rows whose indicator name is itself missing are left out
# of the grouping (groupby's default dropna behaviour).
nans_per_row = correct_columns.isna().sum(axis=1)
nans_dict = (
    nans_per_row
    .groupby(correct_columns['Indicator Name'], sort=False)
    .sum()
    .to_dict()
)
    


In [26]:
# Order the indicators from fewest to most missing values.
# Sorting the (indicator, count) pairs directly keeps every indicator, even
# when several share the same count; looking keys up by value and stopping at
# the first match would drop all but one indicator of each tie.
# sorted() is stable, so tied indicators keep their original relative order.
sorted_dict = dict(sorted(nans_dict.items(), key=lambda item: item[1]))

In [27]:
# Display the indicator -> NaN-count mapping as the cell output.
sorted_dict

{'Population, total': 53,
 'Population growth (annual %)': 57,
 'Rural population (% of total population)': 101,
 'Rural population': 105,
 'Urban population growth (annual %)': 107,
 'Fixed telephone subscriptions': 291,
 'Mobile cellular subscriptions': 323,
 'Primary education, duration (years)': 356,
 'Population density (people per sq. km of land area)': 370,
 'Land area (sq. km)': 389,
 'Rural population growth (annual %)': 392,
 'Secondary education, duration (years)': 483,
 'GDP (current US$)': 671,
 'GDP per capita (current US$)': 674,
 'Birth rate, crude (per 1,000 people)': 680,
 'Individuals using the Internet (% of population)': 691,
 'Death rate, crude (per 1,000 people)': 697,
 'Inflation, GDP deflator (annual %)': 752,
 'Population ages 00-04, female (% of female population)': 754,
 'Age dependency ratio (% of working-age population)': 757,
 'Arable land (hectares per person)': 760,
 'GDP growth (annual %)': 765,
 'GDP per capita growth (annual %)': 768,
 'Merchandise i

# Choosing particular indicators

In [64]:
# Hand-picked indicators to keep, grouped by theme.
indicators = [
    # economic
    'GDP per capita (current US$)',
    'Consumer price index (2010 = 100)',
    'Exports of goods and services (current US$)',
    'Adjusted net national income per capita (annual % growth)',
    'Exports of goods and services (constant 2010 US$)',

    # social
    'Unemployment, total (% of total labor force) (national estimate)',
    'Adjusted net enrollment rate, primary (% of primary school age children)',
    'Individuals using the Internet (% of population)',
    'Access to electricity (% of population)',
    'Mortality rate, under-5 (per 1,000 live births)',
    'Life expectancy at birth, total (years)',
    'Age dependency ratio (% of working-age population)',
    'Fertility rate, total (births per woman)',
    'Death rate, crude (per 1,000 people)',

    # geographical
    'Land area (sq. km)',
    'Agricultural land (sq. km)',
    'Urban population (% of total population)',
    'Population density (people per sq. km of land area)',
    'Permanent cropland (% of land area)',
]

# Series.isin silently ignores names that match nothing, so a typo in the list
# above would just shrink the result; fail loudly instead.
missing_indicators = sorted(set(indicators) - set(correct_columns['Indicator Name']))
assert not missing_indicators, f"indicators not found in the data: {missing_indicators}"

# Keep only the rows that belong to one of the selected indicators.
final_df = correct_columns[correct_columns['Indicator Name'].isin(indicators)]
# Only the last expression of a cell is displayed, so show the shape here.
final_df.shape

(5016, 33)

In [11]:
# (rows, columns) of the unfiltered frame loaded from the CSV.
df.shape

(378576, 66)

In [65]:
# Persist the filtered frame as CSV; to_csv also writes the row index as the first column by default.
# NOTE(review): the input was read from './data/' but this writes to '../data/' — confirm the target directory is intended.
final_df.to_csv('../data/preprocessed_data.csv')