In [1]:
import pandas as pd

In [2]:
# read csv file

gapminder_total = pd.read_csv('../data/gapminder_total.csv')
gapminder_total

# NOTE THAT AFTER READING CSV-FILE DTYPE FOR 'YEAR' AND 'POPULATION' IS 'FLOAT64' AGAIN, NOT 'INT64' :)

Unnamed: 0,country,year,life expectancy,continent,population,fertility
0,Afghanistan,1950.0,26.85,Asia,7752118.0,7.67
1,Afghanistan,1951.0,27.13,Asia,7839426.0,7.67
2,Afghanistan,1952.0,27.67,Asia,7934798.0,7.67
3,Afghanistan,1953.0,28.19,Asia,8038312.0,7.67
4,Afghanistan,1954.0,28.73,Asia,8150037.0,7.67
...,...,...,...,...,...,...
16970,Turks and Caicos Islands,2015.0,,,34339.0,
16971,Tuvalu,2015.0,,,9916.0,
16972,Wallis et Futuna,2015.0,,,13151.0,
16973,Curaçao,2015.0,,,157203.0,


In [3]:
# Convert 'year' and 'population' columns from 'float64' to 'Int64'

#gapminder_total['year'] = gapminder_total['year'].astype('Int64')
#gapminder_total['population'] = gapminder_total['population'].astype('Int64')
#gapminder_total.info()

## Checking for NaN values

In [4]:
# checking for nulls
gapminder_total.isnull().sum()

country               0
year                 22
life expectancy    3268
continent          5549
population          234
fertility          3713
dtype: int64

### 1. Missed 'continent' values

In [5]:
missed_continents_percent = round(100*5549/16975, 2)
missed_continents_percent

32.69

### 33 % of data could not be used for 'continent'-related visualization !!

In [6]:
# continent : 5549 null values

# getting all these records filtered

null_continent = gapminder_total['continent'].isnull()
# gapminder_total[null_continent]['country'].nunique() # 87
gapminder_total[null_continent]['country'].nunique() # 87

# saving list of countries with null continent as DataFrame
df_countries = pd.DataFrame(gapminder_total[null_continent]['country'].unique(), columns=['country'])

# adding a new column 'continent' with default value 'TBD' (To Be Defined)
df_countries['continent_geo'] = 'TBD'
df_countries['continent_politic'] = 'TBD'
df_countries

Unnamed: 0,country,continent_geo,continent_politic
0,Aruba,TBD,TBD
1,Burkina Faso,TBD,TBD
2,Channel Islands,TBD,TBD
3,"Congo, Dem. Rep.",TBD,TBD
4,"Congo, Rep.",TBD,TBD
...,...,...,...
82,North Yemen (former),TBD,TBD
83,South Yemen (former),TBD,TBD
84,Yugoslavia,TBD,TBD
85,Curaçao,TBD,TBD


In [7]:
# cheking is there any missed continent?
# NO, just missed countries in 'continents' dataset :(

gapminder_total['continent'].nunique() # 6
gapminder_total['continent'].unique()

array(['Asia', 'Europe', 'Africa', 'North America', 'South America', nan,
       'Australia and Oceania'], dtype=object)

In [8]:
# Getting the country name, when 'continent' is not defined

df_countries[df_countries['continent_geo'] == 'TBD'].to_csv('../data/countries_no_continent.csv', index=False)

In [9]:
# trying to add continent values for a particuar cuntry
country_name = 'Aruba'
continent_geo_name = 'South America'
continent_politic_name = 'Europe'

# DafaFrame update (found in StackOverflow)
# https://stackoverflow.com/questions/36531289/pandas-update-dataframe-row-with-new-value

df_countries.loc[df_countries['country'] == country_name, 'continent_geo'] = continent_geo_name
df_countries.loc[df_countries['country'] == country_name, 'continent_politic'] = continent_politic_name
df_countries.loc[df_countries['country'] == country_name]


Unnamed: 0,country,continent_geo,continent_politic
0,Aruba,South America,Europe


In [10]:
# List of valid continents

continents_list = ['Asia', 'Europe', 'Africa', 'North America', 'South America', 'Australia and Oceania']

### 2. Missed 'fertility rate' values

In [11]:
missed_fertility_percent = round(100*3713/16975, 2)
missed_fertility_percent

21.87

### 22 % of data could not be used for 'fertility'-related visualization !

### 3. Missed 'life expectancy' values

In [12]:
missed_life_exp_percent = round(100*3268/16975, 2)
missed_life_exp_percent

19.25

### 19 % of data could not be used for 'life expectancy'-related visualization !

### 4. Comparing missed values subsets

In [13]:
# getting all rows with any missed data
# year                 22
# life expectancy    3268
# continent          5549
# population          234
# fertility          3713

mask = (gapminder_total['year'].isnull()) | (gapminder_total['life expectancy'].isnull()) | (gapminder_total['continent'].isnull()) | (gapminder_total['population'].isnull()) | (gapminder_total['fertility'].isnull())
        
gapminder_missed = gapminder_total[mask]

gapminder_missed.reset_index(inplace=True, drop=True)
gapminder_missed

Unnamed: 0,country,year,life expectancy,continent,population,fertility
0,Afghanistan,2016.0,52.72,Asia,,
1,Albania,2016.0,78.10,Europe,,
2,Algeria,2016.0,76.50,Africa,,
3,Angola,2016.0,60.00,Africa,,
4,Antigua and Barbuda,2016.0,76.50,North America,,
...,...,...,...,...,...,...
5882,Turks and Caicos Islands,2015.0,,,34339.0,
5883,Tuvalu,2015.0,,,9916.0,
5884,Wallis et Futuna,2015.0,,,13151.0,
5885,Curaçao,2015.0,,,157203.0,


### GOOD NEWS! Missed values subsets overlap (intercect) !

In [14]:
# saving a list of columns as a separate list
# columns[0] = 'index' could be ignored
columns = gapminder_missed.columns.tolist()
columns

['country', 'year', 'life expectancy', 'continent', 'population', 'fertility']

In [15]:
gapminder_missed

Unnamed: 0,country,year,life expectancy,continent,population,fertility
0,Afghanistan,2016.0,52.72,Asia,,
1,Albania,2016.0,78.10,Europe,,
2,Algeria,2016.0,76.50,Africa,,
3,Angola,2016.0,60.00,Africa,,
4,Antigua and Barbuda,2016.0,76.50,North America,,
...,...,...,...,...,...,...
5882,Turks and Caicos Islands,2015.0,,,34339.0,
5883,Tuvalu,2015.0,,,9916.0,
5884,Wallis et Futuna,2015.0,,,13151.0,
5885,Curaçao,2015.0,,,157203.0,


In [22]:
# types of total (combined) observations with missed data

# adding new column 'notes' - summary for missed data
gapminder_missed['notes'] = ''

#gapminder_missed

for i in range (0, 5887):
    for col in columns:
        #if gapminder_missed.loc[i,col] is None:     #  .loc[row_indexer,col_indexer]
        if str(gapminder_missed.loc[i,col]) == 'nan':     #  .loc[row_indexer,col_indexer]
            gapminder_missed.at[i,'notes'] = gapminder_missed.loc[i,'notes'] + col + ';'
            
#gapminder_missed []

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gapminder_missed['notes'] = ''


In [23]:
gapminder_missed

Unnamed: 0,country,year,life expectancy,continent,population,fertility,notes
0,Afghanistan,2016.0,52.72,Asia,,,population;fertility;
1,Albania,2016.0,78.10,Europe,,,population;fertility;
2,Algeria,2016.0,76.50,Africa,,,population;fertility;
3,Angola,2016.0,60.00,Africa,,,population;fertility;
4,Antigua and Barbuda,2016.0,76.50,North America,,,population;fertility;
...,...,...,...,...,...,...,...
5882,Turks and Caicos Islands,2015.0,,,34339.0,,life expectancy;continent;fertility;
5883,Tuvalu,2015.0,,,9916.0,,life expectancy;continent;fertility;
5884,Wallis et Futuna,2015.0,,,13151.0,,life expectancy;continent;fertility;
5885,Curaçao,2015.0,,,157203.0,,life expectancy;continent;fertility;


In [21]:
d = gapminder_missed.loc[5885,'continent']
str(d)

'nan'

In [33]:
# Getting counts on 'Notes'
missed_stats = pd.DataFrame(gapminder_missed['notes'].value_counts())
missed_stats
#missed_stats.plot(x=missed_stats[1], y=missed_stats[0])

Unnamed: 0,notes
life expectancy;continent;fertility;,3246
continent;,2172
population;fertility;,172
fertility;,144
continent;fertility;,91
continent;population;fertility;,38
year;life expectancy;population;fertility;,22
continent;population;,2


In [34]:
round(100*3246/16975, 2)

19.12

### Some statements about missed data:

1. life expectancy;continent;fertility;
    * Quantity = 3246
    * **the biggest cathegory (19% of total dataset)**
    * only 'population' info is available
    * too much gaps to restore!   
2. continent; 	
    * Quantity = 2172
    * may be fixed just by requesting a new 'continent' dataset (as far as it is a dictionary)

### Not so essential:

3. population;fertility;
    * Quantity = 172
    * life expectancy is missed
    
4. fertility;
    * Quantity = 144
    
5. continent;fertility;
    * Quantity = 91
    
6. continent;population;fertility;
    * Quantity = 38
    
7. year;life expectancy;population;fertility;
    * Quantity = 22
    
8. continent;population;
    * Quantity = 2