# Day 11 Workout - Missing Data

In [64]:
import pandas as pd
import numpy as np

The file `gapminder_with_missing.tsv` contains missing values indicated by `-1`

In [65]:
# Load the tab-separated gapminder table (missing entries are encoded as -1 in this file)
data = pd.read_table('data/gapminder_with_missing.tsv')

In [66]:
# Preview the first five rows of the loaded table
data.head(5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,-1.0
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


Calculate the average `lifeExp`, `pop`, and `gdpPercap`

In [67]:
# Column means on the raw table; the -1 placeholders are still part of the data here
aveLifeExp = data['lifeExp'].mean()
avepop = data['pop'].mean()
aveGDP = data['gdpPercap'].mean()

# One value per line, same as three separate print calls
print(aveLifeExp, avepop, aveGDP, sep='\n')

55.81203833333334
27992515.794014085
6833.852739445188


The values you calculated are not accurate because `-1` was included in the calculation. However, `-1` is used in the file to indicate the entry is missing. Let us deal with that.

Replace all `-1` with `NaN`. *HINT*: use the `replace` function, i.e., `data.replace(...)`.

In [68]:
# Treat the -1 placeholder as missing. `data` is rebound to the returned frame
# rather than mutated with inplace=True, which keeps the call chainable and
# leaves the originally loaded object untouched.
data = data.replace(-1, np.nan)

Calculate the average `lifeExp`, `pop`, and `gdpPercap` again

In [69]:
# Recompute the column means; mean() skips NaN entries by default
aveLifeExp, avepop, aveGDP = (
    data[name].mean() for name in ('lifeExp', 'pop', 'gdpPercap')
)

print(aveLifeExp, avepop, aveGDP, sep='\n')

59.391586600124775
29774810.870786518
7201.590641938527


What are the rows that contain missing values?
- Hint: compare the indices of `data` and `data.dropna()`.

In [70]:
# Row labels present in the full table but gone after dropping incomplete rows
all_rows = set(data.index)
complete_rows = set(data.dropna().index)
sorted(all_rows.difference(complete_rows))

[2,
 7,
 29,
 32,
 39,
 42,
 45,
 58,
 61,
 68,
 69,
 73,
 79,
 83,
 101,
 102,
 104,
 113,
 114,
 127,
 129,
 145,
 147,
 154,
 158,
 159,
 161,
 169,
 173,
 178,
 179,
 185,
 189,
 190,
 192,
 196,
 197,
 200,
 209,
 210,
 224,
 226,
 240,
 243,
 245,
 253,
 259,
 261,
 268,
 276,
 283,
 286,
 293,
 297,
 309,
 320,
 321,
 328,
 338,
 340,
 342,
 354,
 358,
 374,
 380,
 391,
 401,
 410,
 416,
 424,
 430,
 431,
 433,
 438,
 448,
 452,
 470,
 475,
 480,
 492,
 495,
 498,
 501,
 502,
 503,
 524,
 525,
 549,
 550,
 554,
 557,
 574,
 581,
 585,
 587,
 606,
 609,
 631,
 632,
 638,
 644,
 645,
 647,
 650,
 658,
 663,
 665,
 667,
 674,
 680,
 691,
 697,
 711,
 721,
 723,
 727,
 741,
 754,
 762,
 790,
 793,
 799,
 800,
 802,
 804,
 809,
 811,
 821,
 825,
 832,
 838,
 845,
 847,
 853,
 863,
 867,
 868,
 874,
 875,
 880,
 881,
 882,
 889,
 892,
 902,
 903,
 904,
 920,
 947,
 958,
 982,
 983,
 984,
 987,
 994,
 998,
 1005,
 1014,
 1015,
 1017,
 1025,
 1047,
 1059,
 1070,
 1073,
 1081,
 1088,
 10

Which columns contain missing values?

In [71]:
# Solution 1: walk the table row by row and record where null values occur
rm = set()  # index labels of rows holding at least one missing value
cm = set()  # names of columns holding at least one missing value
for i, row in data.iterrows():
    for c, v in row.items():
        if pd.isnull(v):
            rm.add(i)
            cm.add(c)
print(cm)

{'lifeExp', 'gdpPercap', 'pop'}


In [72]:
# Solution 2: columns that disappear once every column containing a NaN is dropped
set(data.columns).difference(data.dropna(axis=1).columns)

{'gdpPercap', 'lifeExp', 'pop'}

In [73]:
# Solution 3 (step 1): boolean mask that is True wherever a value is missing
data.isna()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
1699,False,False,False,False,False,False
1700,False,False,False,False,False,False
1701,False,False,False,False,False,False
1702,False,False,False,False,False,False


In [74]:
# Solution 3 (step 2): flag columns with any missing value, then keep only the flagged labels
s = data.isna().any()
s.index[s.values]

Index(['lifeExp', 'pop', 'gdpPercap'], dtype='object')

### Part 2

The file `gapminder_with_corrupt.tsv` contains the same gapminder data but some numerical entries are corrupt

In [75]:
# Load the corrupted copy of the dataset and preview its first five rows
c_data = pd.read_table('data/gapminder_with_corrupt.tsv')
c_data.head(5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.4453145
1,Afghanistan,Asia,1957,30.332,9240934,820.8530296nn
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.1971382
4,Afghanistan,Asia,1972,36.088,13079460,739.9811058


Calculate the average `lifeExp`, `pop`, and `gdpPercap`

You will need to detect the corrupt values and exclude them from the calculation

*HINT* replace the corrupt values with NaN

In [76]:
# Inspect the inferred column types: a numeric column reported as `object`
# indicates that it holds entries pandas could not parse as numbers.
c_data.dtypes

country      object
continent    object
year          int64
lifeExp      object
pop          object
gdpPercap    object
dtype: object

In [77]:
# Solution 1: Manually update non-numbers
def is_number(n):
    """Return True when ``n`` is a plain non-negative decimal that ``float`` can parse.

    Strings qualify when they consist solely of ASCII digits with at most one
    decimal point and at least one digit (e.g. ``'8425333'``, ``'28.801'``).
    Empty strings, a lone ``'.'`` and strings with several dots are rejected:
    a pure character whitelist lets them through, and ``float`` then raises
    ``ValueError`` on them.

    Values that are already numeric (not bool) are accepted unless they are
    NaN, so the cleaning step can be applied again to an already-converted
    column. Any other type is rejected.
    """
    if isinstance(n, bool):
        return False
    if isinstance(n, (int, float, np.number)):
        return bool(n == n)  # only NaN compares unequal to itself
    if not isinstance(n, str):
        return False
    digits = n.replace('.', '', 1)  # drop at most one decimal point
    return digits != '' and all(ch in '0123456789' for ch in digits)

def convert_to_number(n):
    """Return ``float(n)`` when ``is_number`` accepts ``n``; otherwise return NaN."""
    return float(n) if is_number(n) else np.nan

In [78]:
# Clean each numeric column in turn with convert_to_number
for column in ('lifeExp', 'pop', 'gdpPercap'):
    c_data[column] = c_data[column].apply(convert_to_number)

In [79]:
# Column means after the manual clean-up
aveLifeExp = c_data['lifeExp'].mean()
avepop = c_data['pop'].mean()
aveGDP = c_data['gdpPercap'].mean()

print(aveLifeExp, avepop, aveGDP, sep='\n')

59.44340112104949
29361346.398798797
7186.243429043541


### Solution 2: Using pandas to_numeric function

The `errors` parameter tells `pd.to_numeric` what to do when it meets a value it cannot parse as a number. With `errors='coerce'`, every such value is replaced with `NaN` instead of raising an exception.

In [80]:
# Solution 2: reload the raw file and let pandas coerce unparsable entries to NaN
c_data = pd.read_csv('data/gapminder_with_corrupt.tsv', sep='\t')

for column in ('lifeExp', 'pop', 'gdpPercap'):
    c_data[column] = pd.to_numeric(c_data[column], errors='coerce')

In [81]:
# Column means after coercion with pd.to_numeric
aveLifeExp, avepop, aveGDP = (
    c_data[name].mean() for name in ('lifeExp', 'pop', 'gdpPercap')
)

print(aveLifeExp, avepop, aveGDP, sep='\n')

59.44340112104949
29361346.398798797
7186.243429043541
