# Mod13 Handling Missing Data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
np.__version__

'1.19.1'

In [3]:
pd.__version__

'1.0.5'

### Detecting null values

In [8]:
# Mixing an int, a str and None gives an 'object' Series, the one dtype
# in which None is stored as None.
d1 = pd.Series([1, 'hello', None])
d1

0        1
1    hello
2     None
dtype: object

In [11]:
# When the other values are numeric, None is converted to NaN and the Series becomes float64
d2 = pd.Series([1, None]); d2

0    1.0
1    NaN
dtype: float64

In [10]:
# None can only exist in 'object' array
data = pd.Series([1, np.nan, 'hello', None])   
data                                     # 'hello' makes this an object Series, so None is not converted to NaN

0        1
1      NaN
2    hello
3     None
dtype: object

In [None]:
# NaN is defined by NumPy (np.nan), not by pandas; pandas is built on top of NumPy

In [6]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [12]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [14]:
# Masking 
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [13]:
# Masking 
data[data.notnull()]

0        1
2    hello
dtype: object

In [15]:
# None is converted to NaN when the remaining values are numeric
d2 = pd.Series([1, np.nan, 33, None])
d2

0     1.0
1     NaN
2    33.0
3     NaN
dtype: float64

In [18]:
%timeit d2.isnull().any()

63.9 µs ± 1.24 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [16]:
d2.isnull().any()  # any() returns True if at least one element is True

True

In [19]:
%timeit d2.isnull().values.any()

41.2 µs ± 688 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [17]:
d2.isnull().values.any()

True

In [44]:
np.isnan(d2.values).any()

True

In [46]:
%timeit np.isnan(d2.values).any()     # fastest of the approaches timed here; works on numeric data only, not on text

2 µs ± 8.52 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [22]:
# Load seaborn's 'planets' example dataset and show its (rows, columns) shape.
planets = sns.load_dataset('planets')
planets.shape

(1035, 6)

In [25]:
planets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   method          1035 non-null   object 
 1   number          1035 non-null   int64  
 2   orbital_period  992 non-null    float64
 3   mass            513 non-null    float64
 4   distance        808 non-null    float64
 5   year            1035 non-null   int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 48.6+ KB


In [61]:
planets.loc[1:]

Unnamed: 0,method,number,orbital_period,mass,distance,year
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
5,Radial Velocity,1,185.840000,4.80,76.39,2008
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [35]:
%timeit planets.isnull().any().any()

385 µs ± 3.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [54]:
# Un-timed twin of the `%timeit planets.isnull().any().any()` cell above.
# The earlier form, np.isnan(planets.loc[1:].any().any()), collapsed the frame
# to a single bool and then asked whether that bool is NaN, so it reported
# False even though the frame contains missing values.
planets.isnull().any().any()

True

In [33]:
%timeit planets.isnull().values.any()

294 µs ± 4.09 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [59]:
np.isnan(planets.iloc[:,1:].values).any()

True

In [60]:
%timeit np.isnan(planets.iloc[:,1:].values).any()

200 µs ± 3.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [42]:
# np.isnan cannot be applied here: the frame contains a text column, so
# planets.values is an object array and the call raises TypeError.
# Catch and print the error so the notebook still runs top to bottom.
try:
    np.isnan(planets.values)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [62]:
planets.values

array([['Radial Velocity', 1, 269.3, 7.1, 77.4, 2006],
       ['Radial Velocity', 1, 874.7739999999999, 2.21, 56.95, 2008],
       ['Radial Velocity', 1, 763.0, 2.6, 19.84, 2011],
       ...,
       ['Transit', 1, 3.1915239, nan, 174.0, 2007],
       ['Transit', 1, 4.1250828, nan, 293.0, 2008],
       ['Transit', 1, 4.187757, nan, 260.0, 2008]], dtype=object)

### Dropping null values

For a ``Series``

In [63]:
planets['mass']

0        7.10
1        2.21
2        2.60
3       19.40
4       10.50
        ...  
1030      NaN
1031      NaN
1032      NaN
1033      NaN
1034      NaN
Name: mass, Length: 1035, dtype: float64

In [73]:
planets['mass'][planets['mass'].notnull()]  # boolean-mask approach

0       7.100
1       2.210
2       2.600
3      19.400
4      10.500
        ...  
784     0.947
913    19.800
914     0.340
915     0.400
916     1.540
Name: mass, Length: 513, dtype: float64

In [74]:
%timeit planets['mass'][planets['mass'].notnull()]

265 µs ± 4.41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [75]:
planets.dropna()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.30000,7.100,77.40,2006
1,Radial Velocity,1,874.77400,2.210,56.95,2008
2,Radial Velocity,1,763.00000,2.600,19.84,2011
3,Radial Velocity,1,326.03000,19.400,110.62,2007
4,Radial Velocity,1,516.22000,10.500,119.47,2009
...,...,...,...,...,...,...
640,Radial Velocity,1,111.70000,2.100,14.90,2009
641,Radial Velocity,1,5.05050,1.068,44.46,2013
642,Radial Velocity,1,311.28800,1.940,17.24,1999
649,Transit,1,2.70339,1.470,178.00,2013


In [None]:
data.dropna()

For a ``DataFrame``

In [76]:
# 3x3 example frame with one missing value in column 0 and one in column 1;
# column 2 is complete.
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


By default, ``dropna()`` will drop all rows in which *any* null value is present:

In [77]:
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [78]:
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [79]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [80]:
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [81]:
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


### Filling null values

#### For ``Series``

Masking style

In [None]:
# Five entries labelled 'a'..'e'; the np.nan and None entries ('b' and 'd') are the missing ones.
d1 = pd.Series(
    data=[1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
d1

In [None]:
d1.isnull()

In [None]:
# Masking style: pick the null entries with a boolean mask and overwrite them with 0 (this modifies d1 itself)
d1[d1.isnull()]=0
d1

Fill NA entries with a single value, such as zero:

In [None]:
# Series labelled 'a'..'e' with two missing entries, used by the fillna examples.
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

In [None]:
# fill with zero
data.fillna(0)

In [None]:
# fill with mean
data.fillna(data.mean())

In [None]:
# fill with median
data.fillna(data.median())

We can specify a forward-fill to propagate the previous value forward:

In [None]:
# forward-fill: each missing entry takes the last valid value before it.
# ffill() is the equivalent of fillna(method='ffill'); the `method` argument
# of fillna is deprecated in newer pandas releases.
data.ffill()

Or we can specify a back-fill to propagate the next values backward:

In [None]:
# back-fill: each missing entry takes the next valid value after it.
# bfill() is the equivalent of fillna(method='bfill'); the `method` argument
# of fillna is deprecated in newer pandas releases.
data.bfill()

#### For ``DataFrame``

In [None]:
df

In [None]:
type(df.iloc[1])

In [None]:
# forward-fill along each row (axis=1): a missing cell takes the value to its left.
# ffill(axis=1) is the equivalent of fillna(method='ffill', axis=1), whose `method`
# argument is deprecated in newer pandas releases.
df.ffill(axis=1)

In [None]:
# forward-fill down each column (default axis): a missing cell takes the value above it.
# ffill() is the equivalent of fillna(method='ffill'), whose `method` argument is
# deprecated in newer pandas releases.
df.ffill()

## Lab

<b>有一個 DataFrame df，試著計算空值個數有幾個?</b>

In [None]:
# Lab data: a 6x5 float array with several NaN entries, wrapped in a DataFrame
# whose columns are named 'one' .. 'five'.
arr = np.array(
    [
        [8.0, 2.0, 17.0, 20.0, 10.0],
        [4.0, np.nan, 3.0, np.nan, 2.0],
        [24.0, 26.0, 14.0, 23.0, 21.0],
        [7.0, 29.0, 3.0, 19.0, 25.0],
        [14.0, 24.0, np.nan, 21.0, 10.0],
        [np.nan, np.nan, 20.0, 26.0, np.nan],
    ]
)

df = pd.DataFrame(data=arr, columns='one two three four five'.split())
df

In [None]:
df.values

<b>試著將有空值的列拋棄</b>

<b>試著將空值超過一個以上的列拋棄</b>