### [참고] <a href="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas Cheat Sheet</a>

https://pandas.pydata.org/docs/user_guide/missing_data.html

#### NaN(Not a Number) - 표현 불가능한 데이터(비어 있는 값)

- NaN : missing value 를 표현하는 기본 형태
- 기본적으로 float 형식으로 처리됨

#### NA(Not Available) : 결측값
#### None : 값의 부재(값이 존재하지 않거나, 없음, 정의되지 않음)

In [1]:
import pandas as pd

In [22]:
import numpy as np

### [실습 1]

#### 1) missing data 가 포함된 데이터 프레임 생성

In [2]:
# Sample frame for the missing-data exercises. Three different missing
# markers are used on purpose: pd.NA (toy), None and pd.NaT (born).
hero_names = ["Alfred", "Batman", "Catwoman"]
hero_toys = [pd.NA, "Batmobile", "Bullwhip"]
hero_births = [None, pd.Timestamp("19400425"), pd.NaT]

df = pd.DataFrame({"name": hero_names, "toy": hero_toys, "born": hero_births})
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


#### 2) 데이터 타입 확인

In [3]:
# Show the dtype pandas inferred for each column of the frame.
df.dtypes

name            object
toy             object
born    datetime64[ns]
dtype: object

#### 3) missing data 처리

**dropna : missing values 제거**

In [4]:
df.dropna?

Signature:
df.dropna(
    *,
    axis: 'Axis' = 0,
    how: 'AnyAll | NoDefault' = <no_default>,
    thresh: 'int | NoDefault' = <no_default>,
    subset: 'IndexLabel' = None,
    inplace: 'bool' = False,
    ignore_index: 'bool' = False,
) -> 'DataFrame | None'
Docstring:
Remove missing values.

See the :ref:`User Guide <missing_data>` for more on whic

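df.dropna(axis=0, how='any', subset=None, inplace=False, ignore_index=False)

※ how 와 thresh 는 함께 지정할 수 없음 (둘 다 넘기면 TypeError) — thresh 를 쓸 때는 how 를 생략: df.dropna(axis=0, thresh=2)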

In [5]:
# Column-wise: drop every column that contains at least one missing value.

df.dropna(axis=1,how="any")

Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


In [6]:
# Default behaviour (axis=0, how="any"): drop every row that contains
# at least one missing value.

df.dropna()

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [7]:
# Drop only the rows in which every value is missing; no row of this frame
# is entirely missing, so all three rows are kept.
df.dropna(how="all")

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


**fillna : missing values 를 임의의 값으로 채우기**

In [8]:
df.fillna?

Signature:
df.fillna(
    value: 'Hashable | Mapping | Series | DataFrame' = None,
    *,
    method: 'FillnaOptions | None' = None,
    axis: 'Axis | None' = None,
    inplace: 'bool' = False,
    limit: 'int | None' = None,
    downcast: 'dict | None' = None,
) -> 'DataFrame | None'
Docstring:
Fill NA/NaN values using the specified method.

Parameters
----------
value : scalar, dict, Seri

df.fillna(
    value=None,
    method=None,
    axis=None,
    inplace=False,
    limit=None,
    downcast=None,
)

In [9]:
# Replace every missing value in the frame with 0. Note that the "born"
# column then holds the integer 0 next to a timestamp.
df.fillna(0)

Unnamed: 0,name,toy,born
0,Alfred,0,0
1,Batman,Batmobile,1940-04-25 00:00:00
2,Catwoman,Bullwhip,0


In [10]:
# Replacement value to use for missing entries, one per column.

values = dict(
    name="noname",
    toy="Bat",
    born=pd.Timestamp("1900-01-01"),
)

In [11]:
# Fill missing values column by column, using the `values` mapping
# (column name -> replacement) defined in the previous cell.
df.fillna(value=values)

Unnamed: 0,name,toy,born
0,Alfred,Bat,1900-01-01
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,1900-01-01


### [실습 2]

In [12]:
# Roster for exercise 2: name, job and age; the last two people have no
# recorded age.
people = ["John", "Nate", "Yuna", "Abraham", "Brian", "Janny", "Nate", "John"]
jobs = [
    "teacher", "teacher", "teacher",
    "student", "student", "student",
    "teacher", "student",
]
ages = [40, 35, 37, 10, 12, 11, None, None]

data = {"name": people, "job": jobs, "age": ages}
df = pd.DataFrame(data)
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,
7,John,student,


In [13]:
# (rows, columns) of the frame.
df.shape

(8, 3)

In [14]:
# Print the index range, per-column non-null counts and dtypes, and the
# memory usage; a non-null count below the row count reveals missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    8 non-null      object 
 1   job     8 non-null      object 
 2   age     6 non-null      float64
dtypes: float64(1), object(2)
memory usage: 324.0+ bytes


In [15]:
# Boolean mask with the same shape as df: True where a value is missing.
df.isna()

Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,True
7,False,False,True


In [16]:
# Same check through isnull(); it produces the same mask as isna().
df.isnull()

Unnamed: 0,name,job,age
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,True
7,False,False,True


In [17]:
# Preview the age column with missing values replaced by 0. The result is
# not assigned back, so df itself still contains the missing ages.
df.age.fillna(0)

0    40.0
1    35.0
2    37.0
3    10.0
4    12.0
5    11.0
6     0.0
7     0.0
Name: age, dtype: float64

<b>* 상식적으로 나이가 0인 것은 말이 안 되니까 그럴듯한 값으로 변경하기</b><br>
<b>* 선생님의 나이는 다른 선생님들 나이의 중앙값(median)으로, 학생들의 나이 또한 다른 학생들 나이의 중앙값으로 변경하기</b>

In [19]:
# For every row, the median age of that row's job group (missing ages are
# skipped), returned as a Series aligned with df's index.
df.groupby("job")["age"].transform("median")

0    37.0
1    37.0
2    37.0
3    11.0
4    11.0
5    11.0
6    37.0
7    11.0
Name: age, dtype: float64

In [20]:
# Fill each missing age with the median age of the same job group.
#
# The filled column is assigned back to df instead of calling
# fillna(inplace=True) on `df.age`. That chained in-place call works on a
# Series pulled out of the frame: pandas 2.2 emits a FutureWarning for it,
# and with Copy-on-Write (the default from pandas 3.0) df is left unchanged.
df["age"] = df["age"].fillna(df.groupby("job")["age"].transform("median"))

In [21]:
df

Unnamed: 0,name,job,age
0,John,teacher,40.0
1,Nate,teacher,35.0
2,Yuna,teacher,37.0
3,Abraham,student,10.0
4,Brian,student,12.0
5,Janny,student,11.0
6,Nate,teacher,37.0
7,John,student,11.0


### [실습 3]

In [23]:
# Missing-value markers: None, pd.NaT, pd.NA, np.nan
# Exercise frame: columns A-C contain np.nan, column D is fully populated.

rows = [
    [np.nan, 2, np.nan, 0],
    [3, 4, np.nan, 1],
    [np.nan, np.nan, np.nan, 5],
]
df = pd.DataFrame(rows, columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [24]:
# Boolean mask with the same shape as df: True where a value is missing.
df.isna()

Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,True,True,False


In [25]:
# Fill every missing value with 0.

df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5


In [26]:
# Fill missing entries with a column-specific value:
# A -> 0, B -> 1, C -> 2, D -> 3

values = dict(zip("ABCD", range(4)))
df.fillna(value=values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5


In [28]:
# Median

# Fill every missing value in the frame (all columns) with the median of
# column D, i.e. the scalar returned by df["D"].median().

df.fillna(df["D"].median())

Unnamed: 0,A,B,C,D
0,1.0,2.0,1.0,0
1,3.0,4.0,1.0,1
2,1.0,1.0,1.0,5


In [30]:
# Fill every missing value in the frame (all columns) with the maximum of
# column D, i.e. the scalar returned by df["D"].max().

df.fillna(df["D"].max())

Unnamed: 0,A,B,C,D
0,5.0,2.0,5.0,0
1,3.0,4.0,5.0,1
2,5.0,5.0,5.0,5


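#### NaN
    - NaN(결측값) 표현 : np.nan, pd.NaT, pd.NA, None
    - None : == 비교 가능 (None == None 은 True)
    - np.nan, pd.NaT : == 비교 불가능 (자기 자신과 비교해도 False)
    - pd.NA : 비교 결과가 True/False 가 아닌 <NA> 이므로, 결측 여부는 pd.isna() 로 확인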

In [31]:
# NaN is never equal to anything, itself included, so != yields True.
np.nan != np.nan

True

In [32]:
# Equality with NaN is always False, so == cannot be used to detect it.
np.nan == np.nan

False

In [33]:
# NaT behaves like NaN here: it does not compare equal to itself.
pd.NaT == pd.NaT

False

In [34]:
# Comparing pd.NA does not produce a boolean; the result is itself <NA>.
pd.NA == pd.NA

<NA>

In [35]:
# Unlike the markers above, None compares equal to itself.
None == None

True