# 결측치(Missing Value) 처리 실습
- 결측치(Missing Value)란?
    - 값이 들어있지 않은 것으로 NaN, NA로 표기
- 관련 메서드
    - 체크 : isnull(), isna(), notnull(), notna()
    - 삭제 : dropna()
    - 치환 : fillna()

### 처리방법 (1) 삭제

In [2]:
# 모듈 삽입
import pandas as pd

# 파일 경로 관련 변수 선언
DIR_PATH = r'../Data/'
FILE_NAME = DIR_PATH + 'weather.csv'

In [3]:
# csv => dataframe으로 로딩
df = pd.read_csv(FILE_NAME)
df

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,
5,MX17004,2010,3,tmin,,,,,14.2,,...,,,,,,,,,,
6,MX17004,2010,4,tmax,,,,,,,...,,,,,,36.3,,,,
7,MX17004,2010,4,tmin,,,,,,,...,,,,,,16.7,,,,
8,MX17004,2010,5,tmax,,,,,,,...,,,,,,33.2,,,,
9,MX17004,2010,5,tmin,,,,,,,...,,,,,,18.2,,,,


In [4]:
# 데이터 요약 정보 확인
df.info(), df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 35 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       22 non-null     object 
 1   year     22 non-null     int64  
 2   month    22 non-null     int64  
 3   element  22 non-null     object 
 4   d1       2 non-null      float64
 5   d2       4 non-null      float64
 6   d3       4 non-null      float64
 7   d4       2 non-null      float64
 8   d5       8 non-null      float64
 9   d6       2 non-null      float64
 10  d7       2 non-null      float64
 11  d8       2 non-null      float64
 12  d9       0 non-null      float64
 13  d10      2 non-null      float64
 14  d11      2 non-null      float64
 15  d12      0 non-null      float64
 16  d13      2 non-null      float64
 17  d14      4 non-null      float64
 18  d15      2 non-null      float64
 19  d16      2 non-null      float64
 20  d17      2 non-null      float64
 21  d18      0 non-nul

(None,
 Index(['id', 'year', 'month', 'element', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6',
        'd7', 'd8', 'd9', 'd10', 'd11', 'd12', 'd13', 'd14', 'd15', 'd16',
        'd17', 'd18', 'd19', 'd20', 'd21', 'd22', 'd23', 'd24', 'd25', 'd26',
        'd27', 'd28', 'd29', 'd30', 'd31'],
       dtype='object'))

In [5]:
# 결측치 갯수 파악
print(df.isnull().sum())

print(f'총 결측치의 합 : {df.isnull().sum().sum()}개')

id          0
year        0
month       0
element     0
d1         20
d2         18
d3         18
d4         20
d5         14
d6         20
d7         20
d8         20
d9         22
d10        20
d11        20
d12        22
d13        20
d14        18
d15        20
d16        20
d17        20
d18        22
d19        22
d20        22
d21        22
d22        22
d23        18
d24        22
d25        20
d26        20
d27        16
d28        20
d29        18
d30        20
d31        20
dtype: int64
총 결측치의 합 : 616개


In [6]:
# 날짜별 관측치가 하나도 없으면 column을 지운다.
df.dropna(axis = 1, how='all' ,inplace=True)
# 1월 데이터만 추출
month_1 = df[df['month'] == 1]
# NaN(결측치) 제거 / 최대, 최소 온도가 모두 NaN인 것
month_1_remove = month_1.dropna(axis = 1, how='all')
month_1_remove

Unnamed: 0,id,year,month,element,d30
0,MX17004,2010,1,tmax,27.8
1,MX17004,2010,1,tmin,14.5


In [7]:
# 10월 데이터만 추출
month_10 = df[df['month'] == 10]
# NaN(결측치) 제거 / 최대, 최소 온도가 모두 NaN인 것
month_10_remove = month_10.dropna(axis = 1, how='all')
month_10_remove

Unnamed: 0,id,year,month,element,d5,d7,d14,d15,d28
16,MX17004,2010,10,tmax,27.0,28.1,29.5,28.7,31.2
17,MX17004,2010,10,tmin,14.0,12.9,13.0,10.5,15.0


# ------------------------------------------------과제 0704------------------------------------------------

In [15]:
# 과제 0704 - (1) 2010년 1월 ~ 12월까지 최고온도 - 최저온도 데이터 추출
df2 = df.drop(['id', 'year'], axis=1)
df2_trans = df2.transpose()
df2_trans[2:].max()
df2_trans[2:].min()
for i in range(1,12):
    if i > 8:
        print(f'{i+1}월의 최고온도는', end=' ')
    else:
        print(f'{i}월의 최고온도는', end=' ')
    print(f'[{df2_trans[2:].max()[(i-1)*2]}]', end=' ')
    print(f'최저온도는', end=' ')
    print(f'[{df2_trans[2:].max()[((i-1)*2)+1]}]')    # 9월 데이터가 빠져있습니다.

1월의 최고온도는 [27.8] 최저온도는 [14.5]
2월의 최고온도는 [29.9] 최저온도는 [14.4]
3월의 최고온도는 [34.5] 최저온도는 [17.6]
4월의 최고온도는 [36.3] 최저온도는 [16.7]
5월의 최고온도는 [33.2] 최저온도는 [18.2]
6월의 최고온도는 [30.1] 최저온도는 [18.0]
7월의 최고온도는 [29.9] 최저온도는 [17.5]
8월의 최고온도는 [29.8] 최저온도는 [17.3]
10월의 최고온도는 [31.2] 최저온도는 [15.0]
11월의 최고온도는 [31.3] 최저온도는 [16.3]
12월의 최고온도는 [29.9] 최저온도는 [13.8]


# -------------------------------------------------------------------------------------------------------------

### 처리방법 (2) 치환

In [9]:
# csv => dataframe으로 로딩
df = pd.read_csv(FILE_NAME)
df.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [10]:
# (2) NaN 데이터를 다른 값으로 치환(변경)하기 => fillna()
# 모든 NaN을 0으로 채우기
df.fillna(0).head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.8,0.0
1,MX17004,2010,1,tmin,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.5,0.0
2,MX17004,2010,2,tmax,0.0,27.3,24.1,0.0,0.0,0.0,...,0.0,29.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,MX17004,2010,2,tmin,0.0,14.4,14.4,0.0,0.0,0.0,...,0.0,10.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MX17004,2010,3,tmax,0.0,0.0,0.0,0.0,32.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
