In [27]:
import pandas as pd

In [None]:
# Load the raw CSV and discard the two columns that are not needed
# ('Unnamed: 0' and 'seq'); the order of the labels passed to drop() does not matter.
# drop(columns=...) is the same as drop(..., axis=1) / axis='columns'.
corona = pd.read_csv("../../csv/corona.csv").drop(columns=['Unnamed: 0', 'seq'])
# Relabel the remaining columns positionally.
corona.columns = ["등록일시", '총사망자', '총확진자', '기준일',
                  '기준시간', '수정일시', '누적의심자', '누적확진율']
# Working frame: rows ordered by '등록일시' ascending, index renumbered from 0
# (the previous index is discarded). sort_values() returns a new object, so
# `corona` itself stays in its loaded order.
df = corona.sort_values(by='등록일시').reset_index(drop=True)


In [29]:
# Preview the first five rows of the sorted, re-indexed frame.
df.head()

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318


In [30]:
# Derived variable: add a daily-deaths column by walking the rows explicitly.
# The first row has no previous day to compare against, so it is seeded with 0.
# Every later row is its cumulative total minus the previous row's cumulative total.
# The label arithmetic (row - 1) relies on the index having been reset to 0..n-1.
datas = [0] + [
    df.loc[row, '총사망자'] - df.loc[row - 1, '총사망자']
    for row in range(1, len(df))
]

df['일일사망자'] = datas

In [31]:
# Preview the first five rows, now including the new '일일사망자' column.
df.head()

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318,5


In [32]:
# Same derived column, this time built with shift().
# shift(n) moves the values of a Series n positions along the index (n defaults to 1),
# so each row of the shifted Series holds the previous row's cumulative total.
previous_total = df['총사망자'].shift()
# The first row has no predecessor, so the subtraction yields NaN there; fill it with 0.
df['일일사망자2'] = (df['총사망자'] - previous_total).fillna(0)

In [33]:
# Same derived column, this time built with diff().
# diff(n) returns each value minus the value n positions earlier in the Series;
# n defaults to 1. The first row comes out as NaN and is replaced with 0.
daily_change = df['총사망자'].diff()
df['일일사망자3'] = daily_change.fillna(0)

In [34]:
# Preview the first five rows with all three daily-deaths variants side by side.
df.head()

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일사망자2,일일사망자3
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0,0.0,0.0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6,6.0,6.0
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6,6.0,6.0
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1,1.0,1.0
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318,5,5.0,5.0


In [35]:
# Extract only the last three columns of the DataFrame.
# What does a negative position mean? In iloc it counts from the end,
# so -3: selects the final three columns.
df.iloc[ : , -3: ]

Unnamed: 0,일일사망자,일일사망자2,일일사망자3
0,0,0.0,0.0
1,6,6.0,6.0
2,6,6.0,6.0
3,1,1.0,1.0
4,5,5.0,5.0
...,...,...,...
815,9,9.0,9.0
816,20,20.0,20.0
817,21,21.0,21.0
818,20,20.0,20.0


In [36]:
# Count the rows where the loop-based and shift()-based columns agree
# (True counts as 1 when summed).
df['일일사망자'].eq(df['일일사망자2']).sum()

np.int64(820)

In [37]:
# Tally how many rows of the loop-based and diff()-based columns match / differ.
df['일일사망자'].eq(df['일일사망자3']).value_counts()

True    820
Name: count, dtype: int64

In [38]:
# describe() prints summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns.
df.describe()

Unnamed: 0,총사망자,총확진자,기준일,누적의심자,누적확진율,일일사망자,일일사망자2,일일사망자3
count,820.0,820.0,820.0,692.0,623.0,820.0,820.0,820.0
mean,3927.832927,1913097.0,20208970.0,7545057.0,1.556435,29.57439,29.57439,29.57439
std,6164.310693,4857030.0,7118.265,6172020.0,0.543014,68.530863,68.530863,68.530863
min,54.0,7513.0,20200310.0,210144.0,0.902205,-2.0,-2.0,-2.0
25%,415.75,23935.25,20201000.0,1934309.0,1.078089,2.0,2.0,2.0
50%,1812.5,118564.0,20210420.0,6368310.0,1.416159,5.0,5.0,5.0
75%,3120.5,397991.5,20211120.0,12168900.0,1.816009,20.0,20.0,20.0
max,24305.0,18188200.0,20220610.0,21518070.0,3.919308,469.0,469.0,469.0


In [39]:
# The summary statistics show a negative minimum for the daily-deaths column.
# Build a boolean mask that is True on rows whose '일일사망자' value is below 0
# so those rows can be inspected.
flag = df['일일사망자'].lt(0)

In [40]:
# A boolean mask can be used as the row selector; loc is the indexer that
# accepts such a condition. Only the rows where the mask is True are returned.
df.loc[flag]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일사망자2,일일사망자3
446,2021-05-30 00:00:00.000,1957,139907,20210530,00:00,2021-10-07 10:30:51.51,9747612.0,1.454166,-2,-2.0,-2.0


In [41]:
# The offending row alone is not enough to see what is wrong with the data,
# so look at it together with the two rows before and the two rows after.
# loc slices by label and includes both end points (444 through 448).
df.loc[444:448]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일사망자2,일일사망자3
444,2021-05-29 00:00:00.000,1951,139427,20210529,00:00,2021-10-07 10:30:51.51,9733588.0,1.450884,5,5.0,5.0
445,2021-05-30 00:00:00.0,1959,140337,20210531,00:00,2021-10-14 13:48:56.821,9761156.0,1.456805,8,8.0,8.0
446,2021-05-30 00:00:00.000,1957,139907,20210530,00:00,2021-10-07 10:30:51.51,9747612.0,1.454166,-2,-2.0,-2.0
447,2021-06-01 00:00:00.000,1963,140796,20210601,00:00,2021-10-07 10:30:51.51,9798400.0,1.45589,6,6.0,6.0
448,2021-06-02 00:00:00.000,1965,141473,20210602,00:00,2021-10-07 10:30:51.51,9834348.0,1.457668,2,2.0,2.0


In [42]:
# Correct the '등록일시' of the row that is recorded under the wrong day.
# The row shown at label 445 has '기준일' 20210531 but a '등록일시' of 2021-05-30,
# which sorted it ahead of the real 05-30 row and produced the negative daily value.
# The row is located by its contents rather than by the hard-coded label 445:
# labels change whenever the frame is re-sorted and re-indexed, so writing to a
# fixed label could overwrite a correct row on a later run. With the mask the
# cell is safe to re-run (a second run matches nothing).
# na=False keeps the mask strictly boolean even if '등록일시' contains missing values.
mislabeled = (df['기준일'] == 20210531) & df['등록일시'].str.startswith('2021-05-30', na=False)
df.loc[mislabeled, '등록일시'] = '2021-05-31 00:00:00.000'

In [43]:
# Re-display labels 444 through 448 (inclusive) to confirm the corrected '등록일시'.
df.loc[444:448]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일사망자2,일일사망자3
444,2021-05-29 00:00:00.000,1951,139427,20210529,00:00,2021-10-07 10:30:51.51,9733588.0,1.450884,5,5.0,5.0
445,2021-05-31 00:00:00.000,1959,140337,20210531,00:00,2021-10-14 13:48:56.821,9761156.0,1.456805,8,8.0,8.0
446,2021-05-30 00:00:00.000,1957,139907,20210530,00:00,2021-10-07 10:30:51.51,9747612.0,1.454166,-2,-2.0,-2.0
447,2021-06-01 00:00:00.000,1963,140796,20210601,00:00,2021-10-07 10:30:51.51,9798400.0,1.45589,6,6.0,6.0
448,2021-06-02 00:00:00.000,1965,141473,20210602,00:00,2021-10-07 10:30:51.51,9834348.0,1.457668,2,2.0,2.0


1. 마지막 3개의 컬럼을 제거
2. 등록일시를 기준으로 오름차순 정렬
3. 인덱스를 초기화하고 기존의 인덱스는 제거
4. '일일사망자' 컬럼을 생성하여 shift() 함수를 이용하여 데이터를 대입
5. '일일확진자' 컬럼을 생성하여 diff() 함수를 이용하여 데이터를 대입
6. 통계정보를 통해서 일일사망자, 일일확진자 컬럼에 음수가 존재하는가? 확인

In [45]:
# Display the frame as it stands before the daily columns are rebuilt.
df

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일사망자2,일일사망자3
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0,0.0,0.0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6,6.0,6.0
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6,6.0,6.0
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1,1.0,1.0
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000,5,5.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...
815,2022-06-04 08:56:49.219,24238,18153814,20220604,00:00,2022-06-08 09:11:26.303,,,9,9.0,9.0
816,2022-06-05 08:53:19.426,24258,18163648,20220605,00:00,2022-06-08 09:11:04.758,,,20,20.0,20.0
817,2022-06-06 09:00:06.734,24279,18168670,20220606,00:00,2022-06-08 09:10:50.441,,,21,21.0,21.0
818,2022-06-07 09:09:00.897,24299,18174842,20220607,00:00,2022-06-08 09:10:36.846,,,20,20.0,20.0


In [None]:
# Remove the three daily-deaths columns, sort again by '등록일시' in ascending
# order, and renumber the index from 0 (the old index is discarded).
# Equivalent ways to drop the last three columns:
#   df.drop(df.columns[-3:], axis=1)   or   df.iloc[:, :-3]
df = (
    df.drop(columns=['일일사망자', '일일사망자2', '일일사망자3'])
      .sort_values(by='등록일시')
      .reset_index(drop=True)
)

In [47]:
# Display the frame after dropping the daily columns and re-sorting.
df

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000
...,...,...,...,...,...,...,...,...
815,2022-06-04 08:56:49.219,24238,18153814,20220604,00:00,2022-06-08 09:11:26.303,,
816,2022-06-05 08:53:19.426,24258,18163648,20220605,00:00,2022-06-08 09:11:04.758,,
817,2022-06-06 09:00:06.734,24279,18168670,20220606,00:00,2022-06-08 09:10:50.441,,
818,2022-06-07 09:09:00.897,24299,18174842,20220607,00:00,2022-06-08 09:10:36.846,,


In [None]:
# Rebuild the daily-deaths column with shift().
# shift(n) moves the values of a Series n positions along the index (n defaults to 1),
# so each row of the shifted Series holds the previous row's cumulative total.
previous_total = df['총사망자'].shift()
# The first row has no predecessor (NaN after the subtraction); fill it with 0.
df['일일사망자'] = (df['총사망자'] - previous_total).fillna(0)

In [None]:
# Build the daily-confirmed column with diff().
# diff(n) returns each value minus the value n positions earlier in the Series;
# n defaults to 1. The first row comes out as NaN and is replaced with 0.
confirmed_change = df['총확진자'].diff()
df['일일확진자'] = confirmed_change.fillna(0)

In [50]:
# Display the frame with the new '일일사망자' and '일일확진자' columns.
df

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0.0,0.0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6.0,242.0
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6.0,114.0
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1.0,110.0
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000,5.0,107.0
...,...,...,...,...,...,...,...,...,...,...
815,2022-06-04 08:56:49.219,24238,18153814,20220604,00:00,2022-06-08 09:11:26.303,,,9.0,12039.0
816,2022-06-05 08:53:19.426,24258,18163648,20220605,00:00,2022-06-08 09:11:04.758,,,20.0,9834.0
817,2022-06-06 09:00:06.734,24279,18168670,20220606,00:00,2022-06-08 09:10:50.441,,,21.0,5022.0
818,2022-06-07 09:09:00.897,24299,18174842,20220607,00:00,2022-06-08 09:10:36.846,,,20.0,6172.0


In [51]:
# Summary statistics again; the min row shows whether either daily column
# still contains negative values.
df.describe()

Unnamed: 0,총사망자,총확진자,기준일,누적의심자,누적확진율,일일사망자,일일확진자
count,820.0,820.0,820.0,692.0,623.0,820.0,820.0
mean,3927.832927,1913097.0,20208970.0,7545057.0,1.556435,29.57439,22171.569512
std,6164.310693,4857030.0,7118.265,6172020.0,0.543014,68.530435,71013.844067
min,54.0,7513.0,20200310.0,210144.0,0.902205,0.0,0.0
25%,415.75,23935.25,20201000.0,1934309.0,1.078089,2.0,125.0
50%,1812.5,118564.0,20210420.0,6368310.0,1.416159,5.0,634.0
75%,3120.5,397991.5,20211120.0,12168900.0,1.816009,20.0,2782.75
max,24305.0,18188200.0,20220610.0,21518070.0,3.919308,469.0,621204.0


In [52]:
# Look for negative values in the daily-deaths column.
# The mask is True on rows whose '일일사망자' value is below 0.
flag = df['일일사망자'].lt(0)

In [53]:
# A boolean mask can be used as the row selector; loc is the indexer that
# accepts such a condition. An empty result means no negative daily deaths remain.
df.loc[flag]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자


In [54]:
# Look for negative values in the daily-confirmed column.
# The mask is True on rows whose '일일확진자' value is below 0.
flag2 = df['일일확진자'] < 0
# Filter with flag2 (the confirmed-cases mask). The original filtered with `flag`,
# the daily-deaths mask, so the confirmed-cases column was never actually checked.
df.loc[flag2]

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자


In [None]:
# Save the DataFrame as a CSV file. The leading './' (directory of the current
# file) may be omitted.
# With the default index=True the row index (0, 1, 2, ...) is written as an
# extra unnamed first column, which shows up as 'Unnamed: 0' when the file is
# read back.
df.to_csv(path_or_buf='./corona.csv', index=True)

In [57]:
# Write the same file again without the row index, so no 'Unnamed: 0'
# index column is created.
df.to_csv(path_or_buf='./corona.csv', index=False)

In [58]:
# Read the saved file back to inspect what was written.
pd.read_csv('corona.csv')

Unnamed: 0,등록일시,총사망자,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자
0,2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0.0,0.0
1,2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6.0,242.0
2,2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6.0,114.0
3,2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1.0,110.0
4,2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000,5.0,107.0
...,...,...,...,...,...,...,...,...,...,...
815,2022-06-04 08:56:49.219,24238,18153814,20220604,00:00,2022-06-08 09:11:26.303,,,9.0,12039.0
816,2022-06-05 08:53:19.426,24258,18163648,20220605,00:00,2022-06-08 09:11:04.758,,,20.0,9834.0
817,2022-06-06 09:00:06.734,24279,18168670,20220606,00:00,2022-06-08 09:10:50.441,,,21.0,5022.0
818,2022-06-07 09:09:00.897,24299,18174842,20220607,00:00,2022-06-08 09:10:36.846,,,20.0,6172.0


In [None]:
# Use specific columns as the index while loading the file.
# NOTE: only the value of the last expression in a cell is displayed, so the
# result of the first call below is computed and then discarded.
pd.read_csv('corona.csv', index_col=[0])        # first column becomes the index
pd.read_csv('corona.csv', index_col=[0,1])      # several columns can be used as the index

Unnamed: 0_level_0,Unnamed: 1_level_0,총확진자,기준일,기준시간,수정일시,누적의심자,누적확진율,일일사망자,일일확진자
등록일시,총사망자,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-03-10 00:00:00.000,54,7513,20200310,00:00,2021-10-07 10:30:51.51,210144.0,3.919308,0.0,0.0
2020-03-11 00:00:00.000,60,7755,20200311,00:00,2021-10-07 10:30:51.51,222395.0,3.804175,6.0,242.0
2020-03-12 00:00:00.000,66,7869,20200312,00:00,2021-10-07 10:30:51.51,234998.0,3.621744,6.0,114.0
2020-03-13 00:00:00.000,67,7979,20200313,00:00,2021-10-07 10:30:51.51,248647.0,3.458499,1.0,110.0
2020-03-14 00:00:00.000,72,8086,20200314,00:00,2021-10-07 10:30:51.51,261335.0,3.318000,5.0,107.0
...,...,...,...,...,...,...,...,...,...
2022-06-04 08:56:49.219,24238,18153814,20220604,00:00,2022-06-08 09:11:26.303,,,9.0,12039.0
2022-06-05 08:53:19.426,24258,18163648,20220605,00:00,2022-06-08 09:11:04.758,,,20.0,9834.0
2022-06-06 09:00:06.734,24279,18168670,20220606,00:00,2022-06-08 09:10:50.441,,,21.0,5022.0
2022-06-07 09:09:00.897,24299,18174842,20220607,00:00,2022-06-08 09:10:36.846,,,20.0,6172.0


In [68]:
# usecols=[0,1,2] limits the read to the first three columns of the file;
# index_col=[0] then uses the first of those as the index.
pd.read_csv('corona.csv', index_col=[0], usecols=[0,1,2])

Unnamed: 0_level_0,총사망자,총확진자
등록일시,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-10 00:00:00.000,54,7513
2020-03-11 00:00:00.000,60,7755
2020-03-12 00:00:00.000,66,7869
2020-03-13 00:00:00.000,67,7979
2020-03-14 00:00:00.000,72,8086
...,...,...
2022-06-04 08:56:49.219,24238,18153814
2022-06-05 08:53:19.426,24258,18163648
2022-06-06 09:00:06.734,24279,18168670
2022-06-07 09:09:00.897,24299,18174842
