## 재색인(reindex)
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reindex.html
- https://pandas.pydata.org/docs/reference/api/pandas.Series.reindex.html
- https://pandas.pydata.org/docs/reference/api/pandas.Index.reindex.html
- 인덱스를 새로운 인덱스로 변경하거나 재정렬하는 작업
- 데이터를 새로운 인덱스에 맞게 재배열하거나 누락된 값을 처리하는 데 유용
- 시리즈, 데이터프레임 모두 가능

## 재색인 간단 예제

In [1]:
import pandas as pd

### Series

In [3]:
# Build the starting Series: three integers labelled 'a', 'b', 'c'.
series = pd.Series(data=[1, 2, 3], index=list('abc'))
series

a    1
b    2
c    3
dtype: int64

In [6]:
# Reindex against a new label order. 'd' is not a label of the original
# Series, so its value comes back as NaN.
new_index = list('cbad')
new_series = series.reindex(index=new_index)

In [8]:
# Display the reindexed Series: values are now floats and 'd' is NaN.
new_series

c    3.0
b    2.0
a    1.0
d    NaN
dtype: float64

### DataFrame

In [13]:
# Build the starting DataFrame: columns A and B, rows labelled 'a', 'b', 'c'.
df = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=list('abc'),
)
df

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [15]:
# Reindex the rows against a new label order. 'd' is not a row of the
# original frame, so every column is NaN for it.
new_index = list('cbad')
new_df = df.reindex(index=new_index)

In [17]:
# Display the reindexed frame: values are now floats and row 'd' is all NaN.
new_df

Unnamed: 0,A,B
c,3.0,6.0
b,2.0,5.0
a,1.0,4.0
d,,


## 본 데이터 실습 - 시계열의 재색인

In [19]:
# Columns to read from the CSV: the date plus the four price columns.
cols = [
    'date',
    'open',
    'high',
    'low',
    'close',
]

In [21]:
# Load only the selected columns and use the parsed 'date' column as the index.
stocks = pd.read_csv(
    'AAPL.csv',
    usecols=cols,
    index_col='date',
    parse_dates=['date'],
)

In [25]:
# Select the 2016 rows by passing the year as a string to .loc on the
# date index.
stocks.loc['2016']

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-01-04 00:00:00+00:00,105.35,105.3680,102.00,102.61
2016-01-05 00:00:00+00:00,102.71,105.8500,102.41,105.75
2016-01-06 00:00:00+00:00,100.70,102.3700,99.87,100.56
2016-01-07 00:00:00+00:00,96.45,100.1300,96.43,98.68
2016-01-08 00:00:00+00:00,96.96,99.1100,96.76,98.55
...,...,...,...,...
2016-12-23 00:00:00+00:00,116.52,116.5200,115.59,115.59
2016-12-27 00:00:00+00:00,117.26,117.8000,116.49,116.52
2016-12-28 00:00:00+00:00,116.76,118.0166,116.20,117.52
2016-12-29 00:00:00+00:00,116.73,117.1095,116.40,116.45


현재 위 데이터에 없는 요일이 있다.

In [29]:
# Count rows per weekday name to see which weekdays are missing.
stocks.index.day_name().value_counts()

date
Wednesday    257
Tuesday      257
Thursday     255
Friday       253
Monday       236
Name: count, dtype: int64

In [33]:
# No Saturdays or Sundays. Weekday names alone are not enough to find
# every gap, though: a weekday can also be missing when it is a holiday.
stocks.index

DatetimeIndex(['2015-05-27 00:00:00+00:00', '2015-05-28 00:00:00+00:00',
               '2015-05-29 00:00:00+00:00', '2015-06-01 00:00:00+00:00',
               '2015-06-02 00:00:00+00:00', '2015-06-03 00:00:00+00:00',
               '2015-06-04 00:00:00+00:00', '2015-06-05 00:00:00+00:00',
               '2015-06-08 00:00:00+00:00', '2015-06-09 00:00:00+00:00',
               ...
               '2020-05-11 00:00:00+00:00', '2020-05-12 00:00:00+00:00',
               '2020-05-13 00:00:00+00:00', '2020-05-14 00:00:00+00:00',
               '2020-05-15 00:00:00+00:00', '2020-05-18 00:00:00+00:00',
               '2020-05-19 00:00:00+00:00', '2020-05-20 00:00:00+00:00',
               '2020-05-21 00:00:00+00:00', '2020-05-22 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='date', length=1258, freq=None)

## 월화수목금+토일

### 1. 기간 찾기

In [37]:
# Oldest date in the index.
min_date=stocks.index.min()
min_date

Timestamp('2015-05-27 00:00:00+0000', tz='UTC')

In [39]:
# Most recent date in the index.
max_date=stocks.index.max()
max_date

Timestamp('2020-05-22 00:00:00+0000', tz='UTC')

두 값 모두 Timestamp 객체이므로, 날짜 형식의 문자열 대신 `pd.date_range()` 등에 그대로 전달할 수 있다.

### 2. 비어있는 날짜까지 데이터 생성

In [45]:
# Every calendar day between the oldest and the most recent date, inclusive.
# The result is a DatetimeIndex, so it can be used as a new index.
all_date = pd.date_range(start=min_date, end=max_date, freq='D')
all_date

DatetimeIndex(['2015-05-27 00:00:00+00:00', '2015-05-28 00:00:00+00:00',
               '2015-05-29 00:00:00+00:00', '2015-05-30 00:00:00+00:00',
               '2015-05-31 00:00:00+00:00', '2015-06-01 00:00:00+00:00',
               '2015-06-02 00:00:00+00:00', '2015-06-03 00:00:00+00:00',
               '2015-06-04 00:00:00+00:00', '2015-06-05 00:00:00+00:00',
               ...
               '2020-05-13 00:00:00+00:00', '2020-05-14 00:00:00+00:00',
               '2020-05-15 00:00:00+00:00', '2020-05-16 00:00:00+00:00',
               '2020-05-17 00:00:00+00:00', '2020-05-18 00:00:00+00:00',
               '2020-05-19 00:00:00+00:00', '2020-05-20 00:00:00+00:00',
               '2020-05-21 00:00:00+00:00', '2020-05-22 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', length=1823, freq='D')

In [49]:
# Count days per weekday name in the full range: all seven weekdays appear.
all_date.day_name().value_counts()

Wednesday    261
Thursday     261
Friday       261
Saturday     260
Sunday       260
Monday       260
Tuesday      260
Name: count, dtype: int64

### 3.1 재색인(reindex) + 결측치 채우기('휴장')

In [51]:
# Dates already in the index keep their rows; dates that were missing get
# NaN in every column.
stocks.reindex(all_date)

Unnamed: 0,close,high,low,open
2015-05-27 00:00:00+00:00,132.045,132.26,130.0500,130.34
2015-05-28 00:00:00+00:00,131.780,131.95,131.1000,131.86
2015-05-29 00:00:00+00:00,130.280,131.45,129.9000,131.23
2015-05-30 00:00:00+00:00,,,,
2015-05-31 00:00:00+00:00,,,,
...,...,...,...,...
2020-05-18 00:00:00+00:00,314.960,316.50,310.3241,313.17
2020-05-19 00:00:00+00:00,313.140,318.52,313.0100,315.03
2020-05-20 00:00:00+00:00,319.230,319.52,316.2000,316.68
2020-05-21 00:00:00+00:00,316.850,320.89,315.8700,318.66


In [59]:
# Fill the rows added by reindexing with the marker '휴장', using the
# fill_value= argument of reindex().
# Alternative: reindex first, then call .fillna('휴장') on the result.
# NOTE(review): putting a string into the price columns presumably turns
# them into object dtype -- confirm before doing arithmetic on stocks_new1.
stocks_new1 = stocks.reindex(index=all_date, fill_value='휴장')

### 3.2 재색인(reindex) + 결측치 채우기(이전일)

In [63]:
# The last price carries over across the missing days anyway, so fill the
# gaps with the previous row's values.
stocks.reindex(all_date).ffill()

Unnamed: 0,close,high,low,open
2015-05-27 00:00:00+00:00,132.045,132.26,130.0500,130.34
2015-05-28 00:00:00+00:00,131.780,131.95,131.1000,131.86
2015-05-29 00:00:00+00:00,130.280,131.45,129.9000,131.23
2015-05-30 00:00:00+00:00,130.280,131.45,129.9000,131.23
2015-05-31 00:00:00+00:00,130.280,131.45,129.9000,131.23
...,...,...,...,...
2020-05-18 00:00:00+00:00,314.960,316.50,310.3241,313.17
2020-05-19 00:00:00+00:00,313.140,318.52,313.0100,315.03
2020-05-20 00:00:00+00:00,319.230,319.52,316.2000,316.68
2020-05-21 00:00:00+00:00,316.850,320.89,315.8700,318.66


In [69]:
# Same forward fill, requested through the method= argument of reindex().
# NOTE(review): per the pandas docs, method= fills only rows introduced by
# the reindex and needs a monotonic index, whereas .ffill() also fills NaN
# already present in the data -- confirm the CSV has no pre-existing NaN.
st_new2 = stocks.reindex(index=all_date, method='ffill')


### 4. 요일 열 추가

In [73]:
# Korean weekday name for each date in the index.
# NOTE(review): this presumably requires the 'ko_KR.utf8' locale to be
# available on the machine running the notebook -- confirm.
yo=st_new2.index.day_name(locale='ko_KR.utf8')

In [75]:
# Add the Korean weekday names as the first column.
# DataFrame.insert raises ValueError when the column already exists, so
# re-running this cell would fail. In that case overwrite the existing
# column in place instead of inserting it again.
if '요일' in st_new2.columns:
    st_new2['요일'] = yo
else:
    st_new2.insert(0, '요일', yo)

In [77]:
# Display the forward-filled frame with the weekday column in first position.
st_new2

Unnamed: 0,요일,close,high,low,open
2015-05-27 00:00:00+00:00,수요일,132.045,132.26,130.0500,130.34
2015-05-28 00:00:00+00:00,목요일,131.780,131.95,131.1000,131.86
2015-05-29 00:00:00+00:00,금요일,130.280,131.45,129.9000,131.23
2015-05-30 00:00:00+00:00,토요일,130.280,131.45,129.9000,131.23
2015-05-31 00:00:00+00:00,일요일,130.280,131.45,129.9000,131.23
...,...,...,...,...,...
2020-05-18 00:00:00+00:00,월요일,314.960,316.50,310.3241,313.17
2020-05-19 00:00:00+00:00,화요일,313.140,318.52,313.0100,315.03
2020-05-20 00:00:00+00:00,수요일,319.230,319.52,316.2000,316.68
2020-05-21 00:00:00+00:00,목요일,316.850,320.89,315.8700,318.66
