# 간단한 Pandas 예제

**Series**

In [1]:
import pandas as pd

In [3]:
myData = pd.read_csv('UseData/stock.adj_close.csv', index_col=0)

In [6]:
# csv에서 Series 하나를 가져온다.
# A005930 - 삼성전자.
mySeries = myData.loc['A005930']['2020-09-09':]

In [7]:
mySeries

2020-09-09    58400.0
2020-09-10    59200.0
2020-09-11    59000.0
2020-09-14    60400.0
2020-09-15    61000.0
2020-09-16    61000.0
2020-09-17    59500.0
2020-09-18    59300.0
Name: A005930, dtype: float64

날짜에 따른 주가가 딸려온 것을 확인할 수 있다. <br>
실제 데이터는 float형태의 배열들. <br>
index가 날짜, 배열의 name이 A005930.

In [8]:
# 현재 Series의 index들엔 뭐가 있는지 확인하려면
# 단순히 .index
mySeries.index

Index(['2020-09-09', '2020-09-10', '2020-09-11', '2020-09-14', '2020-09-15',
       '2020-09-16', '2020-09-17', '2020-09-18'],
      dtype='object')

In [9]:
# name도 마찬가지로 .name
mySeries.name

'A005930'

**Series - loc, iloc을 통한 접근**

In [11]:
# loc을 이용해 특정 index를 갖는 요소에 접근.
mySeries.loc['2020-09-15']

61000.0

In [12]:
# iloc을 이용해 배열 번호로 요소에 접근.
mySeries.iloc[4]

61000.0

In [13]:
# index slicing을 통해 요소들에 접근
mySeries.loc['2020-09-15' : '2020-09-17']

2020-09-15    61000.0
2020-09-16    61000.0
2020-09-17    59500.0
Name: A005930, dtype: float64

In [14]:
# iloc을 이용, 배열 번호 slicing을 통해 요소들에 접근.
mySeries.iloc[4:7]

2020-09-15    61000.0
2020-09-16    61000.0
2020-09-17    59500.0
Name: A005930, dtype: float64

**DataFrame**

In [15]:
# 불러온 데이터에서 Frame 추출.
myDf = myData.T.loc['2020-09-09' :, 'A005900' : 'A006000']

In [16]:
myDf

Symbol,A005930,A005940,A005950,A005960,A005980,A005990
2020-09-09,58400.0,9190.0,9080.0,11000.0,671.0,7990.0
2020-09-10,59200.0,9280.0,9280.0,11150.0,671.0,7960.0
2020-09-11,59000.0,9270.0,9360.0,11050.0,671.0,8120.0
2020-09-14,60400.0,9370.0,9400.0,11200.0,671.0,8200.0
2020-09-15,61000.0,9470.0,9390.0,11350.0,671.0,8180.0
2020-09-16,61000.0,9530.0,9080.0,11250.0,671.0,8180.0
2020-09-17,59500.0,9320.0,8880.0,11000.0,671.0,8000.0
2020-09-18,59300.0,9320.0,9070.0,10850.0,671.0,8020.0


**DataFrame - loc, iloc을 통한 접근**

In [17]:
# loc을 통해 단일 원소에 접근.
myDf.loc['2020-09-10', 'A005930']

59200.0

In [18]:
# Slicing을 통해 여러 원소에 접근
myDf.loc['2020-09-10', 'A005930' : 'A005980']

Symbol
A005930    59200.0
A005940     9280.0
A005950     9280.0
A005960    11150.0
A005980      671.0
Name: 2020-09-10, dtype: float64

In [19]:
# 마찬가지로 slicing을 통해 여러 원소에 접근
myDf.loc['2020-09-10' : '2020-19-15', 'A005930']

2020-09-10    59200.0
2020-09-11    59000.0
2020-09-14    60400.0
2020-09-15    61000.0
2020-09-16    61000.0
2020-09-17    59500.0
2020-09-18    59300.0
Name: A005930, dtype: float64

In [20]:
# Slicing을 통해 여러 Series에 접근
myDf.loc['2020-09-10' : '2020-09-18' : 2, 'A005930' : 'A005980']

Symbol,A005930,A005940,A005950,A005960,A005980
2020-09-10,59200.0,9280.0,9280.0,11150.0,671.0
2020-09-14,60400.0,9370.0,9400.0,11200.0,671.0
2020-09-16,61000.0,9530.0,9080.0,11250.0,671.0
2020-09-18,59300.0,9320.0,9070.0,10850.0,671.0


In [21]:
# :: 으로 모든 원소에 접근
myDf.loc[::, 'A005930']

2020-09-09    58400.0
2020-09-10    59200.0
2020-09-11    59000.0
2020-09-14    60400.0
2020-09-15    61000.0
2020-09-16    61000.0
2020-09-17    59500.0
2020-09-18    59300.0
Name: A005930, dtype: float64

In [22]:
# iloc을 통한 접근
myDf.iloc[5:, :]

Symbol,A005930,A005940,A005950,A005960,A005980,A005990
2020-09-16,61000.0,9530.0,9080.0,11250.0,671.0,8180.0
2020-09-17,59500.0,9320.0,8880.0,11000.0,671.0,8000.0
2020-09-18,59300.0,9320.0,9070.0,10850.0,671.0,8020.0


In [23]:
# 09-09부터 4칸 건너뛰어서 가져오고, 거기서 모든 index중 3번째 column부터 추출
myDf.loc['2020-09-09'::4].iloc[:, 2:]

Symbol,A005950,A005960,A005980,A005990
2020-09-09,9080.0,11000.0,671.0,7990.0
2020-09-15,9390.0,11350.0,671.0,8180.0


In [24]:
# isna()를 통해 결측값이 존재하는지 확인 가능
myDf.isna()

Symbol,A005930,A005940,A005950,A005960,A005980,A005990
2020-09-09,False,False,False,False,False,False
2020-09-10,False,False,False,False,False,False
2020-09-11,False,False,False,False,False,False
2020-09-14,False,False,False,False,False,False
2020-09-15,False,False,False,False,False,False
2020-09-16,False,False,False,False,False,False
2020-09-17,False,False,False,False,False,False
2020-09-18,False,False,False,False,False,False


In [25]:
# A005930 컬럼을 기준으로 값을 오름차순으로 정렬?
myDf.sort_values('A005930', axis=0, ascending=True)

Symbol,A005930,A005940,A005950,A005960,A005980,A005990
2020-09-09,58400.0,9190.0,9080.0,11000.0,671.0,7990.0
2020-09-11,59000.0,9270.0,9360.0,11050.0,671.0,8120.0
2020-09-10,59200.0,9280.0,9280.0,11150.0,671.0,7960.0
2020-09-18,59300.0,9320.0,9070.0,10850.0,671.0,8020.0
2020-09-17,59500.0,9320.0,8880.0,11000.0,671.0,8000.0
2020-09-14,60400.0,9370.0,9400.0,11200.0,671.0,8200.0
2020-09-15,61000.0,9470.0,9390.0,11350.0,671.0,8180.0
2020-09-16,61000.0,9530.0,9080.0,11250.0,671.0,8180.0


날짜 index가 오름차순 정렬된 값에 따라 바뀐것을 확인할 수 있다.

In [26]:
# 축 별로 순위를 매길 수도 있음.
myDf.rank(axis=0, ascending=False)

Symbol,A005930,A005940,A005950,A005960,A005980,A005990
2020-09-09,8.0,8.0,5.5,6.5,4.5,7.0
2020-09-10,6.0,6.0,4.0,4.0,4.5,8.0
2020-09-11,7.0,7.0,3.0,5.0,4.5,4.0
2020-09-14,3.0,3.0,1.0,3.0,4.5,1.0
2020-09-15,1.5,2.0,2.0,1.0,4.5,2.5
2020-09-16,1.5,1.0,5.5,2.0,4.5,2.5
2020-09-17,4.0,4.5,8.0,6.5,4.5,6.0
2020-09-18,5.0,4.5,7.0,8.0,4.5,5.0


**DataFrame - 새로운 데이터 프레임 생성/행or열 추가**

In [27]:
pd.DataFrame([[0,1,2], [3,4,5]],
            index=['00', '11'],
            columns=['a', 'b', 'c'])

Unnamed: 0,a,b,c
0,0,1,2
11,3,4,5


In [28]:
# numpy배열을 통해 DataFrame을 생성할 수도 있다.
import numpy as np
pd.DataFrame(np.ones((3,5))) # 3*5 DataFrame생성

Unnamed: 0,0,1,2,3,4
0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0


In [29]:
simpleFrame = pd.DataFrame([[100, 200], [300, 400]],
                          index=['01', '02'],
                          columns=['A', 'B'])

In [30]:
simpleFrame

Unnamed: 0,A,B
1,100,200
2,300,400


In [32]:
# 새 데이터 추가? 새 Series를 추가하는 셈. 
## 즉, 새 column을 추가.
simpleFrame.loc[:, 'C'] = np.nan
simpleFrame.loc[:, 'D'] = [700, 800]
simpleFrame

Unnamed: 0,A,B,C,D
1,100,200,,700
2,300,400,,800


In [33]:
# 값을 1개만 지정하면 동일한 값으로 채워짐.
# Broadcasting되는 셈.
simpleFrame.loc[:, 'E'] = 900
simpleFrame

Unnamed: 0,A,B,C,D,E
1,100,200,,700,900
2,300,400,,800,900


In [34]:
# column이 아닌 row를 추가하려면?
# row index에 해당하는 값을 지정해 넣어준다.
## column갯수 만큼의 list를 지정해주면 됨.
simpleFrame.loc['03'] = [1000, 1100, 1200, 1300, 1400]
simpleFrame

Unnamed: 0,A,B,C,D,E
1,100,200,,700,900
2,300,400,,800,900
3,1000,1100,1200.0,1300,1400


In [35]:
# 특정 위치에 값을 넣어줄 수도 있다.
simpleFrame.loc['01', 'C'] = 500
simpleFrame

Unnamed: 0,A,B,C,D,E
1,100,200,500.0,700,900
2,300,400,,800,900
3,1000,1100,1200.0,1300,1400


### DataFrame, Series 사용에 익숙해져야 분석을 원활히 할 수 있음.
### 엑셀을 다루기 위한 지식을 배우는 셈.