## 1. Basics of Python

In [1]:
# Series is the most basic pandas data type.
# It is built here from a plain Python list written in square brackets.

import pandas as pd
import numpy as np

s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [2]:
# date_range produces date-typed index data.
# Give a start date and use the periods option to ask for 6 consecutive days.

dates = pd.date_range(start='20200101', periods=6)
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# Build a DataFrame.
# The data is a 6x4 block of random numbers, the columns are named A, B, C, D,
# and the row index is the dates object created earlier.

df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-04,0.830922,0.083584,-0.948028,0.069074
2020-01-05,-1.240142,1.383337,-0.858488,0.38168
2020-01-06,0.686517,-0.364972,-0.265335,0.492926


In [4]:
# Row labels of df (the DatetimeIndex used when it was built).
df.index

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [6]:
# The underlying data as a NumPy array, without index or column labels.
df.values

array([[ 0.75552186,  0.42445998, -0.76324934,  0.31241209],
       [-1.57585205, -0.43618279, -0.36629052,  1.18595453],
       [ 0.94757358,  0.88895255,  1.45214192,  0.22716969],
       [ 0.83092223,  0.08358401, -0.94802776,  0.0690736 ],
       [-1.24014195,  1.3833373 , -0.85848833,  0.3816796 ],
       [ 0.68651689, -0.36497221, -0.26533468,  0.49292566]])

In [7]:
# Print a summary: index range, non-null count and dtype per column, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2020-01-01 to 2020-01-06
Freq: D
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0 bytes


In [8]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.067423,0.329863,-0.291541,0.444869
std,1.151037,0.715767,0.896841,0.3903
min,-1.575852,-0.436183,-0.948028,0.069074
25%,-0.758477,-0.252833,-0.834679,0.24848
50%,0.721019,0.254022,-0.56477,0.347046
75%,0.812072,0.772829,-0.290574,0.465114
max,0.947574,1.383337,1.452142,1.185955


In [9]:
# Sort the rows by the values in column B, largest first.
df.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2020-01-05,-1.240142,1.383337,-0.858488,0.38168
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-04,0.830922,0.083584,-0.948028,0.069074
2020-01-06,0.686517,-0.364972,-0.265335,0.492926
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955


In [10]:
# Display df again: its row order is still the original date order.
df

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-04,0.830922,0.083584,-0.948028,0.069074
2020-01-05,-1.240142,1.383337,-0.858488,0.38168
2020-01-06,0.686517,-0.364972,-0.265335,0.492926


In [11]:
# Selecting a single column by name returns a Series.
df['A']

2020-01-01    0.755522
2020-01-02   -1.575852
2020-01-03    0.947574
2020-01-04    0.830922
2020-01-05   -1.240142
2020-01-06    0.686517
Freq: D, Name: A, dtype: float64

In [12]:
# An integer slice selects rows by position; the end position is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717


In [13]:
# A slice of date strings selects rows by label; both string formats work
# and the end label is included.
df['2020-01-02':'20200104']

Unnamed: 0,A,B,C,D
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-04,0.830922,0.083584,-0.948028,0.069074


In [14]:
# loc (short for "location") selects data by label when slicing.
# Here a single row is picked using the first date as the row label.

df.loc[dates[0]]

A    0.755522
B    0.424460
C   -0.763249
D    0.312412
Name: 2020-01-01 00:00:00, dtype: float64

In [15]:
# All rows, only columns A and B.
df.loc[:,['A','B']]

Unnamed: 0,A,B
2020-01-01,0.755522,0.42446
2020-01-02,-1.575852,-0.436183
2020-01-03,0.947574,0.888953
2020-01-04,0.830922,0.083584
2020-01-05,-1.240142,1.383337
2020-01-06,0.686517,-0.364972


In [16]:
# Rows from 2020-01-02 through 2020-01-04 (end label included), columns A and C.
df.loc['20200102':'20200104',['A','C']]

Unnamed: 0,A,C
2020-01-02,-1.575852,-0.366291
2020-01-03,0.947574,1.452142
2020-01-04,0.830922,-0.948028


In [17]:
# Same row range, this time with columns A and B.
df.loc['20200102':'20200104',['A','B']]

Unnamed: 0,A,B
2020-01-02,-1.575852,-0.436183
2020-01-03,0.947574,0.888953
2020-01-04,0.830922,0.083584


In [18]:
# A single row label with a list of columns returns a Series.
df.loc['20200102',['A','B']]

A   -1.575852
B   -0.436183
Name: 2020-01-02 00:00:00, dtype: float64

In [19]:
# One row label and one column label return a single scalar value.
df.loc[dates[0],'A']

0.7555218640502255

In [20]:
# Unlike loc, sometimes you want to reach the data directly by row and column number;
# the command for that is iloc.
# With iloc you give the positions (or position ranges) of the rows and columns.

df.iloc[3]

A    0.830922
B    0.083584
C   -0.948028
D    0.069074
Name: 2020-01-04 00:00:00, dtype: float64

In [21]:
# Rows at positions 3-4 and columns at positions 0-1 (end positions excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2020-01-04,0.830922,0.083584
2020-01-05,-1.240142,1.383337


In [22]:
# Lists of integers pick arbitrary rows (positions 1, 2, 4) and columns (positions 0, 2).
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2020-01-02,-1.575852,-0.366291
2020-01-03,0.947574,1.452142
2020-01-05,-1.240142,-0.858488


In [23]:
# Rows at positions 1-2 with every column.
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717


In [24]:
# Every row with the columns at positions 1-2.
df.iloc[:,1:3]

Unnamed: 0,B,C
2020-01-01,0.42446,-0.763249
2020-01-02,-0.436183,-0.366291
2020-01-03,0.888953,1.452142
2020-01-04,0.083584,-0.948028
2020-01-05,1.383337,-0.858488
2020-01-06,-0.364972,-0.265335


In [25]:
# Display the full DataFrame again for reference.
df

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-04,0.830922,0.083584,-0.948028,0.069074
2020-01-05,-1.240142,1.383337,-0.858488,0.38168
2020-01-06,0.686517,-0.364972,-0.265335,0.492926


In [26]:
# You can keep only the rows that satisfy a condition.
# A column can be referred to either as df['A'] or as df.A.

df[df.A > 0]

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-04,0.830922,0.083584,-0.948028,0.069074
2020-01-06,0.686517,-0.364972,-0.265335,0.492926


In [27]:
# When the condition is applied to the whole DataFrame, every cell that does
# not satisfy it is shown as NaN.

df[df > 0]

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,,0.312412
2020-01-02,,,,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-04,0.830922,0.083584,,0.069074
2020-01-05,,1.383337,,0.38168
2020-01-06,0.686517,,,0.492926


In [28]:
# Assigning a DataFrame with a plain "=" does not copy its contents; only the
# reference to the data is copied, so there is still just one underlying object.
# copy() is used here to get a separate DataFrame instead.

df2 = df.copy()

In [29]:
# Add a new column to an existing DataFrame (one value per row).

df2['E'] = ['one','one','two','three','four','three']
df2

Unnamed: 0,A,B,C,D,E
2020-01-01,0.755522,0.42446,-0.763249,0.312412,one
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955,one
2020-01-03,0.947574,0.888953,1.452142,0.22717,two
2020-01-04,0.830922,0.083584,-0.948028,0.069074,three
2020-01-05,-1.240142,1.383337,-0.858488,0.38168,four
2020-01-06,0.686517,-0.364972,-0.265335,0.492926,three


In [30]:
# Use isin to test whether each value in a column is one of the given values;
# the result is a boolean Series.

df2['E'].isin(['two','four'])

2020-01-01    False
2020-01-02    False
2020-01-03     True
2020-01-04    False
2020-01-05     True
2020-01-06    False
Freq: D, Name: E, dtype: bool

In [31]:
# Use the boolean result of isin as a mask to keep only the matching rows.
df2[df2['E'].isin(['two','four'])]

Unnamed: 0,A,B,C,D,E
2020-01-03,0.947574,0.888953,1.452142,0.22717,two
2020-01-05,-1.240142,1.383337,-0.858488,0.38168,four


In [32]:
# Display df: it has no column E, since that column was added to the copy df2.
df

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-02,-1.575852,-0.436183,-0.366291,1.185955
2020-01-03,0.947574,0.888953,1.452142,0.22717
2020-01-04,0.830922,0.083584,-0.948028,0.069074
2020-01-05,-1.240142,1.383337,-0.858488,0.38168
2020-01-06,0.686517,-0.364972,-0.265335,0.492926


In [33]:
# To look at the data in a more statistical way, apply a function to it with
# the apply command.
# For a running (cumulative) sum down each column, pass numpy's cumsum.

df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2020-01-01,0.755522,0.42446,-0.763249,0.312412
2020-01-02,-0.82033,-0.011723,-1.12954,1.498367
2020-01-03,0.127243,0.87723,0.322602,1.725536
2020-01-04,0.958166,0.960814,-0.625426,1.79461
2020-01-05,-0.281976,2.344151,-1.483914,2.17629
2020-01-06,0.404541,1.979179,-1.749249,2.669215


In [34]:
# To get the difference between the maximum and minimum of each column,
# pass a one-line lambda function to apply.

df.apply(lambda x: x.max() - x.min())

A    2.523426
B    1.819520
C    2.400170
D    1.116881
dtype: float64