# DataFrame 조작하기

In [43]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [44]:
# Build a 6x4 frame of standard-normal samples.
# A seeded generator is used so that Restart Kernel -> Run All reproduces
# exactly the same values every time.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)))

In [45]:
# Show the freshly created frame; rows and columns still carry default integer labels.
df

Unnamed: 0,0,1,2,3
0,-0.286711,-1.849803,-0.067774,-0.358252
1,0.900873,-0.66964,-1.205555,-0.401393
2,0.625984,-0.11485,0.815636,-1.464595
3,-1.734248,0.688749,-0.746847,0.108838
4,-1.031167,0.653194,-2.641261,-0.158522
5,0.984634,-0.940848,0.529024,0.10902


In [51]:
# Name the four columns 'a'..'d' and label the six rows with consecutive
# daily dates beginning on 2017-07-01.
df.columns = list('abcd')
df.index = pd.date_range(start='20170701', periods=6)

In [52]:
# Inspect the new row index: six daily timestamps starting at 2017-07-01.
df.index

DatetimeIndex(['2017-07-01', '2017-07-02', '2017-07-03', '2017-07-04',
               '2017-07-05', '2017-07-06'],
              dtype='datetime64[ns]', freq='D')

In [53]:
# Show the frame again, now with named columns and the date index.
df

Unnamed: 0,a,b,c,d
2017-07-01,-0.286711,-1.849803,-0.067774,-0.358252
2017-07-02,0.900873,-0.66964,-1.205555,-0.401393
2017-07-03,0.625984,-0.11485,0.815636,-1.464595
2017-07-04,-1.734248,0.688749,-0.746847,0.108838
2017-07-05,-1.031167,0.653194,-2.641261,-0.158522
2017-07-06,0.984634,-0.940848,0.529024,0.10902


In [28]:
# Add column 'f' with two missing entries (NaN) to use in the
# missing-data examples.
f_values = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df['f'] = f_values
df

Unnamed: 0,a,b,c,d,f
2017-07-01,0.29767,0.329554,0.124229,-0.664267,1.0
2017-07-02,0.377136,0.704876,-0.045853,1.08719,
2017-07-03,-0.869882,0.100878,-2.907582,0.049096,3.5
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447,6.1
2017-07-05,1.106576,-0.382273,0.132159,-0.118904,
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644,7.0


In [29]:
# Remove NaN values.
# how='any' discards every row that contains at least one NaN.
df.dropna(axis=0, how='any')

Unnamed: 0,a,b,c,d,f
2017-07-01,0.29767,0.329554,0.124229,-0.664267,1.0
2017-07-03,-0.869882,0.100878,-2.907582,0.049096,3.5
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447,6.1
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644,7.0


In [30]:
# No row of this frame is entirely NaN, so nothing is removed here.
df.dropna(how='all')   # drops a row only when every value in it is NaN

Unnamed: 0,a,b,c,d,f
2017-07-01,0.29767,0.329554,0.124229,-0.664267,1.0
2017-07-02,0.377136,0.704876,-0.045853,1.08719,
2017-07-03,-0.869882,0.100878,-2.907582,0.049096,3.5
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447,6.1
2017-07-05,1.106576,-0.382273,0.132159,-0.118904,
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644,7.0


In [31]:
# Replace NaN values: every missing entry is filled with 5.0.
df.fillna(5.0)

Unnamed: 0,a,b,c,d,f
2017-07-01,0.29767,0.329554,0.124229,-0.664267,1.0
2017-07-02,0.377136,0.704876,-0.045853,1.08719,5.0
2017-07-03,-0.869882,0.100878,-2.907582,0.049096,3.5
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447,6.1
2017-07-05,1.106576,-0.382273,0.132159,-0.118904,5.0
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644,7.0


In [32]:
# Work with values through a boolean mask: True marks each missing (NaN) entry.
df.isnull()

Unnamed: 0,a,b,c,d,f
2017-07-01,False,False,False,False,False
2017-07-02,False,False,False,False,True
2017-07-03,False,False,False,False,False
2017-07-04,False,False,False,False,False
2017-07-05,False,False,False,False,True
2017-07-06,False,False,False,False,False


In [33]:
# Select the rows whose 'f' value is NaN.
f_is_missing = df['f'].isnull()
df.loc[f_is_missing, :]

Unnamed: 0,a,b,c,d,f
2017-07-02,0.377136,0.704876,-0.045853,1.08719,
2017-07-05,1.106576,-0.382273,0.132159,-0.118904,


In [34]:
# Show df again: the NaN entries in 'f' are still present, i.e. the
# dropna / fillna calls above returned new frames and left df itself untouched.
df

Unnamed: 0,a,b,c,d,f
2017-07-01,0.29767,0.329554,0.124229,-0.664267,1.0
2017-07-02,0.377136,0.704876,-0.045853,1.08719,
2017-07-03,-0.869882,0.100878,-2.907582,0.049096,3.5
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447,6.1
2017-07-05,1.106576,-0.382273,0.132159,-0.118904,
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644,7.0


In [35]:
# Delete rows by index label.
# First build the Timestamp that corresponds to the row label for 2017-07-01.
pd.to_datetime('20170701')

Timestamp('2017-07-01 00:00:00')

In [36]:
# Drop the single row labelled 2017-07-01 (returns a new frame).
first_day = pd.to_datetime('20170701')
df.drop(index=first_day)

Unnamed: 0,a,b,c,d,f
2017-07-02,0.377136,0.704876,-0.045853,1.08719,
2017-07-03,-0.869882,0.100878,-2.907582,0.049096,3.5
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447,6.1
2017-07-05,1.106576,-0.382273,0.132159,-0.118904,
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644,7.0


In [37]:
# Delete two or more rows by index label: pass a list of labels.
rows_to_remove = [pd.to_datetime(day) for day in ('20170701', '20170702')]
df.drop(index=rows_to_remove)

Unnamed: 0,a,b,c,d,f
2017-07-03,-0.869882,0.100878,-2.907582,0.049096,3.5
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447,6.1
2017-07-05,1.106576,-0.382273,0.132159,-0.118904,
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644,7.0


In [38]:
# Delete a column.
# drop() returns a new frame without 'f'; `del df['f']` is the statement
# form that would remove the column from df itself instead.
df.drop(columns='f')


Unnamed: 0,a,b,c,d
2017-07-01,0.29767,0.329554,0.124229,-0.664267
2017-07-02,0.377136,0.704876,-0.045853,1.08719
2017-07-03,-0.869882,0.100878,-2.907582,0.049096
2017-07-04,-0.105144,-0.0582,-1.380671,-1.818447
2017-07-05,1.106576,-0.382273,0.132159,-0.118904
2017-07-06,-2.019374,-0.027466,-0.627642,-0.377644


In [39]:
# Delete several columns at once by passing a list of column names.
unwanted_columns = ['b', 'd']
df.drop(columns=unwanted_columns)

Unnamed: 0,a,c,f
2017-07-01,0.29767,0.124229,1.0
2017-07-02,0.377136,-0.045853,
2017-07-03,-0.869882,-2.907582,3.5
2017-07-04,-0.105144,-1.380671,6.1
2017-07-05,1.106576,0.132159,
2017-07-06,-2.019374,-0.627642,7.0
