In [2]:
import numpy as np
import pandas as pd

In [3]:
# Seed NumPy's global RNG so the random frame is the same on every
# Restart & Run All. NOTE: outputs stored in this notebook were produced
# before seeding; re-run the cells to refresh them.
np.random.seed(0)
# When index/columns are not given, both axes get default integer labels
# starting at 0.
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,0.777918,-1.286055,-0.775266,0.12251
1,0.009682,0.160314,0.047396,-0.026579
2,-2.061808,0.336126,1.566888,1.122732
3,-0.700213,-1.226649,1.513534,0.315415
4,-0.462999,0.090566,1.885927,1.969503
5,0.070185,-0.711787,-0.431826,-0.365646


In [5]:
# Label the four columns 'A' through 'D'.
df.columns = list('ABCD')
# pd.date_range builds a DatetimeIndex (datetime-typed labels); here it is
# six consecutive days starting at 2016-07-07.
df.index = pd.date_range(start='20160707', periods=6)
df.index

DatetimeIndex(['2016-07-07', '2016-07-08', '2016-07-09', '2016-07-10',
               '2016-07-11', '2016-07-12'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# Display the frame to check the new column labels and the date index.
df

Unnamed: 0,A,B,C,D
2016-07-07,0.777918,-1.286055,-0.775266,0.12251
2016-07-08,0.009682,0.160314,0.047396,-0.026579
2016-07-09,-2.061808,0.336126,1.566888,1.122732
2016-07-10,-0.700213,-1.226649,1.513534,0.315415
2016-07-11,-0.462999,0.090566,1.885927,1.969503
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646


In [8]:
# np.nan is NumPy's NaN ("not a number") value; it marks the missing
# entries in the new column 'F'.
df['F'] = [
    1.0, np.nan, 3.5,
    6.1, np.nan, 7.0,
]
df

Unnamed: 0,A,B,C,D,F
2016-07-07,0.777918,-1.286055,-0.775266,0.12251,1.0
2016-07-08,0.009682,0.160314,0.047396,-0.026579,
2016-07-09,-2.061808,0.336126,1.566888,1.122732,3.5
2016-07-10,-0.700213,-1.226649,1.513534,0.315415,6.1
2016-07-11,-0.462999,0.090566,1.885927,1.969503,
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646,7.0


#### NaN 없애기

In [9]:
# Drop every row that contains at least one NaN (returns a new frame;
# df itself is not modified).
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,F
2016-07-07,0.777918,-1.286055,-0.775266,0.12251,1.0
2016-07-09,-2.061808,0.336126,1.566888,1.122732,3.5
2016-07-10,-0.700213,-1.226649,1.513534,0.315415,6.1
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646,7.0


In [10]:
# Drop a row only when all of its values are NaN (returns a new frame;
# df itself is not modified).
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D,F
2016-07-07,0.777918,-1.286055,-0.775266,0.12251,1.0
2016-07-08,0.009682,0.160314,0.047396,-0.026579,
2016-07-09,-2.061808,0.336126,1.566888,1.122732,3.5
2016-07-10,-0.700213,-1.226649,1.513534,0.315415,6.1
2016-07-11,-0.462999,0.090566,1.885927,1.969503,
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646,7.0


#### 주의
`drop` 함수는 기존 DataFrame을 직접 수정하지 않고, 특정 row 또는 column을 drop한 **새로운** DataFrame을 반환한다 (`dropna`, `fillna`도 마찬가지).  
즉, 반환값을 변수에 대입하지 않으면 기존의 DataFrame은 그대로 남는다.  
반환값을 받지 않고 기존의 DataFrame 자체가 변경되도록 하려면 `inplace=True` 인자를 추가한다 (이 경우 함수는 `None`을 반환한다).

In [11]:
# Replace every NaN with a fill value (0.5 here); returns a new frame.
df.fillna(0.5)

Unnamed: 0,A,B,C,D,F
2016-07-07,0.777918,-1.286055,-0.775266,0.12251,1.0
2016-07-08,0.009682,0.160314,0.047396,-0.026579,0.5
2016-07-09,-2.061808,0.336126,1.566888,1.122732,3.5
2016-07-10,-0.700213,-1.226649,1.513534,0.315415,6.1
2016-07-11,-0.462999,0.090566,1.885927,1.969503,0.5
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646,7.0


In [12]:
# Boolean mask with the same shape as df: True where the value is NaN.
pd.isnull(df)

Unnamed: 0,A,B,C,D,F
2016-07-07,False,False,False,False,False
2016-07-08,False,False,False,False,True
2016-07-09,False,False,False,False,False
2016-07-10,False,False,False,False,False
2016-07-11,False,False,False,False,True
2016-07-12,False,False,False,False,False


In [13]:
# Select only the rows whose 'F' value is NaN.
# The mask is built from column 'F' alone instead of computing isnull()
# for the whole frame and then discarding every other column.
df.loc[df['F'].isnull()]

Unnamed: 0,A,B,C,D,F
2016-07-08,0.009682,0.160314,0.047396,-0.026579,
2016-07-11,-0.462999,0.090566,1.885927,1.969503,


In [15]:
# Parse a compact YYYYMMDD string into a pandas Timestamp; this is the
# label type used by the DatetimeIndex of df.
pd.to_datetime('20160707', format='%Y%m%d')

Timestamp('2016-07-07 00:00:00')

In [16]:
# Drop a single row by its index label (returns a new frame).
df.drop(labels=pd.to_datetime('20160707'), axis=0)

Unnamed: 0,A,B,C,D,F
2016-07-08,0.009682,0.160314,0.047396,-0.026579,
2016-07-09,-2.061808,0.336126,1.566888,1.122732,3.5
2016-07-10,-0.700213,-1.226649,1.513534,0.315415,6.1
2016-07-11,-0.462999,0.090566,1.885927,1.969503,
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646,7.0


In [17]:
# Several rows can be dropped at once by passing multiple index labels.
df.drop(labels=pd.to_datetime(['20160709', '20160711']), axis=0)

Unnamed: 0,A,B,C,D,F
2016-07-07,0.777918,-1.286055,-0.775266,0.12251,1.0
2016-07-08,0.009682,0.160314,0.047396,-0.026579,
2016-07-10,-0.700213,-1.226649,1.513534,0.315415,6.1
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646,7.0


In [18]:
# Drop a single column by name (returns a new frame).
df.drop(labels='F', axis='columns')

Unnamed: 0,A,B,C,D
2016-07-07,0.777918,-1.286055,-0.775266,0.12251
2016-07-08,0.009682,0.160314,0.047396,-0.026579
2016-07-09,-2.061808,0.336126,1.566888,1.122732
2016-07-10,-0.700213,-1.226649,1.513534,0.315415
2016-07-11,-0.462999,0.090566,1.885927,1.969503
2016-07-12,0.070185,-0.711787,-0.431826,-0.365646


In [21]:
# Several columns can be dropped at once by passing a list of names.
df.drop(labels=['B', 'D'], axis='columns')

Unnamed: 0,A,C,F
2016-07-07,0.777918,-0.775266,1.0
2016-07-08,0.009682,0.047396,
2016-07-09,-2.061808,1.566888,3.5
2016-07-10,-0.700213,1.513534,6.1
2016-07-11,-0.462999,1.885927,
2016-07-12,0.070185,-0.431826,7.0
