## 12. 확인


## 참고자료
* [Python 완전정복 시리즈] 2편 : Pandas DataFrame 완전정복 : https://wikidocs.net/book/7188

In [1]:
import pandas as pd
import numpy as np

## 최대/최솟값이 포함된 행/열 (idxmax / idxmin)

In [2]:
# Sample 3x3 frame for the idxmax / idxmin examples; row3/col3 is missing.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0+.
n = np.nan
idx = ['row1', 'row2', 'row3']
col = ['col1', 'col2', 'col3']
data = [[1, 2, 200], [100, 5, 6], [7, 300, n]]
df = pd.DataFrame(data, index=idx, columns=col)
df

Unnamed: 0,col1,col2,col3
row1,1,2,200.0
row2,100,5,6.0
row3,7,300,


In [3]:
# For each column, the row label that holds the maximum value.
df.idxmax(axis=0)

col1    row2
col2    row3
col3    row1
dtype: object

In [4]:
# For each column, the row label that holds the minimum value.
df.idxmin(axis=0)

col1    row1
col2    row1
col3    row2
dtype: object

In [7]:
# For each row, the column label that holds the maximum value
# (the NaN in row3 is skipped by default).
df.idxmax(axis=1)

row1    col3
row2    col1
row3    col2
dtype: object

In [6]:
# For each row, the column label that holds the minimum value.
df.idxmin(axis=1)

row1    col1
row2    col2
row3    col1
dtype: object

In [8]:
# With skipna=False the row containing NaN (row3) yields NaN instead of a
# column label, as the output below shows.
df.idxmax(axis=1, skipna=False)

row1    col3
row2    col1
row3     NaN
dtype: object

## 비어있는지 확인 (empty)

In [9]:
# 3x3 frame in which every cell is an empty string.
data_empty = [[''] * 3 for _ in range(3)]
df = pd.DataFrame(
    data_empty,
    index=['row1', 'row2', 'row3'],
    columns=['col1', 'col2', 'col3'],
)
df

Unnamed: 0,col1,col2,col3
row1,,,
row2,,,
row3,,,


In [10]:
# False: cells that hold empty strings still count as data.
df.empty

False

In [11]:
# 3x3 frame in which every cell is NaN (the scalar is broadcast to all cells).
df = pd.DataFrame(
    np.nan,
    index=['row1', 'row2', 'row3'],
    columns=['col1', 'col2', 'col3'],
)
df

Unnamed: 0,col1,col2,col3
row1,,,
row2,,,
row3,,,


In [12]:
# False: a frame filled entirely with NaN is still not empty.
df.empty

False

In [13]:
# Frame that has row labels but no columns.
row_labels_only = ['row1', 'row2', 'row3']
df1 = pd.DataFrame(index=row_labels_only)
df1

row1
row2
row3


In [14]:
# True: df1 has an index but no columns.
df1.empty

True

In [15]:
# Frame that has column labels but no rows.
column_labels_only = ['col1', 'col2', 'col3']
df2 = pd.DataFrame(columns=column_labels_only)
df2

Unnamed: 0,col1,col2,col3


In [16]:
# True: df2 has columns but no rows.
df2.empty

True

## 일치하는 요소 확인 (isin)

In [17]:
# Sample 3x3 integer frame for the isin examples.
row = ['row1', 'row2', 'row3']
col = ['col1', 'col2', 'col3']
data = [
    [1, 1, 1],
    [2, 3, 4],
    [5, 3, 6],
]
df = pd.DataFrame(data, index=row, columns=col)
df

Unnamed: 0,col1,col2,col3
row1,1,1,1
row2,2,3,4
row3,5,3,6


In [18]:
df.isin([1,3]) # True where the element equals 1 or 3

Unnamed: 0,col1,col2,col3
row1,True,True,True
row2,False,True,False
row3,False,True,False


In [19]:
# Per-column candidates: col1 is tested against [2,3] and col3 against [1,6];
# col2 is not listed and comes out all False in the output below.
df.isin({'col1': [2,3], 'col3': [1,6]})

Unnamed: 0,col1,col2,col3
row1,False,False,True
row2,True,False,False
row3,False,False,True


In [20]:
# Reference frame for isin(DataFrame): covers only rows row1/row3 and
# columns col1/col3.
match_row = ['row1', 'row3']
match_col = ['col1', 'col3']
match_data = [
    [2, 3],
    [1, 6],
]
match_df = pd.DataFrame(match_data, index=match_row, columns=match_col)
match_df

Unnamed: 0,col1,col3
row1,2,3
row3,1,6


In [21]:
df.isin(match_df) # True only where the value equals match_df's value at the same row and column label

Unnamed: 0,col1,col2,col3
row1,False,False,False
row2,False,False,False
row3,False,False,True


## 요소의 True/False 확인 (all / any)

In [22]:
# Sample frame for the all / any examples. Besides plain booleans it
# contains a missing value (col4), an empty string (col5) and a 0 (col6).
N, T, F = pd.NA, True, False
idx = ['row1', 'row2', 'row3', 'row4']
data = {
    'col1': [T] * 4,
    'col2': [F] * 4,
    'col3': [F, T, T, T],
    'col4': [T, N, T, T],
    'col5': [T, T, '', T],
    'col6': [T, T, T, 0],
}
df = pd.DataFrame(data, index=idx)
df

Unnamed: 0,col1,col2,col3,col4,col5,col6
row1,True,False,False,True,True,True
row2,True,False,True,,True,True
row3,True,False,True,True,,True
row4,True,False,True,True,True,0


In [23]:
# Column-wise check that every element is truthy. Expected result:
df.all()

# col1     True # all True
# col2    False # all False
# col3    False # one False (the rest True)
# col4     True # contains <NA> (the rest True)
# col5    False # contains an empty string (the rest True)
# col6    False # contains 0 (the rest True)
# dtype: bool

col1     True
col2    False
col3    False
col4     True
col5    False
col6    False
dtype: bool

In [24]:
# Column-wise check for at least one truthy element; only col2 (all False)
# comes out False.
df.any()

col1     True
col2    False
col3     True
col4     True
col5     True
col6     True
dtype: bool

In [25]:
# Row-wise version: every row holds at least one truthy value.
df.any(axis=1)

row1    True
row2    True
row3    True
row4    True
dtype: bool

In [26]:
# bool_only=True restricts the check to boolean columns; only col1-col3
# appear in the output below.
df.all(bool_only=True)

col1     True
col2    False
col3    False
dtype: bool

In [27]:
# Explicit skipna=True: the output matches the plain df.all() result above.
df.all(skipna=True)

col1     True
col2    False
col3    False
col4     True
col5    False
col6    False
dtype: bool

In [28]:
df.all(skipna=False) # the column containing the missing value (col4) is absent from the result shown below

  df.all(skipna=False)


col1     True
col2    False
col3    False
col5    False
col6    False
dtype: bool

## 결측값이 아닌 요소의 수 (count)

In [29]:
# Sample frame for the count examples; N marks the missing cells.
N, T, F = pd.NA, True, False
idx = ['row1', 'row2', 'row3', 'row4']
data = {
    'col1': [1, N, N, 4.0],
    'col2': ['A', 'B', 'C', N],
    'col3': [N, N, N, 7],
    'col4': [1, 2.4, 3.6, 4],
}
df = pd.DataFrame(data, index=idx)
df

Unnamed: 0,col1,col2,col3,col4
row1,1.0,A,,1.0
row2,,B,,2.4
row3,,C,,3.6
row4,4.0,,7.0,4.0


In [30]:
# Number of non-missing values in each column.
df.count()

col1    2
col2    3
col3    1
col4    4
dtype: int64

In [31]:
# Number of non-missing values in each row.
df.count(axis=1)

row1    3
row2    2
row3    2
row4    3
dtype: int64

In [32]:
# numeric_only=True limits the count to numeric columns; only col4 appears
# in the output below.
df.count(numeric_only=True)

col4    4
dtype: int64

## 일치 여부 (equals)

In [36]:
# Three 2x2 frames for the equals examples. df1 and df2 are built from the
# same literal; df3 differs only in that its second row uses 3.0 instead of 3.
# N and T are the names bound in the earlier sample cells.
df1 = pd.DataFrame([[1, N], [3, T]])
df2 = pd.DataFrame([[1, N], [3, T]])
df3 = pd.DataFrame([[1, N], [3.0, T]])

In [34]:
# Display df1.
df1

Unnamed: 0,0,1
0,1,
1,3,True


In [35]:
# Display df2 (same contents as df1).
df2

Unnamed: 0,0,1
0,1,
1,3,True


In [37]:
# Display df3; column 0 shows as 1.0 / 3.0 rather than 1 / 3.
df3

Unnamed: 0,0,1
0,1.0,
1,3.0,True


In [38]:
# True: both frames were built from the same literal, and the missing values
# sitting in the same position do not break equality.
df1.equals(df2)

True

In [40]:
df2.equals(df3) # returns False when only the element type differs (3 vs 3.0)

False

## 1칸 객체의 bool 확인 (bool)

In [41]:
# 1x1 frame whose single cell is True.
df1 = pd.DataFrame(data=[True], index=['row'], columns=['col'])
df1

Unnamed: 0,col
row,True


In [42]:
# 1x1 frame whose single cell is False.
df2 = pd.DataFrame(data=[False], index=['row'], columns=['col'])
df2

Unnamed: 0,col
row,False


In [43]:
# Boolean value of the single cell in the 1x1 frame df1.
# NOTE(review): DataFrame.bool() is reported as deprecated in recent pandas
# releases (2.1+) - confirm against the installed version.
df1.bool()

True

In [44]:
# Boolean value of the single cell in the 1x1 frame df2.
# NOTE(review): DataFrame.bool() is reported as deprecated in recent pandas
# releases (2.1+) - confirm against the installed version.
df2.bool()

False

## 중복행 확인 (duplicated)