## 07. 정렬

## 참고자료
* [Python 완전정복 시리즈] 2편 : Pandas DataFrame 완전정복 : https://wikidocs.net/book/7188

In [2]:
import pandas as pd
import numpy as np

## 값 기준 정렬

In [3]:
# Build a small demo frame with one missing value in a numeric column (col1)
# and one in a string column (col2); col3 is fully populated integers.
# np.nan is used instead of np.NaN: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
na = np.nan
data = [
    [-3, 'A', 17],
    [na, 'D', 31],
    [7, 'D', -8],
    [15, 'Z', 3],
    [0, na, -7],
]
col = ['col1', 'col2', 'col3']
row = ['row1', 'row2', 'row3', 'row4', 'row5']
df = pd.DataFrame(data=data, index=row, columns=col)
df

Unnamed: 0,col1,col2,col3
row1,-3.0,A,17
row2,,D,31
row3,7.0,D,-8
row4,15.0,Z,3
row5,0.0,,-7


In [4]:
# Sort the rows by the values in 'col3'; the order is ascending by default.
df.sort_values(by='col3')

Unnamed: 0,col1,col2,col3
row3,7.0,D,-8
row5,0.0,,-7
row4,15.0,Z,3
row1,-3.0,A,17
row2,,D,31


In [5]:
# Sort by 'col2' first and break ties (the two 'D' rows) with 'col3'.
# The row whose 'col2' is NaN ends up last.
df.sort_values(by=['col2', 'col3'])

Unnamed: 0,col1,col2,col3
row1,-3.0,A,17
row3,7.0,D,-8
row2,,D,31
row4,15.0,Z,3
row5,0.0,,-7


In [7]:
# The axis argument selects which axis is reordered.
# axis=0 reorders the rows using the values found in column 'col3'.

df.sort_values(by='col3', axis=0)

Unnamed: 0,col1,col2,col3
row3,7.0,D,-8
row5,0.0,,-7
row4,15.0,Z,3
row1,-3.0,A,17
row2,,D,31


In [8]:
# axis=1 reorders the columns using the values found in row 'row1'.
# That row mixes numbers with a string, so the comparison raises TypeError.

df.sort_values(by='row1', axis=1)

TypeError: '>' not supported between instances of 'numpy.ndarray' and 'str'

In [9]:
# axis=1 reorders the columns using the values found in row 'row5'.
# That row holds only numbers and NaN (0, NaN, -7), so the sort succeeds;
# the NaN column ('col2') is placed last.

df.sort_values(by='row5', axis=1)

Unnamed: 0,col3,col1,col2
row1,17,-3.0,A
row2,31,,D
row3,-8,7.0,D
row4,3,15.0,Z
row5,-7,0.0,


In [11]:
# The ascending argument switches between ascending and descending order.
# default: True (ascending); False sorts from largest to smallest.

df.sort_values(by='col3', ascending=False)

Unnamed: 0,col1,col2,col3
row2,,D,31
row1,-3.0,A,17
row4,15.0,Z,3
row5,0.0,,-7
row3,7.0,D,-8


In [12]:
# na_position controls where missing values are placed.
# 'last' puts the NaN row at the end of the result.
df.sort_values(by='col1', na_position='last')

Unnamed: 0,col1,col2,col3
row1,-3.0,A,17
row5,0.0,,-7
row3,7.0,D,-8
row4,15.0,Z,3
row2,,D,31


In [14]:
# na_position='first' puts the NaN row at the top instead of the end.
df.sort_values(by='col1', na_position='first')

Unnamed: 0,col1,col2,col3
row2,,D,31
row1,-3.0,A,17
row5,0.0,,-7
row3,7.0,D,-8
row4,15.0,Z,3


In [15]:
# ignore_index=True drops the original row labels; the sorted result is
# relabelled 0, 1, ..., n-1.
df.sort_values(by='col3', ignore_index=True)

Unnamed: 0,col1,col2,col3
0,7.0,D,-8
1,0.0,,-7
2,15.0,Z,3
3,-3.0,A,17
4,,D,31


In [17]:
# key is applied to the column before sorting; lower-casing the strings makes
# the ordering case-insensitive. The NaN entry in 'col2' still sorts last.
df.sort_values(by='col2', key=lambda col: col.str.lower())

Unnamed: 0,col1,col2,col3
row1,-3.0,A,17
row2,,D,31
row3,7.0,D,-8
row4,15.0,Z,3
row5,0.0,,-7


In [18]:
# inplace=True sorts df itself rather than producing a new frame, so df is
# displayed on the next line to show the result.
df.sort_values(by='col3', inplace=True)
df

Unnamed: 0,col1,col2,col3
row3,7.0,D,-8
row5,0.0,,-7
row4,15.0,Z,3
row1,-3.0,A,17
row2,,D,31


## 인덱스 기준 정렬

##### multi index에서 na_position, ignore_index가 작동하지 않는 것 외에 sort_values와 비슷

In [19]:
# Build a demo frame with a two-level row index (outer 'rowN', inner 'valN')
# whose labels are deliberately out of order, plus two missing values.
# np.nan is used instead of np.NaN: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
na = np.nan
index_tuples = [
    ('row1', 'val1'),
    ('row1', 'val2'),
    ('row3', 'val3'),
    ('row3', 'val1'),
    ('row3', 'val2'),
    ('row2', 'val5'),
    ('row2', 'val2'),
]
values = [
    [1, 2, 3],
    [4, na, 6],
    [7, 8, 9],
    [na, 11, 12],
    [13, 14, 15],
    [16, 17, 18],
    [19, 20, 21],
]
index = pd.MultiIndex.from_tuples(index_tuples)  # build the row index
df = pd.DataFrame(values, columns=['col4', 'col1', 'col2'], index=index)
df

Unnamed: 0,Unnamed: 1,col4,col1,col2
row1,val1,1.0,2.0,3
row1,val2,4.0,,6
row3,val3,7.0,8.0,9
row3,val1,,11.0,12
row3,val2,13.0,14.0,15
row2,val5,16.0,17.0,18
row2,val2,19.0,20.0,21


In [21]:
# Passing level sorts by that level of the MultiIndex.
# 0: the outer 'rowN' labels, 1: the inner 'valN' labels.
# The inner labels also come out ordered within each outer label
# (sort_remaining defaults to True per the pandas docs).

df.sort_index(axis=0, level=0)

Unnamed: 0,Unnamed: 1,col4,col1,col2
row1,val1,1.0,2.0,3
row1,val2,4.0,,6
row2,val2,19.0,20.0,21
row2,val5,16.0,17.0,18
row3,val1,,11.0,12
row3,val2,13.0,14.0,15
row3,val3,7.0,8.0,9


In [22]:
# Sort by the inner 'valN' labels (level 1); rows sharing the same inner
# label are then ordered by the outer 'rowN' label.
df.sort_index(axis=0, level=1)

Unnamed: 0,Unnamed: 1,col4,col1,col2
row1,val1,1.0,2.0,3
row3,val1,,11.0,12
row1,val2,4.0,,6
row2,val2,19.0,20.0,21
row3,val2,13.0,14.0,15
row3,val3,7.0,8.0,9
row2,val5,16.0,17.0,18


In [23]:
# A separate ascending flag can be given per level: level 1 ('valN') is
# sorted descending, and ties are ordered by level 0 ('rowN') ascending.
df.sort_index(axis=0, level=[1,0], ascending=[False,True])

Unnamed: 0,Unnamed: 1,col4,col1,col2
row2,val5,16.0,17.0,18
row3,val3,7.0,8.0,9
row1,val2,4.0,,6
row2,val2,19.0,20.0,21
row3,val2,13.0,14.0,15
row1,val1,1.0,2.0,3
row3,val1,,11.0,12


In [24]:
# Using the sort_remaining argument.
# NOTE(review): per the pandas docs sort_remaining only takes effect when a
# level is specified, and True is its default; with no level given here the
# whole MultiIndex is sorted, outer labels first and then inner labels.

df.sort_index(axis=0, sort_remaining=True)

Unnamed: 0,Unnamed: 1,col4,col1,col2
row1,val1,1.0,2.0,3
row1,val2,4.0,,6
row2,val2,19.0,20.0,21
row2,val5,16.0,17.0,18
row3,val1,,11.0,12
row3,val2,13.0,14.0,15
row3,val3,7.0,8.0,9


## 정렬 후 추출


In [25]:
# All-integer demo frame for nlargest. The row labels are intentionally
# out of order, and 'col1' contains a tie (the value 2 appears in both
# 'row5' and 'row1').
row = ['row3', 'row5', 'row1', 'row4', 'row2']
col = ['col1', 'col2', 'col3']
data = [
    [1, 21, 7],
    [2, 33, 3],
    [2, 7, 97],
    [4, 56, 31],
    [5, 18, 5],
]
df = pd.DataFrame(data, index=row, columns=col)
df

Unnamed: 0,col1,col2,col3
row3,1,21,7
row5,2,33,3
row1,2,7,97
row4,4,56,31
row2,5,18,5


In [27]:
# keep decides which row is returned when values tie.
# 'row5' and 'row1' both have col1 == 2; 'first' keeps 'row5', the one that
# appears earlier in df.

df.nlargest(n=3, columns='col1', keep='first')

Unnamed: 0,col1,col2,col3
row2,5,18,5
row4,4,56,31
row5,2,33,3


In [28]:
# keep='last' resolves the col1 == 2 tie in favour of 'row1', the tied row
# that appears later in df.
df.nlargest(n=3, columns='col1', keep='last')

Unnamed: 0,col1,col2,col3
row2,5,18,5
row4,4,56,31
row1,2,7,97


In [29]:
# keep='all' returns every tied row, so four rows come back even though n=3.
df.nlargest(n=3, columns='col1', keep='all')

Unnamed: 0,col1,col2,col3
row2,5,18,5
row4,4,56,31
row5,2,33,3
row1,2,7,97


In [31]:
# Rank on several columns at once: 'col1' first, then 'col3' for ties.

df.nlargest(n=3, columns=['col1', 'col3'])
# 'row1' and 'row5' tie on col1; 'row1' is returned because its col3 value
# (97) is larger than that of 'row5' (3).

Unnamed: 0,col1,col2,col3
row2,5,18,5
row4,4,56,31
row1,2,7,97
