In [1]:
import numpy as np
import pandas as pd


In [3]:
# Build a small example frame for the sorting demos; col2 holds one NaN.
print('DataFrame')

df = pd.DataFrame(
    data={
        'col1': [2, 1, 9, 8, 7, 4],
        'col2': ['a', 'a', 'b', np.nan, 'd', 'c'],
        'col3': [0, 1, 9, 4, 2, 3],
    }
)

df

DataFrame


Unnamed: 0,col1,col2,col3
0,2,a,0
1,1,a,1
2,9,b,9
3,8,,4
4,7,d,2
5,4,c,3


In [5]:
# Sorting

# 1. Sort the rows by col1 in ascending order.
#    ascending=True is the default of sort_values, so it is omitted here.

df.sort_values(by='col1')

Unnamed: 0,col1,col2,col3
1,1,a,1
0,2,a,0
5,4,c,3
4,7,d,2
3,8,,4
2,9,b,9


In [6]:
df.sort_values(by='col1', ascending=False)  # descending order

Unnamed: 0,col1,col2,col3
2,9,b,9
3,8,,4
4,7,d,2
5,4,c,3
0,2,a,0
1,1,a,1


In [7]:
# Sort by col2 ascending first, then by col1 descending within equal col2 values.
df.sort_values(by=['col2', 'col1'], ascending=[True, False])

Unnamed: 0,col1,col2,col3
0,2,a,0
1,1,a,1
2,9,b,9
5,4,c,3
4,7,d,2
3,8,,4


집계함수

In [23]:
# Raw score lists used for the aggregation demos; 'math' has one missing value.
data = {'korea': [50, 60, 70], 'math': [10, np.nan, 40]}
data

{'korea': [50, 60, 70], 'math': [10, nan, 40]}

In [24]:
# Preview the frame built from `data`, labelling the rows 'a', 'b', 'c'.
pd.DataFrame(data=data, index=['a', 'b', 'c'])

Unnamed: 0,korea,math
a,50,10.0
b,60,
c,70,40.0


In [25]:
# Rebind df to the score frame (rows labelled 'a', 'b', 'c') and display it.
df = pd.DataFrame(data=data, index=['a', 'b', 'c'])
df

Unnamed: 0,korea,math
a,50,10.0
b,60,
c,70,40.0


In [26]:
# Number of non-missing values in each column (axis=0 is the default).

df.count(axis=0)

korea    3
math     2
dtype: int64

In [27]:
# Number of non-missing values in each row.

df.count(axis='columns')

a    2
b    1
c    2
dtype: int64

In [28]:
# Maximum value of each column.

df.max(axis=0)

korea    70.0
math     40.0
dtype: float64

In [29]:
# Minimum value of each column.

df.min(axis=0)


korea    50.0
math     10.0
dtype: float64

In [30]:
# Sum of each column.

df.sum(axis=0)

korea    180.0
math      50.0
dtype: float64

In [31]:
# Replace NaN values in 'math' with the column minimum.
# fillna returns a new Series here; df is not modified.

df['math'].fillna(value=df['math'].min())


a    10.0
b    10.0
c    40.0
Name: math, dtype: float64

In [32]:
# Independent (deep) copy of df.
df_copy = df.copy(deep=True)

In [33]:
# Store the min-filled 'math' column in the copy and display the result.
df_copy['math'] = df['math'].fillna(value=df['math'].min())
df_copy

Unnamed: 0,korea,math
a,50,10.0
b,60,10.0
c,70,40.0


In [35]:
# Replace NaN values in 'math' with the column mean.
df_copy2 = df.copy()

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df_copy2['math']. The in-place call operates on an intermediate Series
# (chained assignment): pandas 2.2 emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves df_copy2 unchanged.
df_copy2['math'] = df_copy2['math'].fillna(df_copy2['math'].mean())
df_copy2

Unnamed: 0,korea,math
a,50,10.0
b,60,25.0
c,70,40.0


그룹으로 묶기

In [37]:
# Example frame for the groupby demos: a repeating 'key' plus two numeric columns.
df = pd.DataFrame(
    data={
        'key': ['a', 'b', 'c', 'a', 'b', 'c'],
        'data1': [1, 2, 3, 1, 2, 3],
        'data2': [4, 4, 6, 0, 6, 1],
    }
)


In [38]:
# Print a heading (with a trailing blank line), then show df as the cell output.
print('DataFrame\n')
df

DataFrame



Unnamed: 0,key,data1,data2
0,a,1,4
1,b,2,4
2,c,3,6
3,a,1,0
4,b,2,6
5,c,3,1


In [39]:
# groupby: sum the numeric columns within each 'key' group.

df.groupby(by='key').sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,4
b,4,10
c,6,7


In [41]:
# Group by the (key, data1) pair and sum the remaining column.
df.groupby(by=['key', 'data1']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key,data1,Unnamed: 2_level_1
a,1,4
b,2,10
c,3,7


In [42]:
# Group by the (key, data2) pair and sum the remaining column.
df.groupby(by=['key', 'data2']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1
key,data2,Unnamed: 2_level_1
a,0,1
a,4,1
b,4,2
b,6,2
c,1,3
c,6,3


In [43]:
# Compute summary statistics with aggregate.

# Group by 'key', then output the min, median and max of data1 and data2.
# The reducers are given as string aliases: passing the builtins min/max or
# np.median to aggregate is deprecated since pandas 2.1 (FutureWarning), and
# the string form produces the same 'min' / 'median' / 'max' column labels.

df.groupby('key').aggregate(['min', 'median', 'max'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,1,1.0,1,0,2.0,4
b,2,2.0,2,4,5.0,6
c,3,3.0,3,1,3.5,6


In [48]:
# Group the frame by 'key'; output the minimum of data1 and the sum of data2.
# String aliases are used because passing the builtins min/sum to aggregate
# is deprecated since pandas 2.1 (FutureWarning).

df.groupby('key').aggregate({'data1': 'min', 'data2': 'sum'})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,4
b,2,10
c,3,7
