In [3]:
import numpy as np
import pandas as pd

In [4]:
# Four observations (rows a-d) of two variables; np.nan marks a missing entry.
data = [
    [1.4, np.nan],
    [7.1, -4.5],
    [np.nan, np.nan],
    [0.75, -1.3],
]
df = pd.DataFrame(data=data, index=['a', 'b', 'c', 'd'], columns=['one', 'two'])

In [5]:
# Display the frame; the missing entries (np.nan) show up as empty/NaN cells.
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [6]:
# Sum down the rows, i.e. one total per column ('index' is the named form of axis 0).
df.sum(axis='index')

one    9.25
two   -5.80
dtype: float64

In [7]:
# Sum across the columns, i.e. one total per row ('columns' is the named form of axis 1).
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

위에서 볼 수 있듯 기본적으로 NaN 값은 제외하고 계산한다(모든 값이 NaN인 c행의 합은 0.0이 된다).
NaN 값을 배제하지 않고 계산하려면 아래와 같이 `skipna=False`를 지정해 준다.

In [8]:
# Row totals again, but with NaN propagation: any row containing a NaN sums to NaN.
df.sum(skipna=False, axis='columns')

a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64

In [9]:
# Reductions can also be restricted to a single row or a single column.

df['one'].sum()
# Sum of one specific column ('one').

9.25

In [11]:
df.loc['a'].sum()
# Sum of one specific row (label 'a'); its NaN entry is skipped by default.

1.4

#### pandas에서 DataFrame에 적용되는 함수들
sum() 함수 이외에도 pandas에서 DataFrame에 적용되는 함수는 다음의 것들이 있다<br><br>
count: 전체 성분 중 NaN이 아닌 값의 개수 계산<br>
max,min: 전체 성분의 최대 최솟값을 계산<br>
argmin, argmax: 최솟값, 최댓값이 위치한 정수 위치(position)를 반환 (Series에서만 제공되며 DataFrame에는 없음)<br>
idxmin, idxmax: 최솟값, 최댓값이 위치한 인덱스 라벨(label)을 반환<br>
quantile: 0~1 사이로 지정한 분위수(quantile)에 해당하는 값 반환 (예: 0.25는 제1사분위수)<br>
sum: 전체 성분의 합<br>
mean: 전체 성분의 평균<br>
median: 전체 성분의 중간값 반환<br>
mad: 전체 성분의 평균값으로부터의 절대 편차(absolute deviation)의 평균을 계산 (pandas 2.0에서 제거되었으므로 `(x - x.mean()).abs().mean()`으로 대체)<br>
std, var: 전체 성분의 표준편차, 분산 계산<br>
cumsum: 맨 첫 번째 성분부터 각 성분까지의 **누적합** 계산(0부터 계속 더해짐)<br>
cumprod: 맨 첫 번째 성분부터 각 성분까지의 **누적곱** 계산(1부터 계속 곱해짐)

In [13]:
# 6x4 frame of standard-normal draws, indexed by six consecutive days
# starting on 2016-07-01.
# NOTE: no random seed is set, so the values differ on every run.
df2 = pd.DataFrame(
    np.random.randn(6, 4),
    index=pd.date_range('20160701', periods=6),
    columns=['A', 'B', 'C', 'D'],
)
df2

Unnamed: 0,A,B,C,D
2016-07-01,-2.416684,1.635784,-0.419866,0.065413
2016-07-02,-1.092334,-0.370611,-0.250897,2.285849
2016-07-03,-0.511805,-1.666009,0.694237,-1.148704
2016-07-04,-0.781373,-0.490335,0.227168,0.526725
2016-07-05,0.660624,0.447568,-0.209741,0.852217
2016-07-06,-0.505383,0.390815,-1.306933,1.262629


In [14]:
# Correlation coefficient between columns A and B.
df2['A'].corr(other=df2['B'])

-0.40570041425111103

In [15]:
# Covariance between columns B and C.
df2['B'].cov(other=df2['C'])

-0.49551990587524064

#### 정렬함수 및 기타함수

In [17]:
# np.random.permutation produces a randomly shuffled array of the row labels.
dates = df2.index
random_dates = np.random.permutation(dates)

# Rebuild the frame with the rows in the shuffled order and the columns as D, B, C, A.
# NOTE: df2 is rebound here, so re-running this cell shuffles the already shuffled frame.
df2 = df2.reindex(columns=['D', 'B', 'C', 'A'], index=random_dates)
df2

Unnamed: 0,D,B,C,A
2016-07-06,1.262629,0.390815,-1.306933,-0.505383
2016-07-01,0.065413,1.635784,-0.419866,-2.416684
2016-07-05,0.852217,0.447568,-0.209741,0.660624
2016-07-02,2.285849,-0.370611,-0.250897,-1.092334
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805
2016-07-04,0.526725,-0.490335,0.227168,-0.781373


In [21]:
# Both the row index and the column order are shuffled at this point.
# Sort the rows so that the index is in ascending order.

df2.sort_index(axis='index')

Unnamed: 0,D,B,C,A
2016-07-01,0.065413,1.635784,-0.419866,-2.416684
2016-07-02,2.285849,-0.370611,-0.250897,-1.092334
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805
2016-07-04,0.526725,-0.490335,0.227168,-0.781373
2016-07-05,0.852217,0.447568,-0.209741,0.660624
2016-07-06,1.262629,0.390815,-1.306933,-0.505383


In [22]:
# Sort by the column labels instead (row order is left as it is).

df2.sort_index(axis='columns')

Unnamed: 0,A,B,C,D
2016-07-06,-0.505383,0.390815,-1.306933,1.262629
2016-07-01,-2.416684,1.635784,-0.419866,0.065413
2016-07-05,0.660624,0.447568,-0.209741,0.852217
2016-07-02,-1.092334,-0.370611,-0.250897,2.285849
2016-07-03,-0.511805,-1.666009,0.694237,-1.148704
2016-07-04,-0.781373,-0.490335,0.227168,0.526725


In [23]:
# Sort the rows by index in descending order.
# `ascending` means smallest-first; passing False reverses that to largest-first.

df2.sort_index(ascending=False, axis='index')

Unnamed: 0,D,B,C,A
2016-07-06,1.262629,0.390815,-1.306933,-0.505383
2016-07-05,0.852217,0.447568,-0.209741,0.660624
2016-07-04,0.526725,-0.490335,0.227168,-0.781373
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805
2016-07-02,2.285849,-0.370611,-0.250897,-1.092334
2016-07-01,0.065413,1.635784,-0.419866,-2.416684


In [24]:
# Sort by values rather than by labels:
# order the rows so that column D goes from smallest to largest.

df2.sort_values("D", ascending=True)

Unnamed: 0,D,B,C,A
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805
2016-07-01,0.065413,1.635784,-0.419866,-2.416684
2016-07-04,0.526725,-0.490335,0.227168,-0.781373
2016-07-05,0.852217,0.447568,-0.209741,0.660624
2016-07-06,1.262629,0.390815,-1.306933,-0.505383
2016-07-02,2.285849,-0.370611,-0.250897,-1.092334


In [25]:
# Order the rows so that column B goes from largest to smallest.

df2.sort_values('B', ascending=False)

Unnamed: 0,D,B,C,A
2016-07-01,0.065413,1.635784,-0.419866,-2.416684
2016-07-05,0.852217,0.447568,-0.209741,0.660624
2016-07-06,1.262629,0.390815,-1.306933,-0.505383
2016-07-02,2.285849,-0.370611,-0.250897,-1.092334
2016-07-04,0.526725,-0.490335,0.227168,-0.781373
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805


In [26]:
# E: six random integers in 0..5 (the upper bound 6 is exclusive).
# F: a fixed categorical label per row.
# NOTE: both assignments mutate df2 in place, and E is unseeded, so it changes on each run.
df2['E'] = np.random.randint(low=0, high=6, size=6)
df2['F'] = [
    'alpha', 'beta', 'gamma',
    'gamma', 'alpha', 'gamma',
]
df2

Unnamed: 0,D,B,C,A,E,F
2016-07-06,1.262629,0.390815,-1.306933,-0.505383,4,alpha
2016-07-01,0.065413,1.635784,-0.419866,-2.416684,4,beta
2016-07-05,0.852217,0.447568,-0.209741,0.660624,3,gamma
2016-07-02,2.285849,-0.370611,-0.250897,-1.092334,0,gamma
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805,2,alpha
2016-07-04,0.526725,-0.490335,0.227168,-0.781373,5,gamma


In [28]:
# Ascending sort on two keys: E first, with ties in E broken by F.
df2.sort_values(['E', 'F'], ascending=True)

Unnamed: 0,D,B,C,A,E,F
2016-07-02,2.285849,-0.370611,-0.250897,-1.092334,0,gamma
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805,2,alpha
2016-07-05,0.852217,0.447568,-0.209741,0.660624,3,gamma
2016-07-06,1.262629,0.390815,-1.306933,-0.505383,4,alpha
2016-07-01,0.065413,1.635784,-0.419866,-2.416684,4,beta
2016-07-04,0.526725,-0.490335,0.227168,-0.781373,5,gamma


In [29]:
# unique: the distinct values of the selected row or column, with duplicates removed.
df2['F'].unique()

array(['alpha', 'beta', 'gamma'], dtype=object)

In [32]:
# value_counts: how many times each value occurs in the selected row or column.
df2['F'].value_counts()

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [33]:
# isin: element-wise membership test, True where the entry is one of the given values.
df2['F'].isin(values=['alpha', 'beta'])

2016-07-06     True
2016-07-01     True
2016-07-05    False
2016-07-02    False
2016-07-03     True
2016-07-04    False
Name: F, dtype: bool

In [34]:
# Keep every row whose F value is 'alpha' or 'beta'.
# .loc accepts a boolean mask and returns the rows where the mask is True
# (all columns are kept when no column selector is given).

df2.loc[df2['F'].isin(['alpha', 'beta'])]

Unnamed: 0,D,B,C,A,E,F
2016-07-06,1.262629,0.390815,-1.306933,-0.505383,4,alpha
2016-07-01,0.065413,1.635784,-0.419866,-2.416684,4,beta
2016-07-03,-1.148704,-1.666009,0.694237,-0.511805,2,alpha


#### 사용자가 직접 만든 함수 적용하기

In [35]:
# 4x3 frame of standard-normal draws: one row per city, columns b, d and e.
# NOTE: no random seed is set, so the values differ on every run.
df3 = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Seoul', 'Incheon', 'Busan', 'Ulsan'],
    columns=['b', 'd', 'e'],
)
df3

Unnamed: 0,b,d,e
Seoul,-0.631004,-0.415708,1.088427
Incheon,1.399068,-1.13004,-0.11673
Busan,1.172673,0.238272,-2.754068
Ulsan,0.401386,-0.813904,-0.460921


In [36]:
def func(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    Written as a ``def`` instead of a lambda bound to a name, so the function
    has a real ``__name__`` in tracebacks and reprs and can carry this docstring.

    Parameters
    ----------
    x : any object exposing ``max()`` and ``min()``
        In this notebook it is called through ``DataFrame.apply``.

    Returns
    -------
    The result of ``x.max() - x.min()``.
    """
    return x.max() - x.min()

In [38]:
# Apply the user-defined function along axis 0 ('index'): one result per column.
df3.apply(func, axis='index')

b    2.030072
d    1.368312
e    3.842495
dtype: float64