In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
data = [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75,-1.3]]

In [4]:
df = pd.DataFrame(data, columns=['one', 'two'], index=['a', 'b', 'c', 'd'])

In [5]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [6]:
df.sum(axis=0)

one    9.25
two   -5.80
dtype: float64

In [7]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [8]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [9]:
df['one'].sum()

9.25

In [10]:
df.loc['d'].sum()

-0.55

In [13]:
df.mean(axis=1, skipna='False')

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [14]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [15]:
one_mean = df.mean(axis=0)['one']

In [16]:
one_mean

3.0833333333333335

In [17]:
two_mean = df.mean(axis=0)['two']

In [18]:
two_mean

-2.9

In [20]:
df['one'] = df['one'].fillna(value=one_mean)

In [21]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,3.083333,
d,0.75,-1.3


In [22]:
df['two'] = df['two'].fillna(value=two_mean)

In [23]:
df

Unnamed: 0,one,two
a,1.4,-2.9
b,7.1,-4.5
c,3.083333,-2.9
d,0.75,-1.3


In [24]:
df2 = pd.DataFrame(np.random.randn(6,4), columns=['A', 'B','C', 'D'],
                  index=pd.date_range('20190101', periods=6))

In [25]:
df2

Unnamed: 0,A,B,C,D
2019-01-01,0.565362,-0.27086,1.993806,-2.710843
2019-01-02,-0.098093,1.7705,-0.013342,-0.041578
2019-01-03,0.477981,1.033943,0.143029,0.662872
2019-01-04,0.681722,-0.60158,0.636263,0.651021
2019-01-05,0.469159,-0.26152,-0.459033,0.073569
2019-01-06,-0.105936,-0.118736,0.385042,0.200631


In [29]:
df2['A'].corr(df2['B'])

-0.5417201669545558

In [31]:
df2['B'].cov(df2['C'])

-0.28376395976104724

In [33]:
df2.corr()

Unnamed: 0,A,B,C,D
A,1.0,-0.54172,0.332798,-0.168699
B,-0.54172,1.0,-0.361916,0.205808
C,0.332798,-0.361916,1.0,-0.81009
D,-0.168699,0.205808,-0.81009,1.0


In [34]:
dates = df2.index
random_dates = np.random.permutation(dates)
df2 = df2.reindex(index=random_dates, columns=['D', 'B', 'C', 'A'])
#기존 값을 랜덤하게 섞는 방법

In [37]:
df2

Unnamed: 0,D,B,C,A
2019-01-06,0.200631,-0.118736,0.385042,-0.105936
2019-01-02,-0.041578,1.7705,-0.013342,-0.098093
2019-01-05,0.073569,-0.26152,-0.459033,0.469159
2019-01-03,0.662872,1.033943,0.143029,0.477981
2019-01-04,0.651021,-0.60158,0.636263,0.681722
2019-01-01,-2.710843,-0.27086,1.993806,0.565362


In [38]:
#정렬되지 않은 '정렬' 데이터를 다시 정렬
df2.sort_index(axis=0) #행 기준으로 

Unnamed: 0,D,B,C,A
2019-01-01,-2.710843,-0.27086,1.993806,0.565362
2019-01-02,-0.041578,1.7705,-0.013342,-0.098093
2019-01-03,0.662872,1.033943,0.143029,0.477981
2019-01-04,0.651021,-0.60158,0.636263,0.681722
2019-01-05,0.073569,-0.26152,-0.459033,0.469159
2019-01-06,0.200631,-0.118736,0.385042,-0.105936


In [39]:
df2.sort_index(axis=1) #열기준으로, 오름차순

Unnamed: 0,A,B,C,D
2019-01-06,-0.105936,-0.118736,0.385042,0.200631
2019-01-02,-0.098093,1.7705,-0.013342,-0.041578
2019-01-05,0.469159,-0.26152,-0.459033,0.073569
2019-01-03,0.477981,1.033943,0.143029,0.662872
2019-01-04,0.681722,-0.60158,0.636263,0.651021
2019-01-01,0.565362,-0.27086,1.993806,-2.710843


In [42]:
#내림차순 정렬
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,D,B,C,A
2019-01-06,0.200631,-0.118736,0.385042,-0.105936
2019-01-05,0.073569,-0.26152,-0.459033,0.469159
2019-01-04,0.651021,-0.60158,0.636263,0.681722
2019-01-03,0.662872,1.033943,0.143029,0.477981
2019-01-02,-0.041578,1.7705,-0.013342,-0.098093
2019-01-01,-2.710843,-0.27086,1.993806,0.565362


In [43]:
#특정열 기준으로 정렬
df2.sort_values(by='D')

Unnamed: 0,D,B,C,A
2019-01-01,-2.710843,-0.27086,1.993806,0.565362
2019-01-02,-0.041578,1.7705,-0.013342,-0.098093
2019-01-05,0.073569,-0.26152,-0.459033,0.469159
2019-01-06,0.200631,-0.118736,0.385042,-0.105936
2019-01-04,0.651021,-0.60158,0.636263,0.681722
2019-01-03,0.662872,1.033943,0.143029,0.477981


In [44]:
df2['E'] = np.random.randint(0,6, size=6)
df2['F'] = ['alpha', 'beta', 'gamma', 'gamma', 'alpha', 'gamma']

In [45]:
df2

Unnamed: 0,D,B,C,A,E,F
2019-01-06,0.200631,-0.118736,0.385042,-0.105936,2,alpha
2019-01-02,-0.041578,1.7705,-0.013342,-0.098093,4,beta
2019-01-05,0.073569,-0.26152,-0.459033,0.469159,3,gamma
2019-01-03,0.662872,1.033943,0.143029,0.477981,4,gamma
2019-01-04,0.651021,-0.60158,0.636263,0.681722,4,alpha
2019-01-01,-2.710843,-0.27086,1.993806,0.565362,1,gamma


In [46]:
df2.sort_values(by=['E', 'F'])

Unnamed: 0,D,B,C,A,E,F
2019-01-01,-2.710843,-0.27086,1.993806,0.565362,1,gamma
2019-01-06,0.200631,-0.118736,0.385042,-0.105936,2,alpha
2019-01-05,0.073569,-0.26152,-0.459033,0.469159,3,gamma
2019-01-04,0.651021,-0.60158,0.636263,0.681722,4,alpha
2019-01-02,-0.041578,1.7705,-0.013342,-0.098093,4,beta
2019-01-03,0.662872,1.033943,0.143029,0.477981,4,gamma


df2['F'].unique() #중복없는 값

In [48]:
df2['F'].value_counts() #값에 따른 갯수 

gamma    3
alpha    2
beta     1
Name: F, dtype: int64

In [49]:
df2['F'].isin(['alpha', 'beta']) #특정한 값 존재여부

2019-01-06     True
2019-01-02     True
2019-01-05    False
2019-01-03    False
2019-01-04     True
2019-01-01    False
Name: F, dtype: bool

In [50]:
df2.loc[df2['F'].isin(['alpha', 'beta']), :] #특정한 값이 있는 행 전체를 추출

Unnamed: 0,D,B,C,A,E,F
2019-01-06,0.200631,-0.118736,0.385042,-0.105936,2,alpha
2019-01-02,-0.041578,1.7705,-0.013342,-0.098093,4,beta
2019-01-04,0.651021,-0.60158,0.636263,0.681722,4,alpha


## 사용자가 직접정의 한 함수를 적용하여 데이터 추출

In [52]:
df3 = pd.DataFrame(np.random.randn(4,3), columns=['b','d', 'e'], index=['seoul', 'incheon', 'busan', 'daegu'])

In [56]:
df3

Unnamed: 0,b,d,e
seoul,-1.781717,0.476224,-0.327177
incheon,-0.956613,0.322723,0.041688
busan,1.789346,-0.754608,-0.075721
daegu,0.477799,0.414553,-1.29196


#### 특정 함수 생성

In [54]:
func = lambda x : x.max()-x.min() 

> 함수 적용(apply)

In [55]:
df3.apply(func, axis=0) #열값

b    3.571063
d    1.230832
e    1.333648
dtype: float64

In [57]:
df3.apply(func, axis=1) #행값

seoul      2.257940
incheon    1.279336
busan      2.543954
daegu      1.769759
dtype: float64