# 1. Overview

- 구조화된 데이터의 처리를 지원하는 파이썬 라이브러리
- 고성능 array 계산 라이브러리인 numpy와 통합하여 강력한 "스프레드시트" 처리 기능 제공
- 인덱싱, 연산용 함수, 전처리 함수 등을 제공함
- R의 **dataframe** 개념
---

# 2. Series

## Pandas의 구성

- Series: DataFrame 중 하나의 column에 해당하는 데이터 모음 Object
- DataFrame: Data Table 전체를 포함하는 Object 


### Series
- Column vector를 표현하는 object
- Backed by a numpy.ndarray (since pandas 0.13 a Series wraps an ndarray rather than subclassing it)
- Data: any type
- Index labels need not be ordered
- Duplicate index labels are possible but result in reduced functionality
- (index가 추가된 numpy)

In [9]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [4]:
# A Series is laid out as index | data.
# The index is the label used to look up each individual record,
# and it is also the key that data is joined on.
list_data = list(range(1, 6))
example_obj = Series(list_data)
example_obj

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
# series 이름 설정
list_data = [1,2,3,4,5]
list_name = ['a', 'b', 'c', 'd', 'e']
example_obj = Series(data = list_data, index = list_name)
example_obj

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [11]:
#이름도 설정 가능
dict_data = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
example_obj = Series(dict_data, dtype=np.float32, name="example_data")
example_obj

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: example_data, dtype: float32

In [13]:
# data index 접근 - key value처럼
example_obj['a']

1.0

In [15]:
# data index 값 할당 가능
example_obj['a'] = 5
example_obj['a']

5.0

In [16]:
# 값만
example_obj.values

array([5., 2., 3., 4., 5.], dtype=float32)

In [18]:
#index
example_obj.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [19]:
# metadata
example_obj.name = 'number'
example_obj.index.name = 'alphabet'
example_obj

alphabet
a    5.0
b    2.0
c    3.0
d    4.0
e    5.0
Name: number, dtype: float32

In [20]:
# index 기준 series 생성
dict_data = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5}
indexes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
series_obj_1 = Series(dict_data, index=indexes)
series_obj_1

a    1.0
b    2.0
c    3.0
d    4.0
e    5.0
f    NaN
g    NaN
h    NaN
dtype: float64

In [22]:
series_obj_1[series_obj_1>3]

d    4.0
e    5.0
dtype: float64

In [24]:
# broadcasting 가능
series_obj_1 * 2

a     2.0
b     4.0
c     6.0
d     8.0
e    10.0
f     NaN
g     NaN
h     NaN
dtype: float64

---
# 3. DataFrame Overview

## DataFrame
- NumPy array-like
- **Each column can have a different type** 
- Row and column index
- Size mutable: insert and delete columns


- Series를 모아서 만든 Data Table = 2 Dimension

In [25]:
from pandas import Series ,DataFrame
import pandas as pd
import numpy as np

In [28]:
# 보통 csv file 로드하는 방식일 것
raw_data = {
    'first_name' : ['Jason','Molly', 'Tina', 'Jake', 'Amy'],
    'last_name' : ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    'age' : [42, 52, 36, 24, 73],
    'city' : ['San Francisco', 'Baltimore', 'Miami','Douglas','Boston']
}
df = pd.DataFrame(raw_data, columns = ['first_name','last_name','age','city'])
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [29]:
DataFrame(raw_data, columns=['age','city'])

Unnamed: 0,age,city
0,42,San Francisco
1,52,Baltimore
2,36,Miami
3,24,Douglas
4,73,Boston


In [30]:
DataFrame(raw_data, columns=['first_name', 'last_name','age','city', 'debt'])

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [33]:
# column 선택 - Series 추출
df = DataFrame(raw_data, columns=['first_name', 'last_name','age','city', 'debt'])
df.first_name

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

In [34]:
df['first_name']

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: first_name, dtype: object

#### loc - index location(index 이름) / iloc - index position (index number)

In [35]:
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,
1,Molly,Jacobson,52,Baltimore,
2,Tina,Ali,36,Miami,
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


In [37]:
df.loc[1] # label-based lookup: the row whose index label is 1 (the second row here), returned as a Series

first_name        Molly
last_name      Jacobson
age                  52
city          Baltimore
debt                NaN
Name: 1, dtype: object

In [38]:
df['age'].iloc[1:]

1    52
2    36
3    24
4    73
Name: age, dtype: int64

In [42]:
s = pd.Series(np.nan, index=[49, 48, 47, 46, 45, 1, 2, 3, 4, 5])
s.loc[:3]

49   NaN
48   NaN
47   NaN
46   NaN
45   NaN
1    NaN
2    NaN
3    NaN
dtype: float64

In [43]:
s.iloc[:3]

49   NaN
48   NaN
47   NaN
dtype: float64



## Column에 새로운 데이터 할당

In [44]:
# Assign with bracket notation. Attribute-style assignment (df.debt = ...)
# only works because the 'debt' column already exists; on a new name it would
# set a plain Python attribute instead of creating a column.
df['debt'] = df['age'] > 40
df

Unnamed: 0,first_name,last_name,age,city,debt
0,Jason,Miller,42,San Francisco,True
1,Molly,Jacobson,52,Baltimore,True
2,Tina,Ali,36,Miami,False
3,Jake,Milner,24,Douglas,False
4,Amy,Cooze,73,Boston,True


In [46]:
df.T # Transpose

Unnamed: 0,0,1,2,3,4
first_name,Jason,Molly,Tina,Jake,Amy
last_name,Miller,Jacobson,Ali,Milner,Cooze
age,42,52,36,24,73
city,San Francisco,Baltimore,Miami,Douglas,Boston
debt,True,True,False,False,True


In [47]:
df.values

array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

In [48]:
df.keys  # NOTE: no call parentheses, so this displays the bound method itself; df.keys() would return the column labels

<bound method NDFrame.keys of   first_name last_name  age           city   debt
0      Jason    Miller   42  San Francisco   True
1      Molly  Jacobson   52      Baltimore   True
2       Tina       Ali   36          Miami  False
3       Jake    Milner   24        Douglas  False
4        Amy     Cooze   73         Boston   True>

In [49]:
df.to_csv()

',first_name,last_name,age,city,debt\r\n0,Jason,Miller,42,San Francisco,True\r\n1,Molly,Jacobson,52,Baltimore,True\r\n2,Tina,Ali,36,Miami,False\r\n3,Jake,Milner,24,Douglas,False\r\n4,Amy,Cooze,73,Boston,True\r\n'

In [50]:
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# to_numpy() returns the same NumPy array of the frame's values.
df.to_numpy()

  """Entry point for launching an IPython kernel.


array([['Jason', 'Miller', 42, 'San Francisco', True],
       ['Molly', 'Jacobson', 52, 'Baltimore', True],
       ['Tina', 'Ali', 36, 'Miami', False],
       ['Jake', 'Milner', 24, 'Douglas', False],
       ['Amy', 'Cooze', 73, 'Boston', True]], dtype=object)

In [51]:
del df['debt']
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


In [52]:
# Nested Dict에서는 {'(column)':{(index):values, (index):values}}
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio' : {2000: 1.5, 2001: 1.7, 2002: 3.6}
}
DataFrame(pop)

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [56]:
values = Series(data=['M','F','F'], index=[0,1,2])
values

0    M
1    F
2    F
dtype: object

In [58]:
df['sex'] = values # column 추가
df

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,F
3,Jake,Milner,24,Douglas,
4,Amy,Cooze,73,Boston,


---
# 4. Selection & Drop

## Selection with column names

In [60]:
df['city'].head(3) # 한 개 column 선택 시 

0    San Francisco
1        Baltimore
2            Miami
Name: city, dtype: object

In [62]:
df[['city', 'age']].head(3) # 여러 개 column 선택

Unnamed: 0,city,age
0,San Francisco,42
1,Baltimore,52
2,Miami,36


## Selection with index number

In [63]:
df[:3]

Unnamed: 0,first_name,last_name,age,city,sex
0,Jason,Miller,42,San Francisco,M
1,Molly,Jacobson,52,Baltimore,F
2,Tina,Ali,36,Miami,F


In [65]:
df['first_name'][:2]

0    Jason
1    Molly
Name: first_name, dtype: object

## Series Selection

In [66]:
account_series = df['age']
account_series[:3]

0    42
1    52
2    36
Name: age, dtype: int64

In [68]:
account_series[[0,1,2]] #한 개 이상의 index

0    42
1    52
2    36
Name: age, dtype: int64

In [70]:
account_series[account_series<50]# Boolean index

0    42
2    36
3    24
Name: age, dtype: int64

## Index 변경

In [71]:
df.index = df['age']
del df['age']
df.head()

Unnamed: 0_level_0,first_name,last_name,city,sex
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
42,Jason,Miller,San Francisco,M
52,Molly,Jacobson,Baltimore,F
36,Tina,Ali,Miami,F
24,Jake,Milner,Douglas,
73,Amy,Cooze,Boston,


## Basic, loc, iloc selection

In [73]:
df[['first_name', 'last_name']][:2]

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


In [75]:
df.loc[[42, 36], ['city', 'sex']] # index name, column

Unnamed: 0_level_0,city,sex
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,San Francisco,M
36,Miami,F


In [76]:
df.iloc[:2, :2] # index number, column number

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson


In [78]:
df[['first_name', 'last_name']].iloc[:3]

Unnamed: 0_level_0,first_name,last_name
age,Unnamed: 1_level_1,Unnamed: 2_level_1
42,Jason,Miller
52,Molly,Jacobson
36,Tina,Ali


## Index 재설정

In [80]:
df.index = list(range(0,5))
df.head()

Unnamed: 0,first_name,last_name,city,sex
0,Jason,Miller,San Francisco,M
1,Molly,Jacobson,Baltimore,F
2,Tina,Ali,Miami,F
3,Jake,Milner,Douglas,
4,Amy,Cooze,Boston,


## Data Drop

In [81]:
df.drop(1) # index number

Unnamed: 0,first_name,last_name,city,sex
0,Jason,Miller,San Francisco,M
2,Tina,Ali,Miami,F
3,Jake,Milner,Douglas,
4,Amy,Cooze,Boston,


In [82]:
df.drop([0,2])

Unnamed: 0,first_name,last_name,city,sex
1,Molly,Jacobson,Baltimore,F
3,Jake,Milner,Douglas,
4,Amy,Cooze,Boston,


In [89]:
# column 중 'first_name' drop. default가 axis=0이니까 column vector drop 하려면 axis 지정 필요
df.drop('first_name', axis=1) 

Unnamed: 0,last_name,city,sex
0,Miller,San Francisco,M
1,Jacobson,Baltimore,F
2,Ali,Miami,F
3,Milner,Douglas,
4,Cooze,Boston,


In [87]:
df.drop(0, axis=0) # row 중 '0' drop

Unnamed: 0,first_name,last_name,city,sex
1,Molly,Jacobson,Baltimore,F
2,Tina,Ali,Miami,F
3,Jake,Milner,Douglas,
4,Amy,Cooze,Boston,


---
# 5. DataFrame operations

## Series Operation

- Index 기준으로 연산수행
- 겹치는 Index 없을 경우 NaN 반환

In [91]:
s1 = Series(range(1,6), index=list('abcde'))
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [93]:
s2 = Series(range(5,11), index=list('bcedef'))
s2

b     5
c     6
e     7
d     8
e     9
f    10
dtype: int64

In [94]:
s1.add(s2) # no fill_value here, so labels missing from one side stay NaN (same as s1 + s2); pass fill_value=0 to treat them as 0

a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

In [95]:
s1 + s2

a     NaN
b     7.0
c     9.0
d    12.0
e    12.0
e    14.0
f     NaN
dtype: float64

## Dataframe operation

- column, index 모두 고려
- add(..., fill_value=0)처럼 fill_value를 지정하면 한쪽에만 있는 값의 빈자리를 0으로 채워 연산 (지정하지 않으면 NaN)
- Operation types: add, sub, div, mul

In [96]:
df1 = DataFrame(np.arange(9).reshape(3,3), columns = list('abc'))
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [97]:
df2 = DataFrame(np.arange(16).reshape(4,4), columns = list('abcd'))
df2

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [98]:
df1+df2

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,
1,7.0,9.0,11.0,
2,14.0,16.0,18.0,
3,,,,


In [99]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d
0,0.0,2.0,4.0,3.0
1,7.0,9.0,11.0,7.0
2,14.0,16.0,18.0,11.0
3,12.0,13.0,14.0,15.0


## Series + Dataframe
- column 기준으로 broadcasting

In [100]:
df = DataFrame(np.arange(16).reshape(4,4), columns = list('abcd'))
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [101]:
s = Series(np.arange(10,14), index=list('abcd'))
s

a    10
b    11
c    12
d    13
dtype: int32

In [103]:
df + s # index abcd 기준으로 각 broadcasting

Unnamed: 0,a,b,c,d
0,10,12,14,16
1,14,16,18,20
2,18,20,22,24
3,22,24,26,28


#### axis 기준으로 row broadcasting 실행

In [107]:
s = Series(np.arange(10,14))
s

0    10
1    11
2    12
3    13
dtype: int32

In [108]:
df

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [111]:
df + s

Unnamed: 0,a,b,c,d,0,1,2,3
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,,


In [115]:
df.add(s, axis=0) # axis=0 matches s against the row index; plain df + s matched it against the column labels instead

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,15,16,17,18
2,20,21,22,23
3,25,26,27,28


---
# 6. lambda, map, apply

## lambda

- 한 줄로 함수 표현하는 익명 함수 기법
- lisp 언어에서 시작된 기법으로 오늘날 현대 언어에 많이 사용
- *lambda argument : expression*

In [117]:
f = lambda x,y : x+y
f(1,2)

3

In [118]:
# one argument
f = lambda x: x/2
f(3)

1.5

In [119]:
# 이름 할당하지 않는 lambda
(lambda x: x+1)(5)

6

## map

- 함수와 sequence 형 데이터를 인자로 받아
- 각 element마다 입력받은 함수를 적용한 결과를 반환 (Python 3에서는 iterator를 반환하므로 list()로 감싸야 리스트가 됨)
- 일반적으로 함수를 lambda 형태로 표현함
- *map(function, sequence)*

In [120]:
ex = [1,2,3,4,5]
f = lambda x: x**2
list(map(f,ex))

[1, 4, 9, 16, 25]

In [121]:
# 두 개 이상의 argument가 있을 때는 두 개의 sequence형을 써야 함
f = lambda x, y: x+y
list(map(f, ex, ex))

[2, 4, 6, 8, 10]

In [122]:
list(map(lambda x: x+x, ex)) # list로 해야 함

[2, 4, 6, 8, 10]

### Map for Series
- pandas series type data도 map 사용 가능
- function 대신 dict, sequence 형 자료 등으로 대체 가능

In [123]:
s1 = Series(np.arange(10))
s1.head(5)

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [124]:
s1.map(lambda x: x**2).head(5)

0     0
1     1
2     4
3     9
4    16
dtype: int64

In [125]:
# dict type으로 데이터 교체. 없는 값은 NaN
# dummy variable 만들 때 사용
z = {1:'A', 2:'B', 3:'C'}
s1.map(z).head(5)

0    NaN
1      A
2      B
3      C
4    NaN
dtype: object

In [126]:
s2 = Series(np.arange(10, 20))
s2.map(s2).head(5)

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64

### Example - map for series

In [127]:
# Small survey-style sample: one list per column, one position per person.
raw_data = {
    # NOTE(review): the last earn value has no decimal point, unlike the
    # others - confirm that 82089345 is intended.
    'earn': [79571.299, 96396.988, 48710.666, 80478.096, 82089345],
    'height': [73.89, 66.23, 63.77, 63.22, 63.08],
    'sex': ['male', 'female', 'female', 'female', 'female'],
    'race': ['white', 'white', 'white', 'other', 'white'],
    'ed': [16, 16, 16, 16, 17],
    'age': [49, 62, 33, 95, 43],
}
# Column order follows the order in which the keys are written above.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.3,73.89,male,white,16,49
1,96396.99,66.23,female,white,16,62
2,48710.67,63.77,female,white,16,33
3,80478.1,63.22,female,other,16,95
4,82089340.0,63.08,female,white,17,43


In [128]:
df.sex.unique()

array(['male', 'female'], dtype=object)

In [129]:
df['sex_code'] = df.sex.map({'male':0, 'female':1})
df

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,male,white,16,49,0
1,96396.99,66.23,female,white,16,62,1
2,48710.67,63.77,female,white,16,33,1
3,80478.1,63.22,female,other,16,95,1
4,82089340.0,63.08,female,white,17,43,1


### Replace 

- map 함수 기능 중 데이터 변환 기능만 담당
- 데이터 변환 시 많이 사용하는 함수

In [132]:
df.sex.replace(
    {'male':0, 'female':1}
).head()

0    0
1    1
2    1
3    1
4    1
Name: sex, dtype: int64

In [133]:
# Reassign the column instead of calling replace(inplace=True) on `df.sex`:
# an in-place call on a Series pulled out of the frame is a chained
# modification and is not guaranteed to write back into df (it does not under
# pandas Copy-on-Write).
df['sex'] = df['sex'].replace(
    ['male', 'female'],  # values to look for
    [0, 1],  # replacement for each value, matched by position
)
df.head(5)

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,0,white,16,49,0
1,96396.99,66.23,1,white,16,62,1
2,48710.67,63.77,1,white,16,33,1
3,80478.1,63.22,1,other,16,95,1
4,82089340.0,63.08,1,white,17,43,1


## Apply for dataframe

- map과 달리 series 전체(column)에 해당 함수를 적용
- 입력값을 series 데이터로 입력받아 handling 가능


- 각 column 별로 결과값 반환


- 내장 연산 함수 사용할 때도 똑같은 효과
- mean, std etc

In [135]:
df_info = df[['earn', 'height', 'age']]
df_info.head()

Unnamed: 0,earn,height,age
0,79571.3,73.89,49
1,96396.99,66.23,62
2,48710.67,63.77,33
3,80478.1,63.22,95
4,82089340.0,63.08,43


In [137]:
f = lambda x : x.max() - x.min()
df_info.apply(f)

earn      8.204063e+07
height    1.081000e+01
age       6.200000e+01
dtype: float64

In [138]:
df_info.sum()

earn      8.239450e+07
height    3.301900e+02
age       2.820000e+02
dtype: float64

In [139]:
df_info.apply(sum)

earn      8.239450e+07
height    3.301900e+02
age       2.820000e+02
dtype: float64

#### scalar 값 이외에 series 값의 반환 가능

In [142]:
# summary statistics
def f(x):
    """Summarise ``x`` as a two-element Series holding its minimum and maximum.

    ``x`` must provide ``min()`` and ``max()``; the result is labelled
    'min' and 'max', in that order.
    """
    lowest, highest = x.min(), x.max()
    return Series(data=[lowest, highest], index=['min', 'max'])
df_info.apply(f)

Unnamed: 0,earn,height,age
min,48710.67,63.08,33
max,82089340.0,73.89,95


#### applymap: series 단위가 아닌 element 단위로 함수 적용. 단일 series에 apply를 적용할 때와 같은 효과 (pandas 2.1부터는 DataFrame.map 사용 권장)

In [143]:
f = lambda x: -x
df_info.applymap(f).head(5)

Unnamed: 0,earn,height,age
0,-79571.3,-73.89,-49
1,-96396.99,-66.23,-62
2,-48710.67,-63.77,-33
3,-80478.1,-63.22,-95
4,-82089340.0,-63.08,-43


In [144]:
f = lambda x:-x
df_info['earn'].apply(f).head(5)

0   -7.957130e+04
1   -9.639699e+04
2   -4.871067e+04
3   -8.047810e+04
4   -8.208934e+07
Name: earn, dtype: float64

---
# 7. Pandas built-in functions

## describe
- numeric type data의 요약 정보 보여줌

In [145]:
df

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,0,white,16,49,0
1,96396.99,66.23,1,white,16,62,1
2,48710.67,63.77,1,white,16,33,1
3,80478.1,63.22,1,other,16,95,1
4,82089340.0,63.08,1,white,17,43,1


In [147]:
df.describe() #summary statistics

Unnamed: 0,earn,height,sex,ed,age,sex_code
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,16478900.0,66.038,0.8,16.2,56.4,0.8
std,36677360.0,4.569614,0.447214,0.447214,23.995833,0.447214
min,48710.67,63.08,0.0,16.0,33.0,0.0
25%,79571.3,63.22,1.0,16.0,43.0,1.0
50%,80478.1,63.77,1.0,16.0,49.0,1.0
75%,96396.99,66.23,1.0,16.0,62.0,1.0
max,82089340.0,73.89,1.0,17.0,95.0,1.0


## Unique
- series data의 유일한 값을 array(numpy.ndarray)로 반환
- same as *distinct* in DB

In [148]:
df.sex.unique()

array([0, 1], dtype=int64)

In [149]:
np.array(dict(enumerate(df['race'].unique()))) # maps each positional code to its label; np.array() wraps the dict in a 0-d object array

array({0: 'white', 1: 'other'}, dtype=object)

In [151]:
# Integer code for each distinct label: 0, 1, ... in the order unique() returns them.
# Built directly from the number of distinct values instead of round-tripping
# enumerate() through a string-typed NumPy array and parsing the digits back
# (which also raised IndexError when there were no values at all).
value = list(range(len(df['race'].unique())))
value

[0, 1]

In [155]:
# Distinct labels as plain strings, in the order unique() returns them, so the
# i-th label lines up with integer code i. A direct comprehension avoids
# building a 2-D string array just to slice one column back out of it.
key = [str(label) for label in df['race'].unique()]
key

['white', 'other']

In [157]:
# Convert each label string to its integer code.
# The column is reassigned rather than modified with inplace=True, because an
# in-place call on df['race'] is a chained modification that is not guaranteed
# to write back into df (it does not under pandas Copy-on-Write).
df['race'] = df['race'].replace(to_replace=key, value=value)
df

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,0,0,16,49,0
1,96396.99,66.23,1,0,16,62,1
2,48710.67,63.77,1,0,16,33,1
3,80478.1,63.22,1,1,16,95,1
4,82089340.0,63.08,1,0,17,43,1


## Sum
- 기본적인 column, row 값 연산 지원
- sum, mean, min, max, count, median, mad, var etc

In [159]:
df.sum(axis=0) # column 별

earn        8.239450e+07
height      3.301900e+02
sex         4.000000e+00
race        1.000000e+00
ed          8.100000e+01
age         2.820000e+02
sex_code    4.000000e+00
dtype: float64

In [160]:
df.sum(axis=1) # row 별

0    7.971019e+04
1    9.654322e+04
2    4.882544e+04
3    8.065532e+04
4    8.208947e+07
dtype: float64

## isnull
- 각 값이 NaN(null)인지 여부를 True/False로 표시한 같은 모양의 객체 반환

In [161]:
df.isnull()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False


In [162]:
df.isnull().sum()

earn        0
height      0
sex         0
race        0
ed          0
age         0
sex_code    0
dtype: int64

## sort_values
- column 기준으로 sorting

In [163]:
df.sort_values(['age', 'earn'], ascending=True).head(10)

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
2,48710.67,63.77,1,0,16,33,1
4,82089340.0,63.08,1,0,17,43,1
0,79571.3,73.89,0,0,16,49,0
1,96396.99,66.23,1,0,16,62,1
3,80478.1,63.22,1,1,16,95,1


#### Time series data 에서 cumsum, cummax 많이 사용

In [164]:
df.cumsum()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,0.0,0.0,16.0,49.0,0.0
1,175968.3,140.12,1.0,0.0,32.0,111.0,1.0
2,224679.0,203.89,2.0,0.0,48.0,144.0,2.0
3,305157.0,267.11,3.0,1.0,64.0,239.0,3.0
4,82394500.0,330.19,4.0,1.0,81.0,282.0,4.0


In [165]:
df.cummax()

Unnamed: 0,earn,height,sex,race,ed,age,sex_code
0,79571.3,73.89,0.0,0.0,16.0,49.0,0.0
1,96396.99,73.89,1.0,0.0,16.0,62.0,1.0
2,96396.99,73.89,1.0,0.0,16.0,62.0,1.0
3,96396.99,73.89,1.0,1.0,16.0,95.0,1.0
4,82089340.0,73.89,1.0,1.0,17.0,95.0,1.0


## Correlation & Covariance

- corr, cov, corrwith

In [166]:
df.age.corr(df.earn)

-0.31191765928447773

In [167]:
df.age.cov(df.earn)

-274519900.63965

In [168]:
df.corrwith(df.earn)

earn        1.000000
height     -0.361743
sex         0.249950
race       -0.249936
ed          1.000000
age        -0.311918
sex_code    0.249950
dtype: float64