# DataFrame (Part 2)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample table: one row per (state, year) observation with its 'pop' value.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


## 0. 한 열에 대하여 적용 (Series와 동일)

In [9]:
def count_string(x):
    """Return the length of ``x`` as reported by ``len``.

    Intended for use with ``Series.apply`` on a column of strings, where it
    yields the number of characters in each value.
    """
    n_chars = len(x)
    return n_chars

# Apply count_string to each element of the 'state' column.
df['state'].apply(count_string)

0    4
1    4
2    4
3    6
4    6
Name: state, dtype: int64

In [11]:
# Store the per-row length of 'state' as a new column 'state_len', then display the frame.
# Note that this mutates df in place.
df['state_len']=df['state'].apply(count_string)
df

Unnamed: 0,state,year,pop,state_len
0,Ohio,2000,1.5,4
1,Ohio,2001,1.7,4
2,Ohio,2002,3.6,4
3,Nevada,2001,2.4,6
4,Nevada,2002,2.9,6


In [10]:
# Look up each state name in the dict: 'Ohio' maps to True and 'Nevada' to False.
df['state'].map({'Ohio':True,'Nevada':False})

0     True
1     True
2     True
3    False
4    False
Name: state, dtype: bool

In [8]:
def standardize(x):
    """Return ``x.mean()``.

    NOTE(review): despite the name, nothing is standardized here; the function
    only reduces ``x`` to its mean. A different ``standardize`` is defined
    later in the notebook and replaces this one once that cell runs.
    """
    average = x.mean()
    return average

# Aggregate the 'pop' column with the function above; the result is a single
# scalar (the column mean).
df['pop'].agg(standardize)

2.4200000000000004

## 1. 여러 열에 사용자 함수 적용 (apply, many-to-one)

In [16]:
# Sample table with two value columns, 'pop' and 'pop2', per (state, year) row.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
    pop2=[2.7, 1.5, 3.5, 2.0, 4.8],
)
df = pd.DataFrame(data)
df

Unnamed: 0,state,year,pop,pop2
0,Ohio,2000,1.5,2.7
1,Ohio,2001,1.7,1.5
2,Ohio,2002,3.6,3.5
3,Nevada,2001,2.4,2.0
4,Nevada,2002,2.9,4.8


In [17]:
# Largest value in the 'pop' column.
np.max(df['pop'])

3.6

In [30]:
# With the default axis, apply runs np.max once per column.
df[['pop','pop2']].apply(np.max)

pop     3.6
pop2    4.8
dtype: float64

In [31]:
# First row of the two value columns, returned as a Series indexed by column name.
df[['pop','pop2']].iloc[0]

pop     1.5
pop2    2.7
Name: 0, dtype: float64

In [32]:
# Largest value within the first row of the two value columns.
np.max(df[['pop','pop2']].iloc[0])

2.7

In [34]:
# axis=1 runs np.max once per row instead of once per column.
df[['pop','pop2']].apply(np.max,axis=1)

0    2.7
1    1.7
2    3.6
3    2.4
4    4.8
dtype: float64

In [35]:
# Column label of the largest value in the first row.
# Both columns must be selected ('pop' and 'pop2'); idxmax returns the label
# itself, whereas np.argmax on a Series may return an integer position
# depending on the pandas version.
df[['pop', 'pop2']].iloc[0].idxmax()

'pop2'

## 2. 여러 개 함수 동시 적용 (agg, many-to-one)

In [46]:
# Several reductions at once: one result row per function, one column per input column.
# The reductions are named by string so pandas dispatches to its own
# max/mean/std (std is the sample standard deviation) without relying on how
# NumPy callables are translated; the first row is then labelled 'max'.
df[['pop', 'pop2']].agg(['max', 'mean', 'std'])

Unnamed: 0,pop,pop2
amax,3.6,4.8
mean,2.42,2.9
std,0.864292,1.301922


In [47]:
# Row-wise reductions (axis=1): one result column per function, one row per input row.
# The reductions are named by string so pandas dispatches to its own
# max/mean/std (std is the sample standard deviation) without relying on how
# NumPy callables are translated; the first column is then labelled 'max'.
df[['pop', 'pop2']].agg(['max', 'mean', 'std'], axis=1)

Unnamed: 0,amax,mean,std
0,2.7,2.1,0.848528
1,1.7,1.6,0.141421
2,3.6,3.55,0.070711
3,2.4,2.2,0.282843
4,4.8,3.85,1.343503


## 3. 여러 행/열에 걸친 사용자 함수 (apply, many-to-many)

In [38]:
def standardize(x):
    """Center ``x`` on its mean and scale it by its standard deviation.

    Returns ``(x - x.mean()) / x.std()``; the spread is whatever ``x.std()``
    reports for the object passed in.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

# Call the function directly on one column to get its standardized values.
standardize(df['pop'])

0   -1.064456
1   -0.833052
2    1.365280
3   -0.023140
4    0.555368
Name: pop, dtype: float64

In [41]:
# axis=0 applies standardize to each column independently; the result keeps the frame's shape.
df[['pop','pop2']].apply(standardize,axis=0)

Unnamed: 0,pop,pop2
0,-1.064456,-0.153619
1,-0.833052,-1.075334
2,1.36528,0.460857
3,-0.02314,-0.691286
4,0.555368,1.459381


In [48]:
# Divide every value by its own column's range (max - min); no offset is subtracted first.
df[['pop','pop2']].apply(lambda x: x/(x.max()-x.min()))

Unnamed: 0,pop,pop2
0,0.714286,0.818182
1,0.809524,0.454545
2,1.714286,1.060606
3,1.142857,0.606061
4,1.380952,1.454545


## 4. 모든 값 하나하나에 대하여 사용자 함수 적용 (applymap, one-to-one)

In [52]:
# int() drops the fractional part of a float: 3.3 becomes 3.
int(3.3)

3

In [53]:
# applymap calls int on every individual cell, so each float is truncated to an integer.
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favor of
# DataFrame.map — confirm against the installed version before switching.
df[['pop','pop2']].applymap(int)

Unnamed: 0,pop,pop2
0,1,2
1,1,1
2,3,3
3,2,2
4,2,4
