In [146]:
import pandas as pd

## 예제 8-4  사용자 정의 함수를 apply 메소드로  처리하기

In [147]:
import numpy as np

In [148]:
df = pd.DataFrame ({'a' : np.random.randn(6),
                 'b' : ['철수', '영희'] * 3,
                 'c' : np.random.randn(6)})

In [149]:
df

Unnamed: 0,a,b,c
0,0.145469,철수,-0.405284
1,1.963585,영희,-0.739328
2,-1.603775,철수,1.395074
3,1.090697,영희,-0.435323
4,-0.215842,철수,0.191742
5,0.9487,영희,0.689989


In [150]:
def my_test(a, c):
    return a % c

In [151]:
# lambda 함수는 행단위로 데이터를 처리하는 사용자 함수를 작성한다.
df['Value'] = df.apply(lambda df: my_test(df['a'], df['c']), axis=1)
# lambda df: my_test(df['a'], df['c']), axis=1
#   입력값:출력값

In [152]:
df

Unnamed: 0,a,b,c,Value
0,0.145469,철수,-0.405284,-0.259815
1,1.963585,영희,-0.739328,-0.2544
2,-1.603775,철수,1.395074,1.186374
3,1.090697,영희,-0.435323,-0.215271
4,-0.215842,철수,0.191742,0.167642
5,0.9487,영희,0.689989,0.258711


In [153]:
df.a % df.c

0   -0.259815
1   -0.254400
2    1.186374
3   -0.215271
4    0.167642
5    0.258711
dtype: float64

In [154]:
df['a']

0    0.145469
1    1.963585
2   -1.603775
3    1.090697
4   -0.215842
5    0.948700
Name: a, dtype: float64

In [109]:
# 일반함수는 데이터 프레임 전체를 넘겨 열 전체의 값을 반환하는 형태로 작성한다. 이쪽을 추천!
def my_test2(df):
    return df['a'] % df['c']

In [110]:
df['Value 2'] = df.apply(my_test2, axis=1)

In [111]:
df

Unnamed: 0,a,b,c,Value,Value 2
0,-0.780143,철수,0.261421,0.00412,0.00412
1,-1.3947,영희,1.685737,0.291037,0.291037
2,-0.487244,철수,1.104486,0.617242,0.617242
3,-2.256125,영희,-0.280827,-0.009509,-0.009509
4,-1.541753,철수,-0.68207,-0.177613,-0.177613
5,-0.972073,영희,-1.233346,-0.972073,-0.972073


In [112]:
%timeit df['Value3'] = df.apply(lambda df: my_test(df['a'], df['c']), axis=1)

613 µs ± 11.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [113]:
%timeit df['Value4'] = df.apply(my_test2, axis=1)

619 µs ± 14.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [114]:
sample = pd.DataFrame({'임의의값':[10,100,40] })

In [115]:
sample

Unnamed: 0,임의의값
0,10
1,100
2,40


In [116]:
# 기존의 데이터 3개와 10만개의 데이터를 곱하여 30만개의 데이터를 생성한다.
# 인덱스는 갱신되어야 하므로 reset_index 함수 인자에 drop=True를 하면 새로운 인덱스로 갱신이된다.
sample = pd.concat([sample]*100000).reset_index(drop=True)

In [117]:
sample.shape

(300000, 1)

In [118]:
sample.head(10)

Unnamed: 0,임의의값
0,10
1,100
2,40
3,10
4,100
5,40
6,10
7,100
8,40
9,10


In [119]:
sample.shape

(300000, 1)

In [120]:
%timeit sample['임의의값'].apply(lambda x: np.nan if x < 90 else x)

72.1 ms ± 821 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [121]:
# 자기자신의 값은 변경되지 않으므로 추후 복제해서 적용해 본다.
sample.head()

Unnamed: 0,임의의값
0,10
1,100
2,40
3,10
4,100


In [122]:
%timeit sample['임의의값'].mask(sample['임의의값'] < 90, np.nan)

3.24 ms ± 210 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [123]:
sample1 = sample.copy()

In [124]:
sample1 = sample1['임의의값'].apply(lambda x: np.nan if x < 90 else x)

In [125]:
sample1.head()

0      NaN
1    100.0
2      NaN
3      NaN
4    100.0
Name: 임의의값, dtype: float64

In [126]:
sample2 = sample.copy()

In [127]:
sample2 = sample2['임의의값'].mask(sample['임의의값'] < 90, np.nan)

In [128]:
sample2.head()

0      NaN
1    100.0
2      NaN
3      NaN
4    100.0
Name: 임의의값, dtype: float64

In [129]:
(sample1 == sample2).shape

(300000,)

In [130]:
sample1.isnull().sum(), sample1.notnull().sum()

(200000, 100000)

In [131]:
(sample1 == sample2).sum()

100000

## 예제 8-5  사용자 정의 함수를 map,applymap메소드로  처리하기

In [132]:
import numpy as np

In [133]:
ser = pd.Series(np.random.randn(6))

In [134]:
ser

0    0.426036
1   -0.305476
2    1.097504
3    0.983715
4    1.477194
5   -0.265940
dtype: float64

In [135]:
df = pd.DataFrame(ser,columns=['관측값_A'])
df

Unnamed: 0,관측값_A
0,0.426036
1,-0.305476
2,1.097504
3,0.983715
4,1.477194
5,-0.26594


In [136]:
def map_test(a):
    print(" 원소별 처리")
    return np.abs(a)

In [37]:
# 열거형객체.map(함수) : 열거형 객체의 각 인수에 함수를 매핑하여 적용한다.
ser.map(map_test)

 원소별 처리
 원소별 처리
 원소별 처리
 원소별 처리
 원소별 처리
 원소별 처리


0    1.184446
1    2.217372
2    0.566503
3    0.390657
4    0.863537
5    0.115466
dtype: float64

In [38]:
s = pd.Series(ser.map(map_test))

 원소별 처리
 원소별 처리
 원소별 처리
 원소별 처리
 원소별 처리
 원소별 처리


In [39]:
s

0    1.184446
1    2.217372
2    0.566503
3    0.390657
4    0.863537
5    0.115466
dtype: float64

In [40]:
df['관측값_A_절대값'] = s
df

Unnamed: 0,관측값_A,관측값_A_절대값
0,-1.184446,1.184446
1,2.217372,2.217372
2,-0.566503,0.566503
3,-0.390657,0.390657
4,-0.863537,0.863537
5,0.115466,0.115466


## 예제 8-6 pipe 메소드 처리하기

In [139]:
import numpy as np

In [140]:
df = pd.DataFrame({"name": ['김상갑', '임종문', '조현웅'],
                   "program language": [np.nan, 'Python', 'Scala'],
                   "born": [pd.NaT, pd.Timestamp("1966-04-25"),
                             pd.NaT]})

In [141]:
df

Unnamed: 0,name,program language,born
0,김상갑,,NaT
1,임종문,Python,1966-04-25
2,조현웅,Scala,NaT


In [142]:
def name_length(df) :
    df['length'] = df.name.str.len()
    return df

In [143]:
# 판다스에서 pipe라는 함수로도 사용자 정의 함수를 지정할 수 있다.

In [144]:
df.pipe(name_length)

Unnamed: 0,name,program language,born,length
0,김상갑,,NaT,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,NaT,3


In [47]:
def born_fillna(df) :
    df['born'] = df['born'].fillna(pd.Timestamp("1967-04-25"))
    return df

In [48]:
df.pipe(born_fillna)

Unnamed: 0,name,program language,born,length
0,김상갑,,1967-04-25,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,1967-04-25,3


In [49]:
def pl_fillna(df) :
    df['program language'] = df['program language'].fillna("Java")
    return df

In [50]:
df.pipe(pl_fillna)

Unnamed: 0,name,program language,born,length
0,김상갑,Java,1967-04-25,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,1967-04-25,3


In [51]:
df1 = pd.DataFrame({"name": ['김상갑', '임종문', '조현웅'],
                   "program language": [np.nan, 'Python', 'Scala'],
                   "born": [pd.NaT, pd.Timestamp("1966-04-25"),
                             pd.NaT]})

In [52]:
df1

Unnamed: 0,name,program language,born
0,김상갑,,NaT
1,임종문,Python,1966-04-25
2,조현웅,Scala,NaT


In [53]:
df1.pipe(name_length).pipe(born_fillna).pipe(pl_fillna)
# 처리순서    1      .        2        .        3
# 처리순서의 결과값을 바탕으로 다음 처리순서를 진행한다.

Unnamed: 0,name,program language,born,length
0,김상갑,Java,1967-04-25,3
1,임종문,Python,1966-04-25,3
2,조현웅,Scala,1967-04-25,3


In [54]:
df2 = pd.DataFrame()

In [55]:
df2['name'] = ['은옥찬', '은석찬', '은옥주']
df2['gender'] = ['Male', 'Male', 'Female']
df2['age'] = [31, 32, 19]

In [56]:
df2

Unnamed: 0,name,gender,age
0,은옥찬,Male,31
1,은석찬,Male,32
2,은옥주,Female,19


In [57]:
def mean_age_by_group(dataframe, col):
    return dataframe.groupby(col).mean()

In [58]:
mean_age_by_group(df2, 'gender')

Unnamed: 0_level_0,age
gender,Unnamed: 1_level_1
Female,19.0
Male,31.5


In [59]:
def uppercase_column_name(dataframe):
    dataframe.columns = dataframe.columns.str.upper()
    return dataframe

In [60]:
uppercase_column_name(df2)

Unnamed: 0,NAME,GENDER,AGE
0,은옥찬,Male,31
1,은석찬,Male,32
2,은옥주,Female,19


In [61]:
df3 = pd.DataFrame()

In [62]:
df3['name'] = ['구옥찬', '구석찬', '구옥주']
df3['gender'] = ['Male', 'Male', 'Female']
df3['age'] = [31, 32, 19]

In [63]:
(df3.pipe(mean_age_by_group, col='gender')
   .pipe(uppercase_column_name))

Unnamed: 0_level_0,AGE
gender,Unnamed: 1_level_1
Female,19.0
Male,31.5
