# 함수매핑

## 개별 원소에 함수 매핑

In [2]:
import seaborn as sns

In [3]:
titanic = sns.load_dataset('titanic')
df = titanic.loc[:,['age','fare']]
df['ten']=10
df.head()

Unnamed: 0,age,fare,ten
0,22.0,7.25,10
1,38.0,71.2833,10
2,26.0,7.925,10
3,35.0,53.1,10
4,35.0,8.05,10


In [4]:
#함수 정의
def add_10(n) :
    return n + 10
def add_two_obj(a,b) :
    return a+b

print(add_10(10))
print(add_two_obj(10,10))

20
20


In [5]:
sr1 = df['age'].apply(add_10)
sr1.head()

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

In [6]:
sr2 = df['age'].apply(add_two_obj, b=10)
sr2.head()

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

In [7]:
sr3 = df['age'].apply(lambda x : add_10(x))
# lambda 함수 활용
sr3.head()

0    32.0
1    48.0
2    36.0
3    45.0
4    45.0
Name: age, dtype: float64

In [10]:
#dt frame에 add_10() 함수를 매핑 적용
df_map = df.applymap(add_10)
df_map.head()

Unnamed: 0,age,fare,ten
0,32.0,17.25,20
1,48.0,81.2833,20
2,36.0,17.925,20
3,45.0,63.1,20
4,45.0,18.05,20


## 시리즈 객체에 함수 매핑

In [11]:
#함수정의
def missing_value (series) :
    return series.isnull()

In [12]:
#dt frame에 apply 메소드 적용
result = df.apply(missing_value, axis=0)
result.head()

Unnamed: 0,age,fare,ten
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False


In [13]:
type(result)

pandas.core.frame.DataFrame

In [14]:
def min_max(x) : #최댓값-최솟값
    return x.max()-x.min()

In [15]:
result = df.apply(min_max)
result

age      79.5800
fare    512.3292
ten       0.0000
dtype: float64

In [16]:
type(result)

pandas.core.series.Series

# 데이터프레임 각 행에 함수 매핑

def add_two_obj(a,b):
    return a+b

In [18]:
# dtf 두개 열에 적용
df['add'] = df.apply(lambda x : add_two_obj(x['age'],x['ten']),axis=1)
df.head()

Unnamed: 0,age,fare,ten,add
0,22.0,7.25,10,32.0
1,38.0,71.2833,10,48.0
2,26.0,7.925,10,36.0
3,35.0,53.1,10,45.0
4,35.0,8.05,10,45.0


# 데이터프레임 객체에 함수 매핑

In [19]:
#함수정의
def missing_value (x) : # 각 열의 NaN 찾기
    return x.isnull()

def missing_count (x) : # 각 열의 NaN 개수 세기
    return missing_value(x).sum()

def total_number_missing(x) : # 전체 NaN 개수 세기
    return missing_count(x).sum()

In [25]:
result_df = df.pipe(missing_value)
result_df.head()

Unnamed: 0,age,fare,ten,add
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [23]:
type(result_df)

pandas.core.frame.DataFrame

# pipe()

In [27]:
result_series = df.pipe(missing_count)
result_series

age     177
fare      0
ten       0
add     177
dtype: int64

In [28]:
result_value = df.pipe(total_number_missing)
result_value

354

# 열 재구성
## 열 순서 변경

In [29]:
df = titanic.loc[0:4,'survived':'age']
df

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


In [30]:
columns = list(df.columns.values)
columns

['survived', 'pclass', 'sex', 'age']

In [31]:
columns_sorted = sorted(columns)
df_sorted = df[columns_sorted] #알파벳 순으로 정렬
df_sorted

Unnamed: 0,age,pclass,sex,survived
0,22.0,3,male,0
1,38.0,1,female,1
2,26.0,3,female,1
3,35.0,1,female,1
4,35.0,3,male,0


In [32]:
columns_reversed = list(reversed(columns))
df_reversed = df[columns_reversed] #알파벳 역순으로 정렬
df_reversed

Unnamed: 0,age,sex,pclass,survived
0,22.0,male,3,0
1,38.0,female,1,1
2,26.0,female,3,1
3,35.0,female,1,1
4,35.0,male,3,0


In [33]:
# 사용자가 정한 순서대로 배치학
columns_customed = ['pclass','sex','age','survived']
df_customed = df[columns_customed]
df_customed

Unnamed: 0,pclass,sex,age,survived
0,3,male,22.0,0
1,1,female,38.0,1
2,3,female,26.0,1
3,1,female,35.0,1
4,3,male,35.0,0


## 열 분리

In [34]:
import pandas as pd

In [38]:
df = pd.read_excel('/Users/shindongeun/Documents/sample_data/주가데이터.xlsx')
df.head()

Unnamed: 0,연월일,당일종가,전일종가,시가,고가,저가,거래량
0,2018-07-02,10100,600,10850,10900,10000,137977
1,2018-06-29,10700,300,10550,10900,9990,170253
2,2018-06-28,10400,500,10900,10950,10150,155769
3,2018-06-27,10900,100,10800,11050,10500,133548
4,2018-06-26,10800,350,10900,11000,10700,63039


In [37]:
df['연월일']=df['연월일'].astype('str')
dates = df['연월일'].str.split('-')
dates.head()

0    [2018, 07, 02]
1    [2018, 06, 29]
2    [2018, 06, 28]
3    [2018, 06, 27]
4    [2018, 06, 26]
Name: 연월일, dtype: object

In [40]:
df['연'] = dates.str.get(0)
df['월'] = dates.str.get(1)
df['일'] = dates.str.get(2)
df.head()

Unnamed: 0,연월일,당일종가,전일종가,시가,고가,저가,거래량,연,월,일
0,2018-07-02,10100,600,10850,10900,10000,137977,2018,7,2
1,2018-06-29,10700,300,10550,10900,9990,170253,2018,6,29
2,2018-06-28,10400,500,10900,10950,10150,155769,2018,6,28
3,2018-06-27,10900,100,10800,11050,10500,133548,2018,6,27
4,2018-06-26,10800,350,10900,11000,10700,63039,2018,6,26


## 필터링

In [42]:
mask1 = (titanic.age >=10) & (titanic.age <20)
# 나이가 10~19세 인 승객만 따로 선택
df_teenage = titanic.loc[mask1,:]
df_teenage.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
14,0,3,female,14.0,0,0,7.8542,S,Third,child,False,,Southampton,no,True
22,1,3,female,15.0,0,0,8.0292,Q,Third,child,False,,Queenstown,yes,True
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False
38,0,3,female,18.0,2,0,18.0,S,Third,woman,False,,Southampton,no,False


In [43]:
mask2 = (titanic.age >=10) & (titanic.sex=='female')
df_female_under10 = titanic.loc[mask2,:]
df_female_under10.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [45]:
mask3 = (titanic.age <10) | (titanic.age >=60)
df_under10_morethan60 = titanic.loc[mask3,['age','sex','alone']]
df_under10_morethan60.head()

Unnamed: 0,age,sex,alone
7,2.0,male,False
10,4.0,female,False
16,2.0,male,False
24,8.0,female,False
33,66.0,male,True


In [49]:
pd.set_option('display.max_columns',10)

mask3 = titanic['sibsp']==3
mask4 = titanic['sibsp']==4 #함께 탑승한 사람수
mask5 = titanic['sibsp']==5

In [50]:
df_boolin = titanic[mask3|mask4|mask5]
df_boolin.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,...,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,...,False,,Southampton,no,False
16,0,3,male,2.0,4,...,False,,Queenstown,no,False
24,0,3,female,8.0,3,...,False,,Southampton,no,False
27,0,1,male,19.0,3,...,True,C,Southampton,no,False
50,0,3,male,7.0,4,...,False,,Southampton,no,False
