import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 인덱싱 `.loc[]`, `.iloc[]`
- `.loc[]` : 라벨 값 기반의 2차원 인덱싱
- `.iloc[]` : 순서를 나타내는 정수 기반의 2차원 인덱싱

### .loc[]
- df.loc['행']
- df.loc['행', '열']

In [4]:
sample_df = pd.DataFrame(np.arange(10,22).reshape(3, 4),
                        index = ['a', 'b', 'c'],
                        columns = ['A', 'B', 'C', 'D'])
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


행 인덱싱

In [9]:
# 'a'행 찾기
sample_df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [10]:
# 'b', 'c'행 찾기
sample_df.loc[['b', 'c']]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


열 인덱싱

In [11]:
# 'A'열 찾기
sample_df.A

a    10
b    14
c    18
Name: A, dtype: int32

In [12]:
sample_df['A']

a    10
b    14
c    18
Name: A, dtype: int32

In [16]:
# A행이 15보다 큰 열 모두 찾기
sample_df[sample_df.A>15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [17]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [18]:
# 10가져오기
sample_df.loc['a', 'A']

10

In [19]:
# 1, 2행 가져오기
sample_df.loc[:'b']

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17


In [20]:
# 'a'행, 'A'열 가져오기
sample_df.loc['a', 'A']

10

In [21]:
# 14, 18 가져오기
sample_df.loc['b':, 'A']

b    14
c    18
Name: A, dtype: int32

In [22]:
# a행 가져오기
sample_df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [25]:
# 16, 17, 20, 21을 가져오기
sample_df.loc['b':,'C': ]

Unnamed: 0,C,D
b,16,17
c,20,21


### iloc[]
- df.iloc['행 인덱스번지']
- df.iloc['행 인덱스번지', '열 인덱스번지']

In [26]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [27]:
# 11을 가져오고 싶다면
sample_df.iloc[0, 1]

11

In [28]:
# B열 전체 가져오기
sample_df.iloc[:, 1]

a    11
b    15
c    19
Name: B, dtype: int32

In [31]:
# 12, 13 가져오기
sample_df.iloc[0, 2:]

C    12
D    13
Name: a, dtype: int32

In [32]:
# 19, 20 가져오기
sample_df.iloc[2, 1:3]

B    19
C    20
Name: c, dtype: int32

In [33]:
# 19, 20 가져오기
sample_df.iloc[-1, 1:3]

B    19
C    20
Name: c, dtype: int32

In [36]:
# C행의 값에 *2 해서 할당하기
sample_df.iloc[2, :] = sample_df.iloc[2, :]*2
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,36,38,40,42


## 데이터 개수세기 `count`

### series

In [37]:
s = pd.Series(range(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [38]:
s[5]= np.NaN
s[2]= np.NaN
s.count()

8

In [39]:
s

0    0.0
1    1.0
2    NaN
3    3.0
4    4.0
5    NaN
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

NaN값은 세지 않는다

### DataFrame

columns 안의 데이터 개수를 센다

In [40]:
np.random.seed(2)
count_df = pd.DataFrame(np.random.randint(5, size=(4, 4)), dtype=np.float64)
count_df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,4.0
3,4.0,3.0,4.0,2.0


In [41]:
count_df.count()

0    4
1    4
2    4
3    4
dtype: int64

In [42]:
count_df.iloc[0, 0] = np.NaN
count_df.iloc[3, 0] = np.NaN
count_df.iloc[1, 2] = np.NaN
count_df

Unnamed: 0,0,1,2,3
0,,0.0,3.0,2.0
1,3.0,0.0,,1.0
2,3.0,2.0,4.0,4.0
3,,3.0,4.0,2.0


In [43]:
count_df.count()

0    2
1    4
2    3
3    4
dtype: int64

### 타이타닉 데이터

In [45]:
import seaborn as sns
titanic = sns.load_dataset('titanic')

In [47]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [48]:
# 각 열의 데이터 개수 구하기
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

### value_counts()
- series의 attribute
- series의 value를 세준다

In [50]:
# 데이터 프레임에 쓰면 에러난다 AttributeError
# titanic.value_counts()

In [51]:
# 생존자 value_counts()
titanic['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

In [57]:
# pclass value_counts() 후 value만 가져오기
titanic['pclass'].value_counts().values

array([491, 216, 184], dtype=int64)

## 컬럼 생성

In [58]:
# 새로운 컬럼 age_0 추가 후 일괄적으로 0 할당
titanic['age_0'] = 0
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0


In [59]:
# age의 각 값에 10을 곱한 age_by_10 컬럼 생성
titanic['age_by_10'] = titanic['age'] *10
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0


In [60]:
# parch와 sibSp의 값과 1을 더한 family_no 컬럼 생성
titanic['family_no'] = titanic['parch'] + titanic['sibsp'] + 1
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0,1


In [61]:
# age_by_10 컬럼 값에 일괄적으로 + 100 처리
titanic['age_by_10'] = titanic['age_by_10'] + 100
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_by_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,320.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,480.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,360.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,450.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,450.0,1


### 데이터 삭제 `drop`
- df.drop('컬럼명', axis = , inplace= )

In [62]:
# age_0을 삭제
titanic.drop('age_0', axis = 1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_by_10,family_no
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,320.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,480.0,2
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,360.0,1
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,450.0,2
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,450.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,370.0,1
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,290.0,1
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,,4
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,360.0,1


In [63]:
# age_0, age_by_10, family_no 컬럼 삭제 후 원본에 반영
titanic.drop(['age_0', 'age_by_10', 'family_no'], axis = 1, inplace = True)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [65]:
# 0, 1, 2 row 삭제 후 원본에 반영
titanic.drop([0, 1, 2], axis = 0, inplace = True)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


### 인덱스 데이터 타입

In [71]:
print(titanic.index)
print(type(titanic.index) )
print('*'*50)
print(titanic.index.values[:5])
print(type(titanic.index.values) )

Int64Index([  3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
            ...
            881, 882, 883, 884, 885, 886, 887, 888, 889, 890],
           dtype='int64', length=888)
<class 'pandas.core.indexes.numeric.Int64Index'>
**************************************************
[3 4 5 6 7]
<class 'numpy.ndarray'>


인덱스 조작하려면 `df.index.values`해서 np.ndarray형식으로 만들어야 한다

### 인덱스 인덱싱 및 슬라이싱

In [80]:
# 인덱스 5개 꺼내기
titanic.index[:5].values

array([3, 4, 5, 6, 7], dtype=int64)

In [78]:
# 인덱스의 6번째 값 꺼내기
titanic.index[6]

9

In [83]:
# fare컬럼만 가져오기
series_fare = titanic['fare']
print(series_fare.values)
print(type(series_fare))

[ 53.1      8.05     8.4583  51.8625  21.075   11.1333  30.0708  16.7
  26.55     8.05    31.275    7.8542  16.      29.125   13.      18.
   7.225   26.      13.       8.0292  35.5     21.075   31.3875   7.225
 263.       7.8792   7.8958  27.7208 146.5208   7.75    10.5     82.1708
  52.       7.2292   8.05    18.      11.2417   9.475   21.       7.8958
  41.5792   7.8792   8.05    15.5      7.75    21.6792  17.8     39.6875
   7.8     76.7292  26.      61.9792  35.5     10.5      7.2292  27.75
  46.9      7.2292  80.      83.475   27.9     27.7208  15.2458  10.5
   8.1583   7.925    8.6625  10.5     46.9     73.5     14.4542  56.4958
   7.65     7.8958   8.05    29.      12.475    9.       9.5      7.7875
  47.1     10.5     15.85    34.375    8.05   263.       8.05     8.05
   7.8542  61.175   20.575    7.25     8.05    34.6542  63.3583  23.
  26.       7.8958   7.8958  77.2875   8.6542   7.925    7.8958   7.65
   7.775    7.8958  24.15    52.      14.4542   8.05     9.825   14.4583

In [87]:
# fare의 max, min, sum / fare 10% 할인
print('max', series_fare.max())
print('min', series_fare.min())
print('sum', series_fare.sum())
print('*'* 50)
print(series_fare * 0.9)

max 512.3292
min 0.0
sum 28607.491
**************************************************
3      47.79000
4       7.24500
5       7.61247
6      46.67625
7      18.96750
         ...   
886    11.70000
887    27.00000
888    21.10500
889    27.00000
890     6.97500
Name: fare, Length: 888, dtype: float64


### `reset_index()`
- 새로운 인덱스를 할당. 기존 인덱스는 index라는 새로운 컬럼으로 추가된다.
- 기존 인덱스를 value로 사용하고 싶을 때 사용한다.
- inplace = True 조건을 줘야 원본이 변경된다.

In [94]:
titanic.reset_index(inplace = False)
titanic_reset_index_df = titanic.reset_index(inplace = False)
titanic_reset_index_df

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
1,4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
3,6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
4,7,0,3,male,2.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
884,887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
885,888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
886,889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [95]:
# pclass, fare 가져오기
titanic_reset_index_df[['pclass', 'fare']].head()

Unnamed: 0,pclass,fare
0,1,53.1
1,3,8.05
2,3,8.4583
3,1,51.8625
4,3,21.075


In [98]:
# pclass가 3등급인 것만 가져오기
titanic_reset_index_df[titanic_reset_index_df['pclass'] == 3].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
4,7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
5,8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
7,10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False


In [100]:
# age가 60 이상인 정보만 추출
titanic_reset_index_df[titanic_reset_index_df['age']>= 60].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
30,33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
51,54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False
93,96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
113,116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
167,170,0,1,male,61.0,0,0,33.5,S,First,man,True,B,Southampton,no,True


In [102]:
# age가 60 이상인 사람들의 pclass, survived, who만 추출
titanic_reset_index_df.loc[titanic_reset_index_df['age']>= 60, 
                           ['pclass', 'survived', 'who']].head()

Unnamed: 0,pclass,survived,who
30,2,0,man
51,1,0,man
93,1,0,man
113,3,0,man
167,1,0,man


### 여러 조건을 이용해서 boolean 인덱스 만들기
- and `&`
- or `|`
- not `!`, `~`

In [104]:
# age 60이상이고 선실등급이 1등급, 성별 여자 (1)
titanic_reset_index_df[(titanic_reset_index_df['age']>=60) &
                      (titanic_reset_index_df['pclass'] == 1) &
                      (titanic_reset_index_df['sex'] == 'female')]

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
272,275,1,1,female,63.0,1,0,77.9583,S,First,woman,False,D,Southampton,yes,False
363,366,1,1,female,60.0,1,0,75.25,C,First,woman,False,D,Cherbourg,yes,False
826,829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


In [106]:
# age 60이상이고 선실등급이 1등급, 성별 여자 (2)
# 변수 만들어서 해보기
age = titanic_reset_index_df['age']>=60
pclass = titanic_reset_index_df['pclass'] == 1
sex = titanic_reset_index_df['sex']=='female'
titanic_reset_index_df[age & pclass & sex]

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
272,275,1,1,female,63.0,1,0,77.9583,S,First,woman,False,D,Southampton,yes,False
363,366,1,1,female,60.0,1,0,75.25,C,First,woman,False,D,Cherbourg,yes,False
826,829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


## 정렬
- sort_index()
- sort_values()

데이터프레임만들기

In [108]:
np.random.seed(100)
sort_df = pd.DataFrame(np.random.randint(10, size = (6, 4)))
sort_df

Unnamed: 0,0,1,2,3
0,8,8,3,7
1,7,0,4,2
2,5,2,2,2
3,1,0,8,4
4,0,9,6,2
5,4,1,5,3


In [109]:
sort_df.columns = ['A', 'B', 'C', 'D']
sort_df.index = pd.date_range('20201014', periods=6)
sort_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


In [110]:
# 순열 랜덤 치환
random_date = np.random.permutation(sort_df.index)
random_date

array(['2020-10-14T00:00:00.000000000', '2020-10-16T00:00:00.000000000',
       '2020-10-15T00:00:00.000000000', '2020-10-17T00:00:00.000000000',
       '2020-10-19T00:00:00.000000000', '2020-10-18T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [111]:
sort_df2 = sort_df.reindex(index=random_date, 
                          columns = ['B', 'A', 'C', 'D'])
sort_df2

Unnamed: 0,B,A,C,D
2020-10-14,8,8,3,7
2020-10-16,2,5,2,2
2020-10-15,0,7,4,2
2020-10-17,0,1,8,4
2020-10-19,1,4,5,3
2020-10-18,9,0,6,2


In [115]:
# 행 인덱스로 정렬
sort_df2.sort_index(axis=0)

Unnamed: 0,B,A,C,D
2020-10-14,8,8,3,7
2020-10-15,0,7,4,2
2020-10-16,2,5,2,2
2020-10-17,0,1,8,4
2020-10-18,9,0,6,2
2020-10-19,1,4,5,3


In [116]:
# 컬럼 인덱스로 정렬
sort_df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-16,5,2,2,2
2020-10-15,7,0,4,2
2020-10-17,1,0,8,4
2020-10-19,4,1,5,3
2020-10-18,0,9,6,2


In [118]:
# 값을 기준으로 정렬
sort_df2.sort_values(by='B', ascending = False)

Unnamed: 0,B,A,C,D
2020-10-18,9,0,6,2
2020-10-14,8,8,3,7
2020-10-16,2,5,2,2
2020-10-19,1,4,5,3
2020-10-15,0,7,4,2
2020-10-17,0,1,8,4


In [121]:
# 값을 기준으로 정렬
sort_df2.sort_values(by=['D', 'B'], ascending = True)

Unnamed: 0,B,A,C,D
2020-10-15,0,7,4,2
2020-10-16,2,5,2,2
2020-10-18,9,0,6,2
2020-10-19,1,4,5,3
2020-10-17,0,1,8,4
2020-10-14,8,8,3,7


### 행/ 열의 합 `sum(axis = )`

In [122]:
sort_df2.sum(axis = 0)

B    20
A    25
C    28
D    20
dtype: int64

In [126]:
sort_df2.sum(axis = 1)

2020-10-14    26
2020-10-16    11
2020-10-15    13
2020-10-17    13
2020-10-19    13
2020-10-18    17
dtype: int64

axis가 헷갈리는데 이렇게 생각하자  
axis = 0은 여러 row를 합친다  
axis = 1은 여러 col을 합친다

In [127]:
# 날짜별로 합을 구해서 새 컬럼 추가
sort_df2['row sum'] = sort_df2.sum(axis =1)
sort_df2

Unnamed: 0,B,A,C,D,row sum
2020-10-14,8,8,3,7,26
2020-10-16,2,5,2,2,11
2020-10-15,0,7,4,2,13
2020-10-17,0,1,8,4,13
2020-10-19,1,4,5,3,13
2020-10-18,9,0,6,2,17


In [129]:
# 컬럼 별로 합을 구해서 새 row 추가
sort_df2.loc['col sum', :] = sort_df2.sum(axis = 0)
sort_df2

Unnamed: 0,B,A,C,D,row sum,col sum
2020-10-14 00:00:00,8.0,8.0,3.0,7.0,26.0,
2020-10-16 00:00:00,2.0,5.0,2.0,2.0,11.0,
2020-10-15 00:00:00,0.0,7.0,4.0,2.0,13.0,
2020-10-17 00:00:00,0.0,1.0,8.0,4.0,13.0,
2020-10-19 00:00:00,1.0,4.0,5.0,3.0,13.0,
2020-10-18 00:00:00,9.0,0.0,6.0,2.0,17.0,
col sum,20.0,25.0,28.0,20.0,93.0,0.0


### 실습
- 타이타닉 호 승객의 평균 나이를 구하라
- 타이타닉 호 승객 중 여성 승객의 평균 나이를 구하라
- 타이타닉 호 승객 중 1등실 선실의 여성 승객의 평균 나이를 구하라

In [132]:
titanic['age'].mean()

29.703473980309422

In [136]:
titanic.loc[titanic['sex'] == 'female', 'age'].mean()

27.884169884169886

In [137]:
titanic.loc[(titanic['sex'] == 'female')&(titanic['pclass'] == 1),
            'age'].mean()

34.57142857142857

## apply 변환
- 행이나 열 단위로 복잡한 데이터 가공이 필요한 경우 사용
- lambda식을 알아야 한다
- apply 함수는 인자로 함수를 넘겨받을 수 있다
- loop작업을 대신할 수 있다

### lambda
x를 제곱해주는 함수를 만들어보자

In [138]:
def get_square(x):
    return(x**2)

In [139]:
print(get_square(2))

4


lambda 이용

In [140]:
lambda_square = lambda x : x**2

In [141]:
print(lambda_square(2))

4


In [142]:
np.random.seed(100)
apply_df = pd.DataFrame(np.random.randint(0, 10, (6, 4)))
apply_df.columns = ['A', 'B', 'C', 'D']
apply_df.index = pd.date_range('20201014', periods=6)
apply_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


`.apply(func, axis = )`  
axis = 0 : 행에 대한 apply    
axis = 1 : 열에 대한 apply

In [145]:
# 각 행의 최대값 - 최소값을 구해 새로운 컬럼 추가
func = lambda x : x.max() - x.min()
apply_df['max-min'] = apply_df.apply(func, axis = 1)
apply_df

Unnamed: 0,A,B,C,D,max-min
2020-10-14,8,8,3,7,5
2020-10-15,7,0,4,2,7
2020-10-16,5,2,2,2,3
2020-10-17,1,0,8,4,8
2020-10-18,0,9,6,2,9
2020-10-19,4,1,5,3,4


### [실습] 타이타닉에 apply 적용해보기 (lambda)

- embark_town의 문자열 개수를 별도의 컬럼인 embark_len 컬럼을 추가
- if ~ else 활용하여 나이가 15세 이하면 child, 그렇지 않으면 adult로 구분하는 child_adult 컬럼 추가

In [146]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [148]:
# embark_town의 문자열 개수를 별도의 컬럼인 embark_len 컬럼을 추가
func = lambda x: len(str(x))
titanic['embark_len'] = titanic['embark_town'].apply(func)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,embark_len
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,11
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,11
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True,10
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,11
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False,11


In [152]:
# if ~ else 활용하여 나이가 15세 이하면 child, 
# 그렇지 않으면 adult로 구분하는 child_adult 컬럼 추가

func = lambda x: 'child' if x<=15 else 'adult'
titanic['child_adult'] = titanic['age'].apply(func)
titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,embark_len,child_adult
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,11,adult
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,11,adult
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True,10,adult
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True,11,adult
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False,11,child
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False,11,adult
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False,9,child
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False,11,child
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True,11,adult
12,0,3,male,20.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,11,adult
