##### 데이터프레임의 데이터 조작

In [1]:
import numpy as np
import pandas as pd

##### 데이터 개수 세기

In [2]:
# Build the demo Series as float64 from the start: NaN cannot live in an int64
# Series, and relying on the implicit int -> float upcast at assignment time is
# deprecated in recent pandas releases.
s = pd.Series(range(10), dtype=float)
# Label-based assignment of the single missing value used by the count() demo.
s.loc[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [3]:
# count() tallies only the non-NaN entries, so the NaN placed at index 3 is excluded.
s.count()

9

In [4]:
# 4x4 float frame holding 0..15 in row-major order.
df = pd.DataFrame(np.arange(16, dtype=float).reshape(4, 4))
# Knock out a single cell (row 2, column 3) so the column counts differ.
df.at[2, 3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,
3,12.0,13.0,14.0,15.0


In [5]:
# Non-null count per column; column 3 is one short because of the NaN at row 2.
df.count()

0    4
1    4
2    4
3    3
dtype: int64

In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
import seaborn as sns
# Load seaborn's 'titanic' example dataset.
# NOTE(review): presumably fetched from seaborn's online data repository or a local cache — confirm.
titanic = sns.load_dataset('titanic')
# Confirm that the loader returns a pandas DataFrame.
type(titanic)

pandas.core.frame.DataFrame

In [7]:
# Column overview: non-null counts, dtypes and memory usage.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [8]:
# Peek at the first five rows.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [9]:
# Count each (embark_town, survived) combination, most frequent first.
# The counts add up to 889, i.e. rows with a missing embark_town are not counted.
titanic[['embark_town', 'survived']].value_counts()

embark_town  survived
Southampton  0           427
             1           217
Cherbourg    1            93
             0            75
Queenstown   0            47
             1            30
dtype: int64

In [10]:
# Survival rate by sex (rows) and ticket class (columns), rounded to 3 decimals.
# - aggfunc is given as the string 'mean': passing the NumPy callable np.mean is
#   deprecated in recent pandas releases.
# - 'class' is a categorical column, so observed is stated explicitly instead of
#   relying on the (changing) default; every class occurs, so the table is unchanged.
pd.pivot_table(
    data=titanic,
    index='sex',
    columns='class',
    values='survived',
    aggfunc='mean',
    observed=False,
).round(3)

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968,0.921,0.5
male,0.369,0.157,0.135


In [11]:
# survived holds 0/1 integers, so its sum is the number of survivors.
titanic['survived'].sum()

342

In [12]:
# Non-null count per column; age, embarked, deck and embark_town contain missing values.
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [13]:
# Summary statistics of the numeric columns, rounded to 3 decimals.
titanic.describe().round(3)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.384,2.309,29.699,0.523,0.382,32.204
std,0.487,0.836,14.526,1.103,0.806,49.693
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.91
50%,0.0,3.0,28.0,0.0,0.0,14.454
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.329


##### 카테고리 값 세기

In [14]:
# Frequency of each value in the sex column, most frequent first.
titanic.sex.value_counts()

male      577
female    314
Name: sex, dtype: int64

In [15]:
# Frequency of each embarkation town, most frequent first (missing values are not counted).
titanic.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

In [16]:
# Count each (embark_town, sex) combination, most frequent first.
titanic[["embark_town","sex"]].value_counts()

embark_town  sex   
Southampton  male      441
             female    203
Cherbourg    male       95
             female     73
Queenstown   male       41
             female     36
dtype: int64

##### 정렬

In [18]:
# Fix the legacy global NumPy seed so the 100 draws are reproducible.
np.random.seed(2021)
# 100 integers drawn uniformly from 0..5 (upper bound 6 is exclusive).
s2 = pd.Series(np.random.randint(low=0, high=6, size=100))
# Show the last five draws.
s2.iloc[-5:]

95    1
96    1
97    5
98    0
99    5
dtype: int32

In [19]:
# Frequency of each drawn value, most frequent first.
s2.value_counts()

1    21
5    21
2    19
3    18
4    13
0     8
dtype: int64

In [21]:
# Same counts, ordered by the drawn value itself (the index) instead of by frequency.
s2.value_counts().sort_index()

0     8
1    21
2    19
3    18
4    13
5    21
dtype: int64

In [20]:
# Same counts, ordered by frequency in ascending order (sort_values default).
s2.value_counts().sort_values()

0     8
4    13
3    18
2    19
1    21
5    21
dtype: int64

In [23]:
# Descending order: largest counts first.
s2.value_counts().sort_values(ascending=False)

1    21
5    21
2    19
3    18
4    13
0     8
dtype: int64

In [32]:
# Five rows with the highest fare.
titanic.sort_values(by='fare', ascending=False).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
88,1,1,female,23.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False


In [33]:
# Sort by fare, breaking ties by age; the single ascending=False applies to both keys.
titanic.sort_values(by=['fare','age'], ascending=False).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
438,0,1,male,64.0,1,4,263.0,S,First,man,True,C,Southampton,no,False
341,1,1,female,24.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False


##### 행/열 합계

In [34]:
# Re-seed the legacy global generator so the table below is reproducible.
np.random.seed(2021)
# 4 rows x 8 columns of integers drawn uniformly from 0..9.
df2 = pd.DataFrame(np.random.randint(low=0, high=10, size=(4, 8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,4,5,9,0,6,5,8,6
1,6,6,6,1,5,7,1,1
2,5,2,0,3,1,0,2,6
3,4,8,5,1,6,7,5,6


In [38]:
# With no axis given, sum() adds down each column (the result matches axis=0).
df2.sum()

0    19
1    21
2    20
3     5
4    18
5    19
6    16
7    19
dtype: int64

In [40]:
# axis=0: one total per column.
df2.sum(axis=0)

0    19
1    21
2    20
3     5
4    18
5    19
6    16
7    19
dtype: int64

In [41]:
# axis=1: one total per row.
df2.sum(axis=1)

0    43
1    33
2    19
3    42
dtype: int64

In [42]:
# Append a per-row total as a new 'RowSum' column.
# Any existing 'RowSum' column is excluded from the sum (errors='ignore' makes
# the drop a no-op on the first run), so re-running this cell does not add the
# previous total into the new one.
df2['RowSum'] = df2.drop(columns='RowSum', errors='ignore').sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4,5,9,0,6,5,8,6,43
1,6,6,6,1,5,7,1,1,33
2,5,2,0,3,1,0,2,6,19
3,4,8,5,1,6,7,5,6,42


In [43]:
# Append a per-column total as a new 'ColSum' row.
# Any existing 'ColSum' row is excluded from the sum (errors='ignore' makes the
# drop a no-op on the first run), so re-running this cell does not double the totals.
# Adding the row by label enlarges the frame; the recorded output shows the
# values displayed as floats afterwards.
df2.loc['ColSum', :] = df2.drop(index='ColSum', errors='ignore').sum(axis=0)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,RowSum
0,4.0,5.0,9.0,0.0,6.0,5.0,8.0,6.0,43.0
1,6.0,6.0,6.0,1.0,5.0,7.0,1.0,1.0,33.0
2,5.0,2.0,0.0,3.0,1.0,0.0,2.0,6.0,19.0
3,4.0,8.0,5.0,1.0,6.0,7.0,5.0,6.0,42.0
ColSum,19.0,21.0,20.0,5.0,18.0,19.0,16.0,19.0,137.0


##### 연습문제 4.4.3

In [45]:
# 1. Mean passenger age, rounded to one decimal.
np.round(titanic.age.mean(), 1)

29.7

In [54]:
# Mean passenger age, taken from the column-wise means of the whole frame.
# numeric_only=True restricts the mean to numeric/bool columns; without it,
# pandas >= 2.0 raises TypeError on the text and category columns.
titanic.mean(numeric_only=True).round(1).loc['age']

29.7

In [55]:
# Column-wise means of the numeric and boolean columns, rounded to one decimal.
# numeric_only=True is required on pandas >= 2.0, where the text and category
# columns would otherwise raise TypeError.
titanic.mean(numeric_only=True).round(1)

survived       0.4
pclass         2.3
age           29.7
sibsp          0.5
parch          0.4
fare          32.2
adult_male     0.6
alone          0.6
dtype: float64

In [61]:
# Mean age of female passengers: group by sex, average age, pick the 'female' group.
titanic.groupby('sex').age.mean().round(1).loc['female']

27.9

In [63]:
# Same figure via a boolean mask on sex instead of groupby.
np.round(titanic[titanic.sex=='female'].age.mean(), 1)

27.9

In [77]:
# Mean age of first-class female passengers, using one combined boolean mask.
np.round(titanic[(titanic['class'] == 'First') & (titanic['sex'] == 'female')].age.mean(), 1)

34.6

In [78]:
import warnings
warnings.filterwarnings('ignore')

In [80]:
# Mean age of first-class (pclass == 1) female passengers.
# Both conditions are combined into one mask and applied once with .loc.
# Chaining titanic[mask_a][mask_b] applies a full-length mask to an
# already-filtered frame, which forces pandas to reindex the mask and emit a warning.
np.round(titanic.loc[(titanic.pclass == 1) & (titanic.sex == 'female'), 'age'].mean(), 1)

34.6

In [85]:
# Mean age per (class, sex) group, then pick first-class women.
# 'class' is a categorical column, so observed is stated explicitly rather than
# relying on the default, which recent pandas releases deprecate; the selected
# ('First', 'female') group exists in the data, so the result is unchanged.
titanic.groupby(by=['class','sex'], observed=False).age.mean().round(1).loc['First', 'female']

34.6

##### apply 변환

In [87]:
# Small three-column integer frame used for the apply() examples.
df3 = pd.DataFrame(
    dict(
        A=[1, 3, 4, 3, 4],
        B=[2, 3, 1, 2, 3],
        C=[1, 5, 2, 4, 4],
    )
)
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [89]:
# Range (max - min) of each column: with axis=0 the lambda receives one column at a time.
df3.apply(lambda x : x.max() - x.min(), axis=0)

A    3
B    2
C    4
dtype: int64

In [91]:
# Range (max - min) of each row: with axis=1 the lambda receives one row at a time.
df3.apply(lambda x : x.max() - x.min(), axis=1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [93]:
# Label each passenger as adult ('성년', age >= 20) or minor ('미성년').
# The lambda only needs the age, so apply it to that column directly instead of
# row-wise over the whole frame (axis=1), which builds a Series for every row.
# NOTE(review): a missing age compares False against 20, so passengers with an
# unknown age are labelled '미성년' — confirm this is intended.
titanic['성년'] = titanic['age'].apply(lambda age: '성년' if age >= 20 else '미성년')
# Show the last five rows to check the new column against age.
titanic[['age', '성년']].tail()

Unnamed: 0,age,성년
886,27.0,성년
887,19.0,미성년
888,,미성년
889,26.0,성년
890,32.0,성년
