# 판다스
---
[pandas](https://pandas.pydata.org/)
- Pandas(판다스): 데이터 조작 및 분석을 위한 파이썬 프로그래밍 라이브러리

In [1]:
# !pip install pandas

In [2]:
import numpy as np
import pandas as pd

print(pd.__version__)

1.3.5


### Series와 DataFrame

### Series
**Series 객체**<br>
- 1차원 배열 구조와 인덱스(index)를 가짐
- dtype 속성, shape 속성 등

In [3]:
sr = pd.Series([1, 2, 3, 4, 5], name='Apple')
sr

0    1
1    2
2    3
3    4
4    5
Name: Apple, dtype: int64

In [4]:
sr.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
sr.name

'Apple'

In [6]:
sr.dtype

dtype('int64')

In [7]:
sr.shape

(5,)

In [8]:
type(sr)

pandas.core.series.Series

In [9]:
sr[1]

2

In [10]:
sr[1:3]

1    2
2    3
Name: Apple, dtype: int64

In [11]:
sr = pd.Series([1, 2, 3, 4, 5], name='Apple', index=['a', 'b', 'c', 'd', 'e'])
sr

a    1
b    2
c    3
d    4
e    5
Name: Apple, dtype: int64

In [12]:
sr['b']

2

In [13]:
sr.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [14]:
sr.values

array([1, 2, 3, 4, 5])

In [15]:
type(sr.values)

numpy.ndarray

In [16]:
sr.to_numpy()

array([1, 2, 3, 4, 5])

In [17]:
# DataFrame
sr.reset_index()

Unnamed: 0,index,Apple
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [18]:
type(sr.reset_index())

pandas.core.frame.DataFrame

In [19]:
# np.nan: 빈 값
sr = pd.Series([1, np.nan, 2, 3, np.nan, 4, 5])
sr

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64

In [20]:
# Fancy Indexing
sr[[1, 2, 4]]

1    NaN
2    2.0
4    NaN
dtype: float64

In [21]:
# Boolean Indexing
idx = [False, True, True, False, True, False, False]
sr[idx]

1    NaN
2    2.0
4    NaN
dtype: float64

In [22]:
sr[sr > 3]

5    4.0
6    5.0
dtype: float64

### 결측치, 이상치 처리
결측치(Missing value): `isna()`, `isnull()`

In [23]:
# isna(), isnull()
sr[sr.isna()]

1   NaN
4   NaN
dtype: float64

In [24]:
sr.isna()

0    False
1     True
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [25]:
# True의 개수
sr.isna().sum()

2

In [26]:
x = sr.copy()
x[x.isna()] = x.mean()

In [27]:
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [28]:
# 반환값을 변수에 다시 할당
y = sr.copy()
y = y.dropna()

In [29]:
y

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [30]:
# inplace=True 사용
z = sr.copy()
z.dropna(inplace=True)

In [31]:
z

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [32]:
# 결측치를 평균값으로 대체
x = sr.copy()
x[x.isna()] = x.mean()
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [33]:
# 결측치를 평균값으로 대체
w = sr.copy()
w = w.fillna(w.mean())
w

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

### 슬라이싱

In [34]:
sr = pd.Series([10, 15, 12, 17, 13], 
               index=['john', 'eva', 'james', 'liam', 'zoe'])
sr

john     10
eva      15
james    12
liam     17
zoe      13
dtype: int64

In [35]:
sr[1:4]

eva      15
james    12
liam     17
dtype: int64

In [36]:
sr['eva':'liam']

eva      15
james    12
liam     17
dtype: int64

In [37]:
sr[2:]

james    12
liam     17
zoe      13
dtype: int64

In [38]:
sr[0:-1]

john     10
eva      15
james    12
liam     17
dtype: int64

In [39]:
sr[:]

john     10
eva      15
james    12
liam     17
zoe      13
dtype: int64

In [40]:
sr[::-1]

zoe      13
liam     17
james    12
eva      15
john     10
dtype: int64

In [41]:
# 오름차순
sr.sort_values()

john     10
james    12
zoe      13
eva      15
liam     17
dtype: int64

In [42]:
# 내림차순
sr.sort_values(ascending=False)

liam     17
eva      15
zoe      13
james    12
john     10
dtype: int64

In [43]:
# Top 3
sr.sort_values(ascending=False)[:3]

liam    17
eva     15
zoe     13
dtype: int64

In [44]:
# 이름순
sr.sort_index()

eva      15
james    12
john     10
liam     17
zoe      13
dtype: int64

### DataFrame
**DataFrame 속성**<br>
- `shape`: 데이터의 행과 열의 수를 튜플로 반환
- `dtypes`: 각 열의 데이터 타입
- `columns`: 컬럼명
- `index`: 행의 인덱스

**DataFrame 함수**<br>
- `head()`: 데이터 프레임의 처음부터 설정한 개수만큼 표시
- `tail()`: 데이터 프레임의 끝에서 설정한 개수만큼 표시
- `info()`: 데이터 프레임의 정보를 표시
- `describe()`: 각 열의 통계 정보를 표시

- 리스트

In [45]:
doc = [['Joe', 20, 85.10, 'A', 'Swimming'],
        ['Nat', 21, 77.80, 'B', 'Reading'],
        ['Harry', 19, 91.54, 'A', 'Music'],
        ['Sam', 20, 88.78, 'A', 'Painting'],
        ['Monica', 22, 60.55, 'B', 'Dancing']]

c_name = ['Name', 'Age', 'Marks', 'Grade', 'Hobby']
idx = ['s1', 's2', 's3', 's4', 's5']

In [46]:
df = pd.DataFrame(doc, columns=c_name, index=idx)
df.shape

(5, 5)

In [47]:
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swimming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


- 딕셔너리

In [48]:
doc = {'Name' :['Joe','Nat','Harry','Sam','Monica',],
       'Age':[20, 21, 19, 20, 22],
       'Marks':[85.10, 77.80, 91.54, 88.78, 60.55],
       'Grade':['A', 'B', 'A', 'A', 'B',],
       'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}

In [49]:
df = pd.DataFrame(doc)
df.shape

(5, 5)

In [50]:
df.head(3)

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music


In [51]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [52]:
df.dtypes

Name      object
Age        int64
Marks    float64
Grade     object
Hobby     object
dtype: object

In [53]:
x = df.dtypes
x

Name      object
Age        int64
Marks    float64
Grade     object
Hobby     object
dtype: object

In [54]:
x['Name']

dtype('O')

In [55]:
df.columns[[0, 2, 3]]

Index(['Name', 'Marks', 'Grade'], dtype='object')

In [56]:
df[df.columns[[0, 2, 3]]]

Unnamed: 0,Name,Marks,Grade
0,Joe,85.1,A
1,Nat,77.8,B
2,Harry,91.54,A
3,Sam,88.78,A
4,Monica,60.55,B


In [57]:
df[['Name', 'Marks', 'Grade']]

Unnamed: 0,Name,Marks,Grade
0,Joe,85.1,A
1,Nat,77.8,B
2,Harry,91.54,A
3,Sam,88.78,A
4,Monica,60.55,B


In [58]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   5 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


- 컬럼 다루기

In [60]:
doc = {'Name' :['Joe','Nat','Harry','Sam','Monica',],
       'Age':[20, 21, 19, 20, 22],
       'Marks':[85.10, 77.80, 91.54, 88.78, 60.55],
       'Grade':['A', 'B', 'A', 'A', 'B',],
       'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}

In [61]:
df = pd.DataFrame(doc)
df.shape

(5, 5)

In [62]:
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   5 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


In [64]:
df['Name']

0       Joe
1       Nat
2     Harry
3       Sam
4    Monica
Name: Name, dtype: object

In [65]:
df[['Name', 'Age']]

Unnamed: 0,Name,Age
0,Joe,20
1,Nat,21
2,Harry,19
3,Sam,20
4,Monica,22


In [66]:
# DataFrame
df[['Age']]

Unnamed: 0,Age
0,20
1,21
2,19
3,20
4,22


In [67]:
# Series
df['Age']

0    20
1    21
2    19
3    20
4    22
Name: Age, dtype: int64

In [68]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

- 컬럼명 바꾸기 방법-1

In [69]:
df.columns = ['Name', 'Age', 'Score', 'Grade', 'Hobby']

In [70]:
df

Unnamed: 0,Name,Age,Score,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


- 컬럼명 바꾸기 방법-2

In [71]:
df.rename(columns={'Score': 'Marks', 'Hobby': 'etc'})

Unnamed: 0,Name,Age,Marks,Grade,etc
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [72]:
df

Unnamed: 0,Name,Age,Score,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


### 파일 입출력

- gdown

In [73]:
# !pip install gdown

In [74]:
import gdown

In [75]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc.csv

In [76]:
df = pd.read_csv('doc.csv')
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [77]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_idx.csv

In [78]:
df = pd.read_csv('doc_idx.csv', index_col=0)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swmming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


In [79]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_na.csv

In [80]:
df = pd.read_csv('doc_na.csv', 
                 index_col=0, 
                 na_values=['?', '*', '-'])
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20.0,,A,Swmming
s2,Nat,21.0,77.8,B,Reading
s3,Harry,,91.54,A,Music
s4,Sam,20.0,88.78,A,
s5,Monica,22.0,60.55,B,Dancing


In [81]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/titanic.csv

In [82]:
titanic = pd.read_csv('titanic.csv')
titanic.shape

(891, 12)

In [83]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [84]:
# 원본 복사
df = titanic.copy()

In [85]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [86]:
df.columns = [col.lower() for col in df.columns]
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [87]:
# age, cabin, embarked 결측치 존재
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    object 
 2   pclass       891 non-null    object 
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 83.7+ KB


In [88]:
# 통계적 수치, 최대값, 최소값
df.describe()

Unnamed: 0,passengerid,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,446.0,29.699118,0.523008,0.381594,32.204208
std,257.353842,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,223.5,20.125,0.0,0.0,7.9104
50%,446.0,28.0,0.0,0.0,14.4542
75%,668.5,38.0,1.0,0.0,31.0
max,891.0,80.0,8.0,6.0,512.3292


In [89]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passengerid,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [90]:
# 타이타닉호가 방문한 항구 이름
df['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [91]:
df['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [92]:
# 성별
df['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [93]:
# 생존 여부
df['survived'].value_counts()

lost     549
saved    342
Name: survived, dtype: int64

In [94]:
# 객실등급
df['pclass'].value_counts()

3rd    491
1st    216
2nd    184
Name: pclass, dtype: int64

In [95]:
# 결측치 개수 계산
df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

### 인덱싱(Indexing)과 슬라이싱(Slicing)
**인덱싱(Indexing)**<br>
- 데이터 프레임의 특정 위치의 요소에 접근

**슬라이싱(Slicing)**<br>
- 특정 부분의 데이터들을 추출하는 것
- `loc`
- `iloc`

In [96]:
# 인덱싱
df.loc[5, 'pclass']

'3rd'

In [97]:
df.loc[5:10, 'pclass']

5     3rd
6     1st
7     3rd
8     3rd
9     2nd
10    3rd
Name: pclass, dtype: object

In [98]:
df.loc[5:10, ['pclass', 'name', 'survived']]

Unnamed: 0,pclass,name,survived
5,3rd,"Moran, Mr. James",lost
6,1st,"McCarthy, Mr. Timothy J",lost
7,3rd,"Palsson, Master. Gosta Leonard",lost
8,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",saved
9,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",saved
10,3rd,"Sandstrom, Miss. Marguerite Rut",saved


In [99]:
df.iloc[5:10, [2, 3, 1]]

Unnamed: 0,pclass,name,survived
5,3rd,"Moran, Mr. James",lost
6,1st,"McCarthy, Mr. Timothy J",lost
7,3rd,"Palsson, Master. Gosta Leonard",lost
8,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",saved
9,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",saved


- df.loc[행, 열]
- df.loc[:, 열]

In [100]:
df['age'].min(), df['age'].max()

(0.42, 80.0)

- 30살 미만은 몇 명인가

In [101]:
df.loc[df['age'] < 30, ['name', 'age']].count()

name    384
age     384
dtype: int64

- 30살 미만인 남자는 몇 명인가

In [102]:
df.loc[(df['age'] < 30) & (df['sex'] == 'male')].count()

passengerid    237
survived       237
pclass         237
name           237
sex            237
age            237
sibsp          237
parch          237
ticket         237
fare           237
cabin           27
embarked       237
dtype: int64

- 나이가 30살 미만이거나 성별이 남자인 경우는 몇 명인가 

In [103]:
df.loc[(df['age'] < 30) | (df['sex'] == 'male')].count()

passengerid    724
survived       724
pclass         724
name           724
sex            724
age            600
sibsp          724
parch          724
ticket         724
fare           724
cabin          145
embarked       724
dtype: int64

### 결측치(Missing Value) 확인 및 처리
**결측치(Missing Value)**<br>
- 데이터가 없는 경우

**결측치 처리 함수**<br>
- `isna()`, `isnull()` 함수: 결측치인지 아닌지 여부(True, False로 표현)
- `dropna()` 함수: 결측치가 포함된 행 삭제
- `fillna()` 함수: 결측치를 특정 값으로 변경

In [104]:
# 결측치 확인
df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

In [105]:
# 나이가 NaN인 데이터
df.loc[df['age'].isna()]['embarked'].value_counts()

S    90
Q    49
C    38
Name: embarked, dtype: int64

1. 항구 탑승자 조사
2. embarked 결측치 채워넣기

In [106]:
# 1. 항구 탑승자 조사
df['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [107]:
# 2. embarked 결측치 채워넣기
df['embarked'] = df['embarked'].fillna('S')

In [108]:
df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         0
dtype: int64

In [109]:
df = df.drop(['cabin', 'ticket', 'passengerid'], axis=1)
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


- Feature Engineering

In [110]:
# sibsp(형제자매/배우자 수) + parch(부모/자녀 수) = 동반 가족 수
df['family'] = df['sibsp'] + df['parch']
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


In [111]:
x = df.copy()
x.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


In [112]:
# 1. 결측치 존재 행 제거
x = x.dropna()
x.isna().sum(axis=0)

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
family      0
dtype: int64

In [113]:
# Median vs. mean on a sample with one outlier (100): the outlier drags the
# mean far above the typical values, while the median barely moves.
x = [1, 2, 3, 4, 5, 100]
(np.median(x), np.mean(x))

(3.5, 19.166666666666668)

In [114]:
# 2. Fill the missing 'age' values with the column mean.
# The filled column is assigned back rather than calling
# fillna(..., inplace=True) on the df['age'] selection: an in-place call on a
# selected column is chained assignment, which newer pandas versions warn
# about and which does not update df under Copy-on-Write.
df['age'] = df['age'].fillna(df['age'].mean())

In [115]:
df.isna().sum(axis=0)

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
family      0
dtype: int64

### 그룹핑 함수
**그룹핑**<br>
- `groupby()`: 특정 컬럼의 값을 기준으로 그룹핑해서 함수를 실행함

**일괄처리 함수**<br>
- `apply()`: 데이터 프레임에 함수를 적용함 (함수에 전달되는 객체는 Series 형식)
- `map()`: 단일 컬럼의 경우 apply() 함수와 같은 기능, 다중 컬럼의 경우 map() 함수를 사용할 수 없음

In [116]:
# .index: MultiIndex
df.groupby(['sex', 'pclass'])['age'].mean()

sex     pclass
female  1st       34.141405
        2nd       28.748661
        3rd       24.068493
male    1st       39.287717
        2nd       30.653908
        3rd       27.372153
Name: age, dtype: float64

- 선실 등급별 남녀 생존자 수

In [117]:
x = df.loc[df['survived'] == 'saved']
x.groupby(['pclass', 'sex'])['survived'].count()

pclass  sex   
1st     female    91
        male      45
2nd     female    70
        male      17
3rd     female    72
        male      47
Name: survived, dtype: int64

- apply() 함수

In [118]:
def myfunc(x):
    """Print the type of the received object; implicitly returns None."""
    kind = type(x)
    print(kind)

In [119]:
# 컬럼 단위로 들어옴
df.apply(myfunc)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


survived    None
pclass      None
name        None
sex         None
age         None
sibsp       None
parch       None
fare        None
embarked    None
family      None
dtype: object

In [120]:
def myfunc(x):
    """Print the received value on its own line; implicitly returns None."""
    text = str(x)
    print(text)

In [121]:
# 데이터 하나씩 들어옴
df['sex'].apply(myfunc)

male
female
female
female
male
male
male
male
female
female
female
female
male
male
female
female
male
male
female
female
male
male
female
male
female
female
male
male
female
male
male
female
female
male
male
male
male
male
female
female
female
female
male
female
female
male
male
female
male
female
male
male
female
female
male
male
female
male
female
male
male
female
male
male
male
male
female
male
female
male
male
female
male
male
male
male
male
male
male
female
male
male
female
male
female
female
male
male
female
male
male
male
male
male
male
male
male
male
female
male
female
male
male
male
male
male
female
male
male
female
male
female
male
female
female
male
male
male
male
female
male
male
male
female
male
male
male
male
female
male
male
male
female
female
male
male
female
male
male
male
female
female
female
male
male
male
male
female
male
male
male
female
male
male
male
male
female
male
male
male
male
female
male
male
male
male
female
female
male
male
male
male
female
male
male
mal

0      None
1      None
2      None
3      None
4      None
       ... 
886    None
887    None
888    None
889    None
890    None
Name: sex, Length: 891, dtype: object

- 인코딩: 문자열 데이터를 수치형 데이터로 변경

In [122]:
def myfunc(x):
    """Encode a sex label as an integer.

    Returns 1 when ``x`` equals 'male' and 0 for every other value.
    """
    return 1 if x == 'male' else 0

In [123]:
df['sex'] = df['sex'].apply(myfunc)
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,S,1
4,lost,3rd,"Allen, Mr. William Henry",1,35.0,0,0,8.05,S,0


In [124]:
def pclass_encoding(x):
    """Encode a passenger-class label as an integer code.

    '1st' -> 0, '2nd' -> 1, '3rd' -> 2. Any other value yields None.
    """
    # The position of each label in the tuple is its integer code.
    for code, label in enumerate(('1st', '2nd', '3rd')):
        if x == label:
            return code
    return None

In [125]:
df['pclass'] = df['pclass'].apply(pclass_encoding)

In [126]:
def survived_encoding(x):
    """Encode a survival label as an integer code.

    'lost' -> 0, 'saved' -> 1. Any other value yields None.
    """
    return 0 if x == 'lost' else (1 if x == 'saved' else None)

In [127]:
df['survived'] = df['survived'].apply(survived_encoding)

In [128]:
def embarked_encoding(x):
    """Encode an embarkation-port code as an integer.

    'C' -> 0, 'Q' -> 1, 'S' -> 2. Any other value yields None.
    """
    ports = ('C', 'Q', 'S')
    codes = (0, 1, 2)
    for port, code in zip(ports, codes):
        if x == port:
            return code
    return None

In [129]:
df['embarked'] = df['embarked'].apply(embarked_encoding)

In [130]:
df = df.drop(['name'], axis=1)

In [131]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,family
0,0,2,1,22.0,1,0,7.25,2,1
1,1,0,0,38.0,1,0,71.2833,0,1
2,1,2,0,26.0,0,0,7.925,2,0
3,1,0,0,35.0,1,0,53.1,2,1
4,0,2,1,35.0,0,0,8.05,2,0


- 연속형 데이터
- 범주형 데이터

In [132]:
my_arr = df.values

- 데이터 저장하기

In [133]:
np.savez('titanic.npz', arr=my_arr)

- 데이터 불러오기

In [134]:
data = np.load('titanic.npz')

In [135]:
data['arr']

array([[ 0.    ,  2.    ,  1.    , ...,  7.25  ,  2.    ,  1.    ],
       [ 1.    ,  0.    ,  0.    , ..., 71.2833,  0.    ,  1.    ],
       [ 1.    ,  2.    ,  0.    , ...,  7.925 ,  2.    ,  0.    ],
       ...,
       [ 0.    ,  2.    ,  0.    , ..., 23.45  ,  2.    ,  3.    ],
       [ 1.    ,  0.    ,  1.    , ..., 30.    ,  0.    ,  0.    ],
       [ 0.    ,  2.    ,  1.    , ...,  7.75  ,  1.    ,  0.    ]])