## 02_pandas_tutorial

# 판다스
---

[판다스](https://pandas.pydata.org/)

In [1]:
# 커널 명령문 실행

In [2]:
# !pip install pandas --upgrade

In [3]:
import pandas as pd
import numpy as np

pd.__version__

'1.5.2'

In [4]:
# series, dataframe

## Series

In [5]:
# Build a Series from a plain list. No index is given, so pandas assigns the
# default 0..n-1 RangeIndex; `name` labels the Series itself.
sr = pd.Series(data=[1, 2, 3, 4, 5], name='Apple')
sr

0    1
1    2
2    3
3    4
4    5
Name: Apple, dtype: int64

In [6]:
sr.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
sr.name

'Apple'

In [8]:
sr.dtype

dtype('int64')

In [9]:
sr.shape

(5,)

In [10]:
type(sr)

pandas.core.series.Series

In [11]:
sr[1]

2

In [12]:
sr[1:3]

1    2
2    3
Name: Apple, dtype: int64

In [13]:
# Same values, but with explicit string labels as the index so elements can
# be looked up by label (e.g. sr['b']).
sr = pd.Series(
    data=[1, 2, 3, 4, 5],
    index=list('abcde'),
    name='Apple',
)
sr

a    1
b    2
c    3
d    4
e    5
Name: Apple, dtype: int64

In [14]:
sr['b']

2

In [15]:
sr.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [16]:
sr.values # numpy ndarray type

array([1, 2, 3, 4, 5], dtype=int64)

In [17]:
type(sr.values)

numpy.ndarray

In [18]:
sr.to_numpy()

array([1, 2, 3, 4, 5], dtype=int64)

In [19]:
sr.reset_index() # DataFrame

Unnamed: 0,index,Apple
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [20]:
type(sr.reset_index())

pandas.core.frame.DataFrame

In [21]:
# np.nan marks a missing entry; the resulting Series is float64.
sr = pd.Series(data=[1, np.nan, 2, 3, np.nan, 4, 5])
sr

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64

In [22]:
# fancy indexing
sr[[1, 2, 4]]

1    NaN
2    2.0
4    NaN
dtype: float64

In [23]:
# boolean indexing
idx = [False, True, True, False, True, False, False]
sr[idx]

1    NaN
2    2.0
4    NaN
dtype: float64

In [24]:
# 조건
sr[sr > 3]

5    4.0
6    5.0
dtype: float64

In [25]:
# 결측치, 이상치 처리

In [26]:
# 결측치 (Missing Value)
# isna(), isnull()
sr[sr.isna()] # np.nan = True

1   NaN
4   NaN
dtype: float64

In [27]:
sr.isnull()

0    False
1     True
2    False
3    False
4     True
5    False
6    False
dtype: bool

In [28]:
sr.isna().sum() # True(1), False(0), True의 개수

2

In [29]:
x = sr.copy()
x[x.isna()] = x.mean()

In [30]:
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [31]:
y = sr.copy()
y = y.dropna() # 변수에 넣는 경우

In [32]:
y

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [33]:
z = sr.copy()
z.dropna(inplace=True) # inplace=True를 사용하는 경우

In [34]:
z

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [35]:
x = sr.copy()
x[x.isna()] = x.mean()
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [36]:
w = sr.copy()
w = w.fillna(w.mean())
w

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

### 슬라이싱

In [37]:
# Scores keyed by person name: the dict keys become the index labels, in
# insertion order.
sr = pd.Series({'john': 10, 'eva': 15, 'james': 12, 'liam': 17, 'zoe': 13})
sr

john     10
eva      15
james    12
liam     17
zoe      13
dtype: int64

In [38]:
sr[1:4]

eva      15
james    12
liam     17
dtype: int64

In [39]:
sr['eva':'liam']

eva      15
james    12
liam     17
dtype: int64

In [40]:
sr[2:]

james    12
liam     17
zoe      13
dtype: int64

In [41]:
sr[0:-1]

john     10
eva      15
james    12
liam     17
dtype: int64

In [42]:
sr[:]

john     10
eva      15
james    12
liam     17
zoe      13
dtype: int64

In [43]:
sr[::-1]

zoe      13
liam     17
james    12
eva      15
john     10
dtype: int64

In [44]:
# opencv: BGR => RGB

In [45]:
sr.sort_values() # 오름차순

john     10
james    12
zoe      13
eva      15
liam     17
dtype: int64

In [46]:
sr.sort_values(ascending=False) # 내림차순

liam     17
eva      15
zoe      13
james    12
john     10
dtype: int64

In [47]:
sr.sort_values(ascending=False)[:3] # Top 3

liam    17
eva     15
zoe     13
dtype: int64

In [48]:
sr.sort_index() # 이름순

eva      15
james    12
john     10
liam     17
zoe      13
dtype: int64

## DataFrame

In [49]:
# https://github.com/devdio/datasets

In [50]:
# Row-oriented records: one inner list per student.
doc = [
    ['Joe', 20, 85.10, 'A', 'Swimming'],
    ['Nat', 21, 77.80, 'B', 'Reading'],
    ['Harry', 19, 91.54, 'A', 'Music'],
    ['Sam', 20, 88.78, 'A', 'Painting'],
    ['Monica', 22, 60.55, 'B', 'Dancing'],
]

# Column labels and row labels used when `doc` is turned into a DataFrame.
c_name = ['Name', 'Age', 'Marks', 'Grade', 'Hobby']
idx = [f's{i}' for i in range(1, 6)]

In [51]:
df = pd.DataFrame(doc, columns=c_name, index=idx)
df.shape

(5, 5)

In [52]:
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swimming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


In [53]:
# Column-oriented records: each key is a column name mapped to that column's
# values, one entry per student.
doc = {
    'Name': ['Joe', 'Nat', 'Harry', 'Sam', 'Monica'],
    'Age': [20, 21, 19, 20, 22],
    'Marks': [85.10, 77.80, 91.54, 88.78, 60.55],
    'Grade': ['A', 'B', 'A', 'A', 'B'],
    'Hobby': ['Swmming', 'Reading', 'Music', 'Painting', 'Dancing'],
}

In [54]:
df = pd.DataFrame(doc)
df.shape

(5, 5)

In [55]:
df.head(3)

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music


In [56]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [57]:
df.dtypes # object 문자열

Name      object
Age        int64
Marks    float64
Grade     object
Hobby     object
dtype: object

In [58]:
x = df.dtypes
x

Name      object
Age        int64
Marks    float64
Grade     object
Hobby     object
dtype: object

In [59]:
x['Name']

dtype('O')

In [60]:
df.columns[[0, 2, 3]]

Index(['Name', 'Marks', 'Grade'], dtype='object')

In [61]:
df[df.columns[[0, 2, 3]]]

Unnamed: 0,Name,Marks,Grade
0,Joe,85.1,A
1,Nat,77.8,B
2,Harry,91.54,A
3,Sam,88.78,A
4,Monica,60.55,B


In [62]:
df[['Name', 'Marks', 'Grade']]

Unnamed: 0,Name,Marks,Grade
0,Joe,85.1,A
1,Nat,77.8,B
2,Harry,91.54,A
3,Sam,88.78,A
4,Monica,60.55,B


In [63]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   5 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


In [65]:
# Same table as before, with np.nan standing in for one missing name and two
# missing marks so the missing-value handling can be inspected.
doc = {
    'Name': ['Joe', np.nan, 'Harry', 'Sam', 'Monica'],
    'Age': [20, 21, 19, 20, 22],
    'Marks': [85.10, 77.80, np.nan, np.nan, 60.55],
    'Grade': ['A', 'B', 'A', 'A', 'B'],
    'Hobby': ['Swmming', 'Reading', 'Music', 'Painting', 'Dancing'],
}

In [66]:
df = pd.DataFrame(doc)

In [67]:
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [68]:
df.info() # 전체적인 데이터프레임의 정보

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   3 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


### 컬럼 다루기

In [69]:
# Complete (no missing values) student table used for the column-handling
# examples that follow.
doc = dict(
    Name=['Joe', 'Nat', 'Harry', 'Sam', 'Monica'],
    Age=[20, 21, 19, 20, 22],
    Marks=[85.10, 77.80, 91.54, 88.78, 60.55],
    Grade=['A', 'B', 'A', 'A', 'B'],
    Hobby=['Swmming', 'Reading', 'Music', 'Painting', 'Dancing'],
)

In [70]:
df = pd.DataFrame(doc)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [71]:
df['Name']

0       Joe
1       Nat
2     Harry
3       Sam
4    Monica
Name: Name, dtype: object

In [72]:
df[['Name', 'Age']]

Unnamed: 0,Name,Age
0,Joe,20
1,Nat,21
2,Harry,19
3,Sam,20
4,Monica,22


In [73]:
df[['Age']] # dataframe

Unnamed: 0,Age
0,20
1,21
2,19
3,20
4,22


In [74]:
df['Age'] # series

0    20
1    21
2    19
3    20
4    22
Name: Age, dtype: int64

In [75]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

In [76]:
df.columns = ['Name', 'Age', 'Score', 'Grade', 'Hobby']

In [77]:
df

Unnamed: 0,Name,Age,Score,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [78]:
df.rename(columns={'Score':'Marks', 'Hobby':'etc'})

Unnamed: 0,Name,Age,Marks,Grade,etc
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [79]:
df

Unnamed: 0,Name,Age,Score,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


### 파일 입출력

In [80]:
# !wget https://github.com/devdio/datasets/blob/main/doc.csv

In [81]:
# !pip install gdown

In [82]:
import gdown

In [83]:
# github->Raw->url복사

In [84]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc.csv

In [85]:
pd.read_csv('doc.csv')

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [86]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_idx.csv

In [87]:
df = pd.read_csv('doc_idx.csv', index_col=0)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swmming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


In [88]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_na.csv

In [89]:
df = pd.read_csv('doc_na.csv', 
                 index_col=0, 
                 na_values=['?', '*', '-'])
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20.0,,A,Swmming
s2,Nat,21.0,77.8,B,Reading
s3,Harry,,91.54,A,Music
s4,Sam,20.0,88.78,A,
s5,Monica,22.0,60.55,B,Dancing


In [90]:
# https://www.kaggle.com/competitions/titanic

In [91]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/titanic.csv

In [92]:
titanic = pd.read_csv('titanic.csv')
titanic.shape

(891, 12)

In [93]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [94]:
df = titanic.copy() # 원본 복사

In [95]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [96]:
# Normalise every column label to lower case (later cells refer to columns by
# their lower-case names), then preview the result.
df.columns = df.columns.str.lower()
df.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [97]:
df.info() # age, cabin, embarked: 결측치 존재

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    object 
 2   pclass       891 non-null    object 
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 83.7+ KB


In [98]:
df.describe() # 통계적 수치, 최대값, 최소값

Unnamed: 0,passengerid,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,446.0,29.699118,0.523008,0.381594,32.204208
std,257.353842,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,223.5,20.125,0.0,0.0,7.9104
50%,446.0,28.0,0.0,0.0,14.4542
75%,668.5,38.0,1.0,0.0,31.0
max,891.0,80.0,8.0,6.0,512.3292


In [99]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passengerid,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [100]:
df.head(3)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [101]:
df['embarked'].unique() # distinct embarkation-port codes; nan shows up because the column has missing values

array(['S', 'C', 'Q', nan], dtype=object)

In [102]:
df['embarked'].value_counts() # frequency of each category; useful for categorical (non-continuous) columns

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [103]:
df['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [104]:
df['survived'].value_counts()

lost     549
saved    342
Name: survived, dtype: int64

In [105]:
df['pclass'].value_counts()

3rd    491
1st    216
2nd    184
Name: pclass, dtype: int64

In [106]:
# 결측치 개수 계산
df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

### 인덱싱, 슬라이싱
- loc
- iloc

In [107]:
df.loc[5, 'pclass'] # 인덱싱

'3rd'

In [108]:
df.loc[5:10, 'pclass']

5     3rd
6     1st
7     3rd
8     3rd
9     2nd
10    3rd
Name: pclass, dtype: object

In [109]:
df.loc[5:10, ['pclass', 'name', 'survived']]

Unnamed: 0,pclass,name,survived
5,3rd,"Moran, Mr. James",lost
6,1st,"McCarthy, Mr. Timothy J",lost
7,3rd,"Palsson, Master. Gosta Leonard",lost
8,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",saved
9,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",saved
10,3rd,"Sandstrom, Miss. Marguerite Rut",saved


In [110]:
df.iloc[5:10, [2, 3, 1]]

Unnamed: 0,pclass,name,survived
5,3rd,"Moran, Mr. James",lost
6,1st,"McCarthy, Mr. Timothy J",lost
7,3rd,"Palsson, Master. Gosta Leonard",lost
8,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",saved
9,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",saved


In [111]:
# 조건
# df.loc[행, 열]
# df.loc[:, 열]

In [112]:
df['age'].min(), df['age'].max()

(0.42, 80.0)

In [113]:
# How many passengers are younger than 30. The comparison is a strict `<`, so
# age 30 itself is excluded, and rows whose age is missing never match.
df.loc[df['age'] < 30, ['name', 'age']].count()

name    384
age     384
dtype: int64

In [114]:
df.loc[(df['age'] < 30) & (df['sex'] == 'male')].count() # &는 and

passengerid    237
survived       237
pclass         237
name           237
sex            237
age            237
sibsp          237
parch          237
ticket         237
fare           237
cabin           27
embarked       237
dtype: int64

In [115]:
df.loc[(df['age'] < 30) | (df['sex'] == 'male')].count() # |은 or

passengerid    724
survived       724
pclass         724
name           724
sex            724
age            600
sibsp          724
parch          724
ticket         724
fare           724
cabin          145
embarked       724
dtype: int64

In [116]:
# 결측치 처리
df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

In [117]:
# Embarkation-port distribution of the passengers whose age is missing; rows
# and column are selected in one .loc call instead of two chained lookups.
df.loc[df['age'].isna(), 'embarked'].value_counts()

S    90
Q    49
C    38
Name: embarked, dtype: int64

- 1. 항구 탑승자 조사
- 2. embarked 결측치 채워넣기

In [118]:
# 1. 항구 탑승자 조사
df['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [119]:
# 2. embarked 결측치 채워넣기
df['embarked'] = df['embarked'].fillna('S')

In [120]:
df.isna().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         0
dtype: int64

In [121]:
# Remove the cabin, ticket and passengerid columns, then preview the
# remaining columns.
df = df.drop(columns=['cabin', 'ticket', 'passengerid'])
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [122]:
# feature engineering

In [123]:
# sibsp: number of siblings / spouses aboard
# parch: number of parents / children aboard
# (definitions per the Kaggle Titanic data dictionary)
df['family'] = df['sibsp'] + df['parch']
df # family is the sum of both counts; a value above 0 means relatives were aboard

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0


In [124]:
x = df.copy()
x.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


In [125]:
# 1. 결측치 존재하는 행 제거
x = x.dropna()
x.isna().sum()

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
family      0
dtype: int64

In [126]:
# A single outlier (100) pulls the mean far above the median, which is why
# the median can be the safer fill value for skewed data.
x = [1, 2, 3, 4, 5, 100]
(np.median(x), np.mean(x))

(3.5, 19.166666666666668)

In [127]:
# 2. Fill the missing ages with the column mean (the median is an alternative
#    that is less sensitive to outliers).
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df['age'] selection: the in-place form acts
# on an intermediate object, is deprecated in recent pandas, and leaves `df`
# unchanged once copy-on-write is enabled.
df['age'] = df['age'].fillna(df['age'].mean())

In [128]:
df.isna().sum(axis=0)

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
family      0
dtype: int64

### 그룹함수
- map() : 컬럼 단위로
- apply()

- groupby()

In [129]:
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.000000,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.000000,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.000000,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.000000,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.000000,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.000000,0,0,30.0000,C,0


In [136]:
df.groupby(['sex', 'pclass'])['age'].mean() #.index: MultiIndex

sex     pclass
female  1st       34.141405
        2nd       28.748661
        3rd       24.068493
male    1st       39.287717
        2nd       30.653908
        3rd       27.372153
Name: age, dtype: float64

In [140]:
# 선실 등급별 남녀 생존자 수
x = df.loc[df['survived']=='saved']
x.groupby(['pclass', 'sex'])['survived'].count()

pclass  sex   
1st     female    91
        male      45
2nd     female    70
        male      17
3rd     female    72
        male      47
Name: survived, dtype: int64

- apply() 함수

In [141]:
def myfunc(x):
    """Print the type of the object that ``apply`` hands over; returns None."""
    received_type = type(x)
    print(received_type)
    return None

In [142]:
# 인코딩: 문자로 된 데이터를 숫자로 바꿔줌

In [143]:
df.apply(myfunc) # 컬럼 단위로 들어옴

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


survived    None
pclass      None
name        None
sex         None
age         None
sibsp       None
parch       None
fare        None
embarked    None
family      None
dtype: object

In [144]:
def myfunc(x):
    """Echo the received value to stdout; returns None."""
    print(x)
    return None

In [145]:
# df['sex'].apply(myfunc) # 데이터 하나씩

In [146]:
def myfunc(x):
    """Encode a sex label as an integer: 1 for 'male', 0 for any other value."""
    return 1 if x == 'male' else 0

In [147]:
# Series.apply calls myfunc once per element and the result overwrites the column.
# NOTE: not idempotent - myfunc returns 0 for anything other than 'male', so
# re-running this cell once the column is already encoded turns every value into 0.
df['sex'] = df['sex'].apply(myfunc) # one value at a time
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",1,22.0,1,0,7.25,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",0,26.0,0,0,7.925,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1,S,1
4,lost,3rd,"Allen, Mr. William Henry",1,35.0,0,0,8.05,S,0


In [148]:
def pclass_encoding(x):
    """Map a cabin-class label to an integer code.

    '1st' -> 0, '2nd' -> 1, '3rd' -> 2. Any other value yields None.
    """
    for code, label in enumerate(('1st', '2nd', '3rd')):
        if x == label:
            return code
    return None

In [149]:
df['pclass'] = df['pclass'].apply(pclass_encoding)

In [150]:
def survived_encoding(x):
    """Map a survival label to an integer code.

    'lost' -> 0, 'saved' -> 1. Any other value yields None.
    """
    if x == 'lost':
        return 0
    return 1 if x == 'saved' else None

In [151]:
df['survived'] = df['survived'].apply(survived_encoding)

In [152]:
def embarked_encoding(x):
    """Map an embarkation-port code to an integer.

    'C' -> 0, 'Q' -> 1, 'S' -> 2. Any other value yields None.
    """
    ports = ('C', 'Q', 'S')
    matches = [code for code, port in enumerate(ports) if x == port]
    return matches[0] if matches else None

In [153]:
df['embarked'] = df['embarked'].apply(embarked_encoding)

In [154]:
# Drop the passenger-name column, which was not encoded above.
df = df.drop(columns=['name'])

In [155]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,family
0,0,2,1,22.000000,1,0,7.2500,2,1
1,1,0,0,38.000000,1,0,71.2833,0,1
2,1,2,0,26.000000,0,0,7.9250,2,0
3,1,0,0,35.000000,1,0,53.1000,2,1
4,0,2,1,35.000000,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...,...
886,0,1,1,27.000000,0,0,13.0000,2,0
887,1,0,0,19.000000,0,0,30.0000,2,0
888,0,2,0,29.699118,1,2,23.4500,2,3
889,1,0,1,26.000000,0,0,30.0000,0,0


In [156]:
# 연속형 데이터
# 범주형 데이터(Categorical data)

In [168]:
# Export the frame's contents as a NumPy array (same accessor style as the
# earlier sr.to_numpy() example).
my_arr = df.to_numpy()

In [171]:
# 데이터 저장하기
np.savez('titanic.npz', arr=my_arr)

In [173]:
data = np.load('titanic.npz')

In [175]:
data['arr']

array([[ 0.    ,  2.    ,  1.    , ...,  7.25  ,  2.    ,  1.    ],
       [ 1.    ,  0.    ,  0.    , ..., 71.2833,  0.    ,  1.    ],
       [ 1.    ,  2.    ,  0.    , ...,  7.925 ,  2.    ,  0.    ],
       ...,
       [ 0.    ,  2.    ,  0.    , ..., 23.45  ,  2.    ,  3.    ],
       [ 1.    ,  0.    ,  1.    , ..., 30.    ,  0.    ,  0.    ],
       [ 0.    ,  2.    ,  1.    , ...,  7.75  ,  1.    ,  0.    ]])