# 판다스(pandas)
---

In [242]:
# !pip install pandas

In [243]:
import pandas as pd
import numpy as np
pd.__version__

'1.5.2'

![](https://img1.daumcdn.net/thumb/R1280x0/?scode=mtistory2&fname=https%3A%2F%2Fblog.kakaocdn.net%2Fdn%2FN4NAc%2FbtqRoP1ml8o%2Fx4DD7ITezrXVcJKgEYR5a1%2Fimg.png)

## Series

In [244]:
sr = pd.Series([1, 2, 3, 4, 5])
print(sr, "\n")

sr = pd.Series([1, 2, 3, 4, 5], name="Apple")
print(sr, "\n")

0    1
1    2
2    3
3    4
4    5
dtype: int64 

0    1
1    2
2    3
3    4
4    5
Name: Apple, dtype: int64 



In [245]:
print(sr.index)
print(sr.dtype)
print(sr.shape)
print(type(sr))

RangeIndex(start=0, stop=5, step=1)
int64
(5,)
<class 'pandas.core.series.Series'>


In [246]:
print(sr[1], "\n")
print(sr[1:3])

2 

1    2
2    3
Name: Apple, dtype: int64


In [247]:
sr = pd.Series([1, 2, 3, 4, 5], name="Apple", index=["a", "b", "c", "d", "e"])
print(sr, "\n")
print(sr["c"], "\n")
print(sr[2], "\n")
print(sr.index)

a    1
b    2
c    3
d    4
e    5
Name: Apple, dtype: int64 

3 

3 

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


In [248]:
print(sr.values, "\n")   # 인덱스는 제외하고 값만 가져옴
print(sr.to_numpy())

[1 2 3 4 5] 

[1 2 3 4 5]


In [249]:
sr.reset_index()

Unnamed: 0,index,Apple
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [250]:
sr = pd.Series([1, np.nan, 2, 3, np.nan, 4, 5])
print(sr)
sr[[1, 2, 4]]

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64


1    NaN
2    2.0
4    NaN
dtype: float64

In [251]:
idx = [False, True, True, False, True, False, False]
sr[idx]

1    NaN
2    2.0
4    NaN
dtype: float64

In [252]:
sr[sr > 3]

5    4.0
6    5.0
dtype: float64

## 결측치(Missing Value)

In [253]:
# 결측치 확인
sr[sr.isna()]   # sr.isnull()

1   NaN
4   NaN
dtype: float64

In [254]:
# 결측치가 몇개인지 확인
sr.isna().sum()

2

##### 결측치는 없애거나 값을 변경해서 넣야함

In [255]:
sr = pd.Series([1, np.nan, 2, 3, np.nan, 4, 5])
x = sr.copy()
x[x.isna()]

1   NaN
4   NaN
dtype: float64

In [256]:
x[x.isna()] = x.mean()   # 결측치에 나머지 값들의 평균을 넣음
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [257]:
w = sr.copy()
w = w.fillna(w.mean())
w

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [258]:
sr.dropna()
print(sr, "\n")   # 원본이 변하지 않음
y = sr.copy()
y = y.dropna()   # 원본이 변하지 않기때문에 별도의 변수에 저장해야함
print(y, "\n")
z = sr.copy()
z.dropna(inplace=True)   # 원본이 변하는 옵션을 지정
print(z, "\n")

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64 

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64 

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64 



## 슬라이싱

In [259]:
sr = pd.Series([10, 15, 12, 17, 13], index=["john", "eva", "james", "liam", "zoe"])
sr  

john     10
eva      15
james    12
liam     17
zoe      13
dtype: int64

In [260]:
print(sr[1:4], "\n")
print(sr["eva":"liam"], "\n")
print(sr[2:], "\n")
print(sr[:], "\n")
print(sr[::-1], "\n")

eva      15
james    12
liam     17
dtype: int64 

eva      15
james    12
liam     17
dtype: int64 

james    12
liam     17
zoe      13
dtype: int64 

john     10
eva      15
james    12
liam     17
zoe      13
dtype: int64 

zoe      13
liam     17
james    12
eva      15
john     10
dtype: int64 



In [261]:
print(sr.sort_values(), "\n")   # 값 정렬
print(sr.sort_values(ascending=False), "\n")   # 값 역순으로 정렬
print(sr.sort_values(ascending=False)[:3], "\n")   # 값 역순으로 정렬하고 3위까지 출력

john     10
james    12
zoe      13
eva      15
liam     17
dtype: int64 

liam     17
eva      15
zoe      13
james    12
john     10
dtype: int64 

liam    17
eva     15
zoe     13
dtype: int64 



In [262]:
print(sr.sort_index(), "\n")   # 인덱스를 정렬하고 출력

eva      15
james    12
john     10
liam     17
zoe      13
dtype: int64 



## DataFrame

In [263]:
doc = [['Joe', 20, 85.10, 'A', 'Swimming'],
        ['Nat', 21, 77.80, 'B', 'Reading'],
        ['Harry', 19, 91.54, 'A', 'Music'],
        ['Sam', 20, 88.78, 'A', 'Painting'],
        ['Monica', 22, 60.55, 'B', 'Dancing']]

c_name = ['Name', 'Age', 'Marks', 'Grade', 'Hobby']
idx = ['s1', 's2', 's3', 's4', 's5']

In [264]:
df = pd.DataFrame(doc)
print(df.shape, "\n")
print(df.head(), "\n")   # 가장 위에 있는 5개의 데이터 확인
df

(5, 5) 

        0   1      2  3         4
0     Joe  20  85.10  A  Swimming
1     Nat  21  77.80  B   Reading
2   Harry  19  91.54  A     Music
3     Sam  20  88.78  A  Painting
4  Monica  22  60.55  B   Dancing 



Unnamed: 0,0,1,2,3,4
0,Joe,20,85.1,A,Swimming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [265]:
df = pd.DataFrame(doc, columns=c_name, index=idx)   # 열 이름 지정, 인덱스 이름 지정
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swimming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


In [266]:
doc = {'Name' :['Joe','Nat','Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, 91.54, 88.78, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}

In [267]:
df = pd.DataFrame(doc)
print(df.shape, "\n")
print(df.head(3), "\n")   # head의 기본값은 5
df

(5, 5) 

    Name  Age  Marks Grade    Hobby
0    Joe   20  85.10     A  Swmming
1    Nat   21  77.80     B  Reading
2  Harry   19  91.54     A    Music 



Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [268]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [269]:
df.dtypes   # object는 문자열을 의미

Name      object
Age        int64
Marks    float64
Grade     object
Hobby     object
dtype: object

In [270]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

In [271]:
df.columns[[0, 2, 3]]   # 0번, 2번, 3번째 칼럼 확인하기

Index(['Name', 'Marks', 'Grade'], dtype='object')

In [272]:
df[df.columns[[0, 2, 3]]]   # 0번, 2번, 3번째 칼럼 가져오기

Unnamed: 0,Name,Marks,Grade
0,Joe,85.1,A
1,Nat,77.8,B
2,Harry,91.54,A
3,Sam,88.78,A
4,Monica,60.55,B


In [273]:
df[["Marks", "Grade", "Name"]]   # "Marks", "Grade", "Name" 칼럼 가져오기

Unnamed: 0,Marks,Grade,Name
0,85.1,A,Joe
1,77.8,B,Nat
2,91.54,A,Harry
3,88.78,A,Sam
4,60.55,B,Monica


In [274]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   5 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


In [275]:
doc = {'Name' :['Joe', np.nan,'Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, np.nan, np.nan, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}

In [276]:
df = pd.DataFrame(doc)
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [277]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   3 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


## 칼럼 다루기

In [278]:
doc = {'Name' :['Joe','Nat','Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, 91.54, 88.78, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}
df = pd.DataFrame(doc)
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [279]:
df["Name"]   # 칼럼 1개만 가져오면 Series 타입

0       Joe
1       Nat
2     Harry
3       Sam
4    Monica
Name: Name, dtype: object

In [280]:
df[["Name", "Age"]]

Unnamed: 0,Name,Age
0,Joe,20
1,Nat,21
2,Harry,19
3,Sam,20
4,Monica,22


In [281]:
df[["Age"]]   # DataFrame 형태로 리턴

Unnamed: 0,Age
0,20
1,21
2,19
3,20
4,22


In [282]:
df.columns

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')

In [283]:
df.columns = ["Name", "Age", "Score", "Grade", "Hobby"]   # 칼럼 이름 변경 방법 첫번째
df

Unnamed: 0,Name,Age,Score,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


In [284]:
df.rename(columns={"Score": "Marks", "Hobby": "ETC"})   # 칼럼 이름 변경 방법 두번째(바꾸고 싶은 이름을 키로, 바꿀 이름을 밸류로)

Unnamed: 0,Name,Age,Marks,Grade,ETC
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


## 파일 입출력

In [285]:
# !pip install gdown

In [286]:
!gdown https://raw.githubusercontent.com/devdio/datasets/main/doc.csv

Downloading...
From: https://raw.githubusercontent.com/devdio/datasets/main/doc.csv
To: C:\workspace\flyai\doc.csv

  0%|          | 0.00/140 [00:00<?, ?B/s]
144B [00:00, ?B/s]                       


In [287]:
df = pd.read_csv("doc.csv")
df.shape

(5, 5)

In [288]:
df.head(3)

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music


In [289]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_idx.csv

In [290]:
df = pd.read_csv("doc_idx.csv")
df

Unnamed: 0.1,Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,s1,Joe,20,85.1,A,Swmming
1,s2,Nat,21,77.8,B,Reading
2,s3,Harry,19,91.54,A,Music
3,s4,Sam,20,88.78,A,Painting
4,s5,Monica,22,60.55,B,Dancing


In [291]:
df = pd.read_csv("doc_idx.csv", index_col=0)   # 0번째 칼럼을 인덱스로 읽겠다는 의미
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swmming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


In [292]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_na.csv

In [293]:
df = pd.read_csv("doc_na.csv")
df

Unnamed: 0.1,Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,s1,Joe,20,?,A,Swmming
1,s2,Nat,21,77.8,B,Reading
2,s3,Harry,?,91.54,A,Music
3,s4,Sam,20,88.78,A,?
4,s5,Monica,22,60.55,B,Dancing


In [294]:
df = pd.read_csv("doc_na.csv", index_col=0, na_values=["?", "*", " "])   # []안에 있는 요소들은 결측치로 처리해라라는 의미
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20.0,,A,Swmming
s2,Nat,21.0,77.8,B,Reading
s3,Harry,,91.54,A,Music
s4,Sam,20.0,88.78,A,
s5,Monica,22.0,60.55,B,Dancing


In [295]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/titanic.csv

In [296]:
titanic = pd.read_csv("titanic.csv")
titanic.shape

(891, 12)

In [297]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [298]:
df = titanic.copy()
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [299]:
df.columns = [c.lower() for c in df.columns]   # df의 칼럼들을 소문자로 변환
df

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [300]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    object 
 2   pclass       891 non-null    object 
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 83.7+ KB


In [301]:
df.describe()   # Dtype이 숫자인 경우 통계량 출력

Unnamed: 0,passengerid,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,446.0,29.699118,0.523008,0.381594,32.204208
std,257.353842,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,223.5,20.125,0.0,0.0,7.9104
50%,446.0,28.0,0.0,0.0,14.4542
75%,668.5,38.0,1.0,0.0,31.0
max,891.0,80.0,8.0,6.0,512.3292


In [302]:
df.describe().T   # df.describe()의 전치로 출력

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passengerid,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [303]:
df["embarked"].unique()   # embarked 칼럼의 unique 값 확인

array(['S', 'C', 'Q', nan], dtype=object)

In [304]:
df["embarked"].value_counts()   # embarked 칼럼의 각 값이 몇개있는지 출력

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [305]:
df["sex"].value_counts()   # sex 칼럼의 각 값이 몇개있는지 출력

male      577
female    314
Name: sex, dtype: int64

In [306]:
print(df["pclass"].value_counts(), "\n")
print(df["survived"].value_counts())

3rd    491
1st    216
2nd    184
Name: pclass, dtype: int64 

lost     549
saved    342
Name: survived, dtype: int64


## 결측치 개수 계산

In [307]:
df.isnull().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

## 인덱싱, 슬라이싱
- loc
- iloc

In [308]:
df.loc[5, "pclass"]   # pclass 칼럼의 5번재 행 읽어오기

'3rd'

In [309]:
df.loc[5:10, "pclass"]   # pclass 칼럼의 5~10번째 행 읽어오기

5     3rd
6     1st
7     3rd
8     3rd
9     2nd
10    3rd
Name: pclass, dtype: object

In [310]:
df.loc[5:10, ["pclass", "survived", "name"]]   # pclass, name, survived 칼럼의 5~10번째 행 읽어오기

Unnamed: 0,pclass,survived,name
5,3rd,lost,"Moran, Mr. James"
6,1st,lost,"McCarthy, Mr. Timothy J"
7,3rd,lost,"Palsson, Master. Gosta Leonard"
8,3rd,saved,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
9,2nd,saved,"Nasser, Mrs. Nicholas (Adele Achem)"
10,3rd,saved,"Sandstrom, Miss. Marguerite Rut"


In [311]:
df.iloc[5:10, [0, 3, 5]]   # 0번째, 3번째, 5번째 칼럼의 5~10번째 행 읽어오기

Unnamed: 0,passengerid,name,age
5,6,"Moran, Mr. James",
6,7,"McCarthy, Mr. Timothy J",54.0
7,8,"Palsson, Master. Gosta Leonard",2.0
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0
9,10,"Nasser, Mrs. Nicholas (Adele Achem)",14.0


In [312]:
# 조건
print(df["age"].min(), df["age"].max(), "\n\n")
print(df.loc[df["age"] < 30, :], "\n\n")
print(df.loc[df["age"] < 30, ["name", "age"]], "\n\n")
print(df.loc[df["age"] < 30, ["name", "age"]].count(), "\n\n")
print(df.loc[(df["age"] < 30) & (df["sex"] == "male")].count())


0.42 80.0 


     passengerid survived pclass  \
0              1     lost    3rd   
2              3    saved    3rd   
7              8     lost    3rd   
8              9    saved    3rd   
9             10    saved    2nd   
..           ...      ...    ...   
883          884     lost    2nd   
884          885     lost    3rd   
886          887     lost    2nd   
887          888    saved    1st   
889          890    saved    1st   

                                                  name     sex   age  sibsp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
7                       Palsson, Master. Gosta Leonard    male   2.0      3   
8    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)  female  27.0      0   
9                  Nasser, Mrs. Nicholas (Adele Achem)  female  14.0      1   
..                                                 ...     ...   ...    ...   
8

## 결측치 채우기

In [313]:
df["age"].isna()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: age, Length: 891, dtype: bool

In [314]:
df.loc[df["age"].isna()]   # age가 NaN인 데이터만 추출

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
5,6,lost,3rd,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,saved,2nd,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,saved,3rd,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,lost,3rd,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,saved,3rd,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,lost,3rd,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,lost,3rd,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,lost,3rd,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,lost,3rd,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [315]:
df.loc[df["age"].isna()]["embarked"].value_counts()

S    90
Q    49
C    38
Name: embarked, dtype: int64

In [316]:
# 1. 어느 항구(embarked) 탑승자가 많은지 조사
# 2. embarked 결측치 채워넣기
print(df.isna().sum(axis=0), "\n\n")
print(df["embarked"].value_counts(), "\n\n")
df["embarked"] = df["embarked"].fillna("S")   # 원본은 안바뀌기 때문에 별도로 저장
print(df["embarked"].isna().sum())

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64 


S    644
C    168
Q     77
Name: embarked, dtype: int64 


0


## 칼럼 없애는 방법

In [317]:
df = df.drop(["cabin", "ticket", "passengerid"], axis=1)   # cabin, ticket 칼럼 삭제
print(df.head(3))   # 원본은 안바뀜

  survived pclass                                               name     sex  \
0     lost    3rd                            Braund, Mr. Owen Harris    male   
1    saved    1st  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   
2    saved    3rd                             Heikkinen, Miss. Laina  female   

    age  sibsp  parch     fare embarked  
0  22.0      1      0   7.2500        S  
1  38.0      1      0  71.2833        C  
2  26.0      0      0   7.9250        S  


## 칼럼 추가

In [318]:
# df["family"] = 0   # family 칼럼의 모든 행에 0을 삽입
df["family"] = df["sibsp"] + df["parch"]
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0


In [319]:
x = df.copy()
x.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0


In [320]:
# age가 결측치인 모든 행을 삭제하는 방법
# x = x.dropna()

# age 중앙값을 결측치에 넣는 방법
df["age"] = df["age"].fillna(df["age"].median())   # inplace 옵션 써도 됨
df.isna().sum(axis=0)

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
family      0
dtype: int64

## 그룹 함수
- map() : 칼럼 단위로 작용
- apply() : 

#### apply()

In [325]:
def myfunc(x):
    if x == "male":
        return 1
    else:
        return 0

In [326]:
df.apply(myfunc)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [327]:
df["sex"].apply(myfunc)

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: sex, Length: 891, dtype: int64

In [329]:
df["sex_encoded"] = df["sex"].apply(myfunc)
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family,sex_encoded
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1,0
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0,1
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,23.4500,S,3,0
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0,1


#### groupby()

In [330]:
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family,sex_encoded
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1,0
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0,1
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,23.4500,S,3,0
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0,1


In [331]:
df.groupby("sex")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C5119B6190>

In [333]:
df.groupby("sex")["age"].mean()

sex
female    27.929936
male      30.140676
Name: age, dtype: float64

In [334]:
print(df.groupby(["sex", "pclass"])["age"].mean(), "\n\n")

sex     pclass
female  1st       33.978723
        2nd       28.703947
        3rd       23.572917
male    1st       38.995246
        2nd       30.512315
        3rd       26.911873
Name: age, dtype: float64

In [338]:
# 선실 등급별 남여 생존자수
df.groupby(["pclass", "sex"])["survived"].value_counts()

pclass  sex     survived
1st     female  saved        91
                lost          3
        male    lost         77
                saved        45
2nd     female  saved        70
                lost          6
        male    lost         91
                saved        17
3rd     female  lost         72
                saved        72
        male    lost        300
                saved        47
Name: survived, dtype: int64

In [None]:
# df.to_csv("mydata.csv")