# 판다스
---

In [1813]:
# !pip install pandas --upgrade

In [1814]:
import pandas as pd
import numpy as np
pd.__version__

'1.4.4'

![](https://www.runoob.com/wp-content/uploads/2021/04/df-dp.png)

## Series

In [1815]:
sr = pd.Series([1,2,3,4,5],name='Apple')
print(sr)

0    1
1    2
2    3
3    4
4    5
Name: Apple, dtype: int64


In [1816]:
print(sr.index)
print(sr.shape)

RangeIndex(start=0, stop=5, step=1)
(5,)


In [1817]:
sr[1:3]

1    2
2    3
Name: Apple, dtype: int64

In [1818]:
sr = pd.Series([1,2,3,4,5],name='Apple',index=['a','b','c','d','e'])
sr

a    1
b    2
c    3
d    4
e    5
Name: Apple, dtype: int64

In [1819]:
sr.index,sr.values,sr.to_numpy()

(Index(['a', 'b', 'c', 'd', 'e'], dtype='object'),
 array([1, 2, 3, 4, 5], dtype=int64),
 array([1, 2, 3, 4, 5], dtype=int64))

In [1820]:
sr.reset_index()

Unnamed: 0,index,Apple
0,a,1
1,b,2
2,c,3
3,d,4
4,e,5


In [1821]:
sr = pd.Series([1,np.nan,2,3,np.nan,4,5]) #np.nan 값이 없음을 뜻함
sr

0    1.0
1    NaN
2    2.0
3    3.0
4    NaN
5    4.0
6    5.0
dtype: float64

In [1822]:
#fancy indexing
sr[[1,2,4]]

1    NaN
2    2.0
4    NaN
dtype: float64

In [1823]:
# boolean indexing
idx = [False,True,True,False,True,False,False]
sr[idx]

1    NaN
2    2.0
4    NaN
dtype: float64

In [1824]:
# 조건
sr[sr>3]

5    4.0
6    5.0
dtype: float64

In [1825]:
# 결측치 (Missing Value)

print(sr.isna()) #결측치파트만 True로 반환
sr[sr.isna()] #isna() == isnull()
sr[sr.isnull()]

0    False
1     True
2    False
3    False
4     True
5    False
6    False
dtype: bool


1   NaN
4   NaN
dtype: float64

In [1826]:
sr.isna().sum() #내가 가진 데이터에서 결측치가 몇개인지 알 수 있다

2

In [1827]:
x = sr.copy()
x[x.isna()] = x.mean() #결측치에 x의 평균값을 넣는 문장
x

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

In [1828]:
y = sr.copy()
y = y.dropna()  #결측치를 날린 상태로 y에 복사하는 방법
y

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [1829]:
z = sr.copy()
z.dropna(inplace=True) #inplace 는 기본적으로 False로 돼 있다. True로 바꾸면 바로 바뀐다.
z

0    1.0
2    2.0
3    3.0
5    4.0
6    5.0
dtype: float64

In [1830]:
# x = sr.copy()
# x[x.isna()] = x.mean()

w = sr.copy()
w.fillna(w.mean()) #fillna 역시 위 식과 같은 결과가 나온다. 내부를 뭘로 채울지 정할 수 있음

0    1.0
1    3.0
2    2.0
3    3.0
4    3.0
5    4.0
6    5.0
dtype: float64

### 슬라이싱

In [1831]:
sr = pd.Series([10, 15, 12, 17, 13], index = ['john', 'eva','james','lian','zoe'])
sr

john     10
eva      15
james    12
lian     17
zoe      13
dtype: int64

In [1832]:
sr['eva':'lian'] #index로 slicing할 경우 마지막 범위까지 포함된다

eva      15
james    12
lian     17
dtype: int64

In [1833]:
sr[:-1]

john     10
eva      15
james    12
lian     17
dtype: int64

In [1834]:
sr[:]

john     10
eva      15
james    12
lian     17
zoe      13
dtype: int64

### 정렬

In [1835]:
sr.sort_values() #오름차순
sr.sort_values(ascending = False) #내림차순 (ascending은 기본적으로 True로 돼있다)

Flushing oldest 200 entries.
  warn('Output cache limit (currently {sz} entries) hit.\n'


lian     17
eva      15
zoe      13
james    12
john     10
dtype: int64

In [1836]:
## op3 멤버를 뽑는다면
sr.sort_values(ascending = False)[:3]

lian    17
eva     15
zoe     13
dtype: int64

In [1837]:
# index 기준 정렬
sr.sort_index()

eva      15
james    12
john     10
lian     17
zoe      13
dtype: int64

## DataFrame

In [1838]:
doc = [['Joe', 20, 85.10, 'A', 'Swimming'],
        ['Nat', 21, 77.80, 'B', 'Reading'],
        ['Harry', 19, 91.54, 'A', 'Music'],
        ['Sam', 20, 88.78, 'A', 'Painting'],
        ['Monica', 22, 60.55, 'B', 'Dancing']]

c_name = ['Name', 'Age', 'Marks', 'Grade', 'Hobby']
idx = ['s1', 's2', 's3', 's4', 's5']

In [1839]:
df = pd.DataFrame(doc, columns = c_name, index = idx)
print(df)
df.shape

      Name  Age  Marks Grade     Hobby
s1     Joe   20  85.10     A  Swimming
s2     Nat   21  77.80     B   Reading
s3   Harry   19  91.54     A     Music
s4     Sam   20  88.78     A  Painting
s5  Monica   22  60.55     B   Dancing


(5, 5)

In [1840]:
df.head() #데이터중에서 제일 위에 있는 5개의 데이터만 표시하는 함수

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swimming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


In [1841]:
doc = {'Name' :['Joe','Nat','Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, 91.54, 88.78, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}

In [1842]:
x = df.dtypes
type(x)

pandas.core.series.Series

In [1843]:
print(df.columns)
df.columns[[0,2,3]]

Index(['Name', 'Age', 'Marks', 'Grade', 'Hobby'], dtype='object')


Index(['Name', 'Marks', 'Grade'], dtype='object')

In [1844]:
#원하는 컬럼만 가져오기
df[df.columns[[0,2,3]]]

Unnamed: 0,Name,Marks,Grade
s1,Joe,85.1,A
s2,Nat,77.8,B
s3,Harry,91.54,A
s4,Sam,88.78,A
s5,Monica,60.55,B


In [1845]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, s1 to s5
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   5 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 240.0+ bytes


In [1846]:
doc = {'Name' :['Joe',np.nan,'Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, np.nan,np.nan, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}
df = pd.DataFrame(doc)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [1847]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     5 non-null      int64  
 2   Marks   3 non-null      float64
 3   Grade   5 non-null      object 
 4   Hobby   5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


### 컬럼 다루기

In [1848]:
doc = {'Name' :['Joe',np.nan,'Harry','Sam','Monica',],
        'Age':[20, 21, 19, 20, 22],
        'Marks':[85.10, 77.80, np.nan,np.nan, 60.55],
        'Grade':['A', 'B', 'A', 'A', 'B',],
        'Hobby':['Swmming', 'Reading', 'Music', 'Painting', 'Dancing']}
df = pd.DataFrame(doc)
df.head()

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


In [1849]:
df[['Name','Age']] #df['Name','Age'] 이렇게 하면 에러가 발생한다. 꼭 범위를 리스트로 만들도록 하자.

Unnamed: 0,Name,Age
0,Joe,20
1,,21
2,Harry,19
3,Sam,20
4,Monica,22


In [1850]:
df[['Age']] #dataframe 형태로 출력이 된다

Unnamed: 0,Age
0,20
1,21
2,19
3,20
4,22


In [1851]:
df['Age'] #Series 형태로 출력이 된다

0    20
1    21
2    19
3    20
4    22
Name: Age, dtype: int64

In [1852]:
df.columns = ['Name','Age','Score','Grade','Hobby'] #Marks를 Score를 바꾸기중 한가지 방법

#### Rename

In [1853]:
df.rename(columns={'Score':'Marks','Hobby':'etc'}) #하지만 이 함수가 훨씬 효율적

Unnamed: 0,Name,Age,Marks,Grade,etc
0,Joe,20,85.1,A,Swmming
1,,21,77.8,B,Reading
2,Harry,19,,A,Music
3,Sam,20,,A,Painting
4,Monica,22,60.55,B,Dancing


### 파일 입출력

In [1854]:
# !pip install gdown

#### 평범한 csv파일

In [1855]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc.csv

In [1856]:
df = pd.read_csv('doc.csv')
df.shape

(5, 5)

In [1857]:
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
0,Joe,20,85.1,A,Swmming
1,Nat,21,77.8,B,Reading
2,Harry,19,91.54,A,Music
3,Sam,20,88.78,A,Painting
4,Monica,22,60.55,B,Dancing


#### index가 포함돼있는 csv파일

In [1858]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_idx.csv

In [1859]:
df = pd.read_csv('doc_idx.csv', index_col = 0) #첫번째를 index로 만드려면

In [1860]:
df

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20,85.1,A,Swmming
s2,Nat,21,77.8,B,Reading
s3,Harry,19,91.54,A,Music
s4,Sam,20,88.78,A,Painting
s5,Monica,22,60.55,B,Dancing


#### index와 결측치가 포함돼있는 파일

In [1861]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/doc_na.csv

In [1862]:
df = pd.read_csv('doc_na.csv', index_col = 0, na_values = ['?'])
#df = pd.read_csv('doc_na.csv', index_col = 0)
#df = df.replace('?',np.NaN)
df.head(3)

Unnamed: 0,Name,Age,Marks,Grade,Hobby
s1,Joe,20.0,,A,Swmming
s2,Nat,21.0,77.8,B,Reading
s3,Harry,,91.54,A,Music


### Titanic.csv를 활용한 예제

In [1863]:
# !gdown https://raw.githubusercontent.com/devdio/datasets/main/titanic.csv

In [1864]:
titanic = pd.read_csv('titanic.csv')
titanic.shape

(891, 12)

In [1865]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### columns의 내용을 모두 소문자로 바꾸는 방법

In [1866]:
df = titanic.copy() # 작업을 하기위해 df에 원본을 복사
print(df.columns)
#df.columns = [c.lower() for c in df.columns]
df.columns = df.columns.str.lower()
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Index(['passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp',
       'parch', 'ticket', 'fare', 'cabin', 'embarked'],
      dtype='object')

#### 숫자로만 돼있는 column에 대한 통계값 출력

In [1867]:
df.describe()

Unnamed: 0,passengerid,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,446.0,29.699118,0.523008,0.381594,32.204208
std,257.353842,14.526497,1.102743,0.806057,49.693429
min,1.0,0.42,0.0,0.0,0.0
25%,223.5,20.125,0.0,0.0,7.9104
50%,446.0,28.0,0.0,0.0,14.4542
75%,668.5,38.0,1.0,0.0,31.0
max,891.0,80.0,8.0,6.0,512.3292


#### 행과 열의 위치를 뒤바꾸는 방법

In [1868]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
passengerid,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
sibsp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [1869]:
df.head(10)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,lost,3rd,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,lost,1st,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,lost,3rd,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,saved,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,saved,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### 특정 column의 모든 원소 종류들 확인법

In [1870]:
df['embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

#### 특정 column의 통계

In [1871]:
df['embarked'].value_counts()

S    644
C    168
Q     77
Name: embarked, dtype: int64

In [1872]:
df['sex'].value_counts()

male      577
female    314
Name: sex, dtype: int64

In [1873]:
df['pclass'].value_counts()

3rd    491
1st    216
2nd    184
Name: pclass, dtype: int64

In [1874]:
df['survived'].value_counts()

lost     549
saved    342
Name: survived, dtype: int64

#### 결측치 개수 계산

In [1875]:
df.isnull().sum(axis=0)

passengerid      0
survived         0
pclass           0
name             0
sex              0
age            177
sibsp            0
parch            0
ticket           0
fare             0
cabin          687
embarked         2
dtype: int64

#### 인덱싱, 슬라이싱
+ loc
+ iloc

In [1876]:
# df.loc[행조건, 열조건]

df.loc[5,'pclass']

'3rd'

In [1877]:
df.loc[5:10,['pclass','name','survived']] #column이름을 직접적으로 입력하는 방식

Unnamed: 0,pclass,name,survived
5,3rd,"Moran, Mr. James",lost
6,1st,"McCarthy, Mr. Timothy J",lost
7,3rd,"Palsson, Master. Gosta Leonard",lost
8,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",saved
9,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",saved
10,3rd,"Sandstrom, Miss. Marguerite Rut",saved


In [1878]:
df.iloc[5:10,[2,3,1]] #index를 입력하는 방식

Unnamed: 0,pclass,name,survived
5,3rd,"Moran, Mr. James",lost
6,1st,"McCarthy, Mr. Timothy J",lost
7,3rd,"Palsson, Master. Gosta Leonard",lost
8,3rd,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",saved
9,2nd,"Nasser, Mrs. Nicholas (Adele Achem)",saved


#### 조건

In [1879]:
df['age'].min(), df['age'].max()

(0.42, 80.0)

In [1880]:
#30세이하 남성의 데이터 수
df.loc[(df['age']<30) & (df['sex']=='male'),['name']].count()  #  &, | 이런 기호를 사용

name    237
dtype: int64

#### 결측치 처리

In [1881]:
# age가 결측치인 데이터만 출력
#df[df['age'].isna()]
df.loc[df['age'].isna()]

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
5,6,lost,3rd,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,saved,2nd,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,saved,3rd,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,lost,3rd,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,saved,3rd,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,lost,3rd,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,lost,3rd,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,lost,3rd,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,lost,3rd,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [1882]:
#age가 결측치인 사람들의 embarked 상태
df[df['age'].isna()]['embarked'].value_counts()

S    90
Q    49
C    38
Name: embarked, dtype: int64

In [1883]:
# 1. 어느 항구에 탑승자가 제일 많은지 조사
# 2. embarked 결측치 채워넣기
df = titanic.copy()
df.columns = df.columns.str.lower()
k = df['embarked'].value_counts() #embarked column의 각 개수 출력
print(k)
print(df['embarked'].isna().sum()) #embarked column의 NaN값 출력
df['embarked'] = df['embarked'].fillna('S')
df['embarked'].isna().sum()

S    644
C    168
Q     77
Name: embarked, dtype: int64
2


0

#### 자료 조사에 필요 없는 column 날리기

In [1884]:
# 생존률과 연관성 없는 데이터인 cabin, ticket, passengerid 날리기
df = df.drop(['cabin','ticket','passengerid'], axis = 1)
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


In [1885]:
df['family'] = df['sibsp'] + df['parch'] 
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0


In [1886]:
x = df.copy()
x.head()
print(x.isna().sum(axis=0))
x.isna()['embarked'].value_counts()

survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      0
family        0
dtype: int64


False    891
Name: embarked, dtype: int64

In [1887]:
#1.age가 결측치인걸 모두 날리기 (거의 다 age이므로 모든 결측치를 날린 모습)
x = x.dropna()
x.shape

(714, 10)

In [1888]:
#결측치를 날린 후 나이의 평균 혹은 중간값 구하기
np.median(x['age']),np.mean(x['age'])

(28.0, 29.69911764705882)

In [1889]:
df['age'] = df['age'].fillna(np.median(x['age']))

In [1890]:
# 2. age 평균을 결측치에 넣기
df.isna().sum(axis=0)

survived    0
pclass      0
name        0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
family      0
dtype: int64

#### groupby() 데이터 묶기

In [1891]:
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,0


In [1892]:
#성별과 pclass로 묶고 평균 연령대 출력
df.groupby(['sex','pclass'])['age'].mean()

sex     pclass
female  1st       33.978723
        2nd       28.703947
        3rd       23.572917
male    1st       38.995246
        2nd       30.512315
        3rd       26.911873
Name: age, dtype: float64

In [1893]:
# 선실 등급별 남여 생존자수
df.groupby(['pclass','sex'])['survived'].count()

pclass  sex   
1st     female     94
        male      122
2nd     female     76
        male      108
3rd     female    144
        male      347
Name: survived, dtype: int64

In [1894]:
# 항구별 남녀 생존자수
df.groupby(['embarked','sex'])['survived'].count()

embarked  sex   
C         female     73
          male       95
Q         female     36
          male       41
S         female    205
          male      441
Name: survived, dtype: int64

#### 그룹 함수 (문자열을 숫자로 전환)
+ map() : 컬럼 단위로
+ apply()

In [1895]:
#male = 1, female = 0 이렇게 데이터를 숫자로 변화하려면
def swap_sex(x):
    if x == 'male':
        return 1
    else:
        return 0

In [1896]:
df['sex'] = df['sex'].apply(swap_sex)
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,lost,3rd,"Braund, Mr. Owen Harris",1,22.0,1,0,7.2500,S,1
1,saved,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,C,1
2,saved,3rd,"Heikkinen, Miss. Laina",0,26.0,0,0,7.9250,S,0
3,saved,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1000,S,1
4,lost,3rd,"Allen, Mr. William Henry",1,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,lost,2nd,"Montvila, Rev. Juozas",1,27.0,0,0,13.0000,S,0
887,saved,1st,"Graham, Miss. Margaret Edith",0,19.0,0,0,30.0000,S,0
888,lost,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",0,28.0,1,2,23.4500,S,3
889,saved,1st,"Behr, Mr. Karl Howell",1,26.0,0,0,30.0000,C,0


In [1897]:
def swap_survived(x):
    if x =='saved':
        return 1
    else:
        return 0

In [1898]:
df['survived'] = df['survived'].apply(swap_survived)
df

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,family
0,0,3rd,"Braund, Mr. Owen Harris",1,22.0,1,0,7.2500,S,1
1,1,1st,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,71.2833,C,1
2,1,3rd,"Heikkinen, Miss. Laina",0,26.0,0,0,7.9250,S,0
3,1,1st,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,53.1000,S,1
4,0,3rd,"Allen, Mr. William Henry",1,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,0,2nd,"Montvila, Rev. Juozas",1,27.0,0,0,13.0000,S,0
887,1,1st,"Graham, Miss. Margaret Edith",0,19.0,0,0,30.0000,S,0
888,0,3rd,"Johnston, Miss. Catherine Helen ""Carrie""",0,28.0,1,2,23.4500,S,3
889,1,1st,"Behr, Mr. Karl Howell",1,26.0,0,0,30.0000,C,0


In [1899]:
df = df.drop(['name','sibsp','parch'], axis = 1)
df

Unnamed: 0,survived,pclass,sex,age,fare,embarked,family
0,0,3rd,1,22.0,7.2500,S,1
1,1,1st,0,38.0,71.2833,C,1
2,1,3rd,0,26.0,7.9250,S,0
3,1,1st,0,35.0,53.1000,S,1
4,0,3rd,1,35.0,8.0500,S,0
...,...,...,...,...,...,...,...
886,0,2nd,1,27.0,13.0000,S,0
887,1,1st,0,19.0,30.0000,S,0
888,0,3rd,0,28.0,23.4500,S,3
889,1,1st,1,26.0,30.0000,C,0


In [1900]:
def swap_embarked(x):
    if x=='C':
        return 0
    elif x=='Q':
        return 1
    elif x=='S':
        return 2

In [1901]:
df['embarked'] = df['embarked'].apply(swap_embarked)
df

Unnamed: 0,survived,pclass,sex,age,fare,embarked,family
0,0,3rd,1,22.0,7.2500,2,1
1,1,1st,0,38.0,71.2833,0,1
2,1,3rd,0,26.0,7.9250,2,0
3,1,1st,0,35.0,53.1000,2,1
4,0,3rd,1,35.0,8.0500,2,0
...,...,...,...,...,...,...,...
886,0,2nd,1,27.0,13.0000,2,0
887,1,1st,0,19.0,30.0000,2,0
888,0,3rd,0,28.0,23.4500,2,3
889,1,1st,1,26.0,30.0000,0,0


In [1902]:
def swap_pclass(x):
    if x=='1st':
        return 1
    elif x=='2nd':
        return 2
    elif x=='3rd':
        return 3

In [1903]:
df['pclass'] = df['pclass'].apply(swap_pclass)
df

Unnamed: 0,survived,pclass,sex,age,fare,embarked,family
0,0,3,1,22.0,7.2500,2,1
1,1,1,0,38.0,71.2833,0,1
2,1,3,0,26.0,7.9250,2,0
3,1,1,0,35.0,53.1000,2,1
4,0,3,1,35.0,8.0500,2,0
...,...,...,...,...,...,...,...
886,0,2,1,27.0,13.0000,2,0
887,1,1,0,19.0,30.0000,2,0
888,0,3,0,28.0,23.4500,2,3
889,1,1,1,26.0,30.0000,0,0


In [1904]:
#csv파일로 저장
df.to_csv('mydata.csv')

In [1922]:
#npz
array = df.values
np.savez('mydata.npz',xvar = array)
array

array([[ 0.    ,  3.    ,  1.    , ...,  7.25  ,  2.    ,  1.    ],
       [ 1.    ,  1.    ,  0.    , ..., 71.2833,  0.    ,  1.    ],
       [ 1.    ,  3.    ,  0.    , ...,  7.925 ,  2.    ,  0.    ],
       ...,
       [ 0.    ,  3.    ,  0.    , ..., 23.45  ,  2.    ,  3.    ],
       [ 1.    ,  1.    ,  1.    , ..., 30.    ,  0.    ,  0.    ],
       [ 0.    ,  3.    ,  1.    , ...,  7.75  ,  1.    ,  0.    ]])

In [1923]:
mydata = np.load('mydata.npz')
array = mydata['xvar']
array

array([[ 0.    ,  3.    ,  1.    , ...,  7.25  ,  2.    ,  1.    ],
       [ 1.    ,  1.    ,  0.    , ..., 71.2833,  0.    ,  1.    ],
       [ 1.    ,  3.    ,  0.    , ...,  7.925 ,  2.    ,  0.    ],
       ...,
       [ 0.    ,  3.    ,  0.    , ..., 23.45  ,  2.    ,  3.    ],
       [ 1.    ,  1.    ,  1.    , ..., 30.    ,  0.    ,  0.    ],
       [ 0.    ,  3.    ,  1.    , ...,  7.75  ,  1.    ,  0.    ]])