# 소개

- 데이터 랭글링(wrangling) : 원본 데이터를 정제하고 사용 가능한 형태로 구성하기 위한 변환 과정
- 판다스(pandas)의 데이터프레임(DataFrame)을 사용

In [3]:
import pandas as pd

# Location of the Titanic CSV, relative to the current working directory.
data_path = './datasets/titanic.csv'
df = pd.read_csv(data_path)
# Preview the top of the table (the output below shows rows 0-4).
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 데이터프레임 만들기

In [24]:
import pandas as pd

# Start from an empty DataFrame and attach the columns one at a time.
df = pd.DataFrame()
for column_name, column_values in (
    ('Name', ['Song', 'Park', 'Lee']),
    ('Age', [22, 21, 19]),
    ('Driver', [True, False, False]),
):
    df[column_name] = column_values
df

Unnamed: 0,Name,Age,Driver
0,Song,22,True
1,Park,21,False
2,Lee,19,False


In [25]:
# Append a row to an existing DataFrame: wrap the new record in a one-row
# frame and stack it under df, renumbering the index from 0.
new_person = pd.DataFrame([{'Name': 'Molly', 'Age': 40, 'Driver': True}])
df = pd.concat(
    [df, new_person],
    axis=0,
    ignore_index=True,
)
df

Unnamed: 0,Name,Age,Driver
0,Song,22,True
1,Park,21,False
2,Lee,19,False
3,Molly,40,True


In [29]:
# Build a DataFrame from a NumPy array.

import numpy as np
data = [['Jack', 34, True], ['Steven', 22, False]]
# dtype=object keeps every cell's original Python type. Without it NumPy
# coerces the mixed str/int/bool rows to one string dtype, so Age and Driver
# would silently become text ('34', 'True').
matrix = np.array(data, dtype=object)
# infer_objects() converts the resulting object columns back to proper
# integer / boolean dtypes where every value allows it.
pd.DataFrame(matrix, columns=['Name', 'Age', 'Driver']).infer_objects()

Unnamed: 0,Name,Age,Driver
0,Jack,34,True
1,Steven,22,False


In [31]:
# Build a DataFrame from a list of rows; column labels are supplied separately.
data = [
    ['Jack', 34, True],
    ['Steven', 22, False],
]
pd.DataFrame(
    data=data,
    columns=['Name', 'Age', 'Driver'],
)

Unnamed: 0,Name,Age,Driver
0,Jack,34,True
1,Steven,22,False


In [32]:
# Build a DataFrame from a dict that maps each column label to its values.
data = dict(
    Name=['Jack', 'Steven'],
    Age=[34, 22],
    Driver=[True, False],
)

pd.DataFrame(data)

Unnamed: 0,Name,Age,Driver
0,Jack,34,True
1,Steven,22,False


In [33]:
# One dict per sample (row); the dict keys become the column labels.
data = [
    dict(zip(('Name', 'Age', 'Driver'), sample))
    for sample in (('Jack', 34, True), ('Steven', 22, False))
]
pd.DataFrame(data, index=[0, 1])

Unnamed: 0,Name,Age,Driver
0,Jack,34,True
1,Steven,22,False


# 데이터 설명하기

In [37]:
import pandas as pd

# Location of the Titanic CSV, relative to the current working directory.
data_path = './datasets/titanic.csv'
df = pd.read_csv(data_path)
# Preview the end of the table (the output below shows rows 886-890).
df.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [38]:
# Dimensions of the table as a (rows, columns) tuple.
df.shape

(891, 12)

In [39]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


# 데이터프레임 탐색하기
- loc: 레이블(인덱스 이름) 기반 선택, iloc: 정수 위치 기반 선택

In [40]:
import pandas as pd

# Reload the Titanic CSV so this section starts from the unmodified table.
data_path = './datasets/titanic.csv'
df = pd.read_csv(data_path)

In [41]:
# First row, selected by integer position; the result is a Series.
df.iloc[0]

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                               22.0
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [42]:
# Rows at positions 1 through 3 (the end position 4 is excluded).
df.iloc[1:4]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [43]:
# Rows from the start up to, but not including, position 4.
df.iloc[:4]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


In [45]:
# Use the passenger name as the row index; inplace=True modifies df itself
# rather than returning a new DataFrame.
df.set_index('Name', inplace=True)
df

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.2500,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.9250,,S
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1000,C123,S
"Allen, Mr. William Henry",5,0,3,male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
"Montvila, Rev. Juozas",887,0,2,male,27.0,0,0,211536,13.0000,,S
"Graham, Miss. Margaret Edith",888,1,1,female,19.0,0,0,112053,30.0000,B42,S
"Johnston, Miss. Catherine Helen ""Carrie""",889,0,3,female,,1,2,W./C. 6607,23.4500,,S
"Behr, Mr. Karl Howell",890,1,1,male,26.0,0,0,111369,30.0000,C148,C


In [46]:
# Select a single row by its index label (the passenger's name).
df.loc['Johnston, Miss. Catherine Helen "Carrie"']

PassengerId           889
Survived                0
Pclass                  3
Sex                female
Age                   NaN
SibSp                   1
Parch                   2
Ticket         W./C. 6607
Fare                23.45
Cabin                 NaN
Embarked                S
Name: Johnston, Miss. Catherine Helen "Carrie", dtype: object

In [47]:
# Label-based slice: unlike iloc, the end label itself is included in the result.
df.loc[:'Heikkinen, Miss. Laina']

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [48]:
# Select several columns by passing a list of labels, then keep the first three rows.
df[['Age', 'Sex']].head(3)

Unnamed: 0_level_0,Age,Sex
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Braund, Mr. Owen Harris",22.0,male
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",38.0,female
"Heikkinen, Miss. Laina",26.0,female


# 조건에 따라 행 선택하기

In [49]:
import pandas as pd

# Reload the Titanic CSV so this section starts from the unmodified table
# (the previous section replaced the default index with the Name column).
data_path = './datasets/titanic.csv'
df = pd.read_csv(data_path)

In [50]:
# Keep only the rows whose Sex column equals 'female' (boolean-mask selection).
df.loc[df['Sex'].eq('female')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S


In [60]:
# Combine two conditions element-wise with &: female passengers aged 63 or older.
df.loc[df['Sex'].eq('female') & df['Age'].ge(63)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
275,276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S
483,484,1,3,"Turkula, Mrs. (Hedwig)",female,63.0,0,0,4134,9.5875,,S


# 값 치환하기

# 열 이름 바꾸기

# 최솟값, 최댓값, 합, 평균 계산 및 개수 세기

# 고유한 값 찾기

# 누락된 값 다루기

# 열 삭제하기

# 행 삭제하기

# 중복된 행 삭제하기

# 값에 따라 행을 그룹핑하기

# 시간에 따라 행을 그룹핑하기

# 열 원소 순회하기

# 모든 열 원소에 함수 적용하기

# 그룹에 함수 적용하기

# 데이터프레임 연결하기

# 데이터프레임 병합하기