### 2. DataFrame Indexing

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Source records: one list per field, five entries each.
data = {
    'name': ['mark', 'mark', 'mark', 'lee', 'lee'],
    'year': [2014, 2015, 2016, 2015, 2016],
    'points': [1.5, 1.7, 3.6, 2.4, 2.9],
}

# `columns` picks and orders the columns; a label that is not a key of `data`
# produces a column filled with NaN.
# NOTE(review): the dict key is 'name' but the requested column is 'names',
# so 'names' comes out all-NaN just like 'penalty'. If the names were meant to
# be shown, the two spellings need to match -- confirm intent.
df = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)
df

Unnamed: 0,year,names,points,penalty
one,2014,,1.5,
two,2015,,1.7,
three,2016,,3.6,
four,2015,,2.4,
five,2016,,2.9,


#### 2-1. DataFrame에서 열 선택하고 조작하기

In [4]:
# Select a single column by its label; the result is a Series.
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [5]:
# Another way to express the same selection as df['year']: attribute access
df.year

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [6]:
# Pass a list of labels to select several columns; the result is a DataFrame.
df[['year','points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [15]:
# Select a column as shown above and assign a value to it
# (a single scalar is applied to every row)
df['penalty'] = 0.5
df

# When assigning, supply either one single value or exactly as many values as there are rows

Unnamed: 0,year,names,points,penalty
one,2014,,1.5,0.5
two,2015,,1.7,0.5
three,2016,,3.6,0.5
four,2015,,2.4,0.5
five,2016,,2.9,0.5


In [18]:
# Add a new column by assigning to a label that does not exist yet
# NOTE(review): despite the name 'zeros', np.arange(5) fills the column with 0..4.
df['zeros'] = np.arange(5)
df

Unnamed: 0,year,names,points,penalty,zeros
one,2014,,1.5,0.5,0
two,2015,,1.7,0.5,1
three,2016,,3.6,0.5,2
four,2015,,2.4,0.5,3
five,2016,,2.9,0.5,4


In [20]:
# Add a column from a Series: values are matched to rows by index label,
# so rows that have no entry in the Series ('one', 'three') are left as NaN.
val = pd.Series([-1.2,-1.5,-1.7], index=['two','four','five'])
df['debt']=val
df

Unnamed: 0,year,names,points,penalty,zeros,debt
one,2014,,1.5,0.5,0,
two,2015,,1.7,0.5,1,-1.2
three,2016,,3.6,0.5,2,
four,2015,,2.4,0.5,3,-1.5
five,2016,,2.9,0.5,4,-1.7


하지만 Series를 넣을 때는 위의 val처럼 Series의 index와 DataFrame의 index가 일치하는 행에만 값이 들어가고, 일치하는 index가 없는 행은 NaN으로 채워진다. 이 점이 Python의 list나 NumPy array로 데이터를 넣을 때와 가장 큰 차이점이다.

In [21]:
# Derive a new column from element-wise arithmetic on two existing columns.
df['net_points'] = df['points'] - df['penalty']

In [22]:
# An element-wise comparison produces a boolean column.
df['high_points'] = df['net_points'] > 2.0

In [23]:
# Display the frame, now including the two derived columns.
df

Unnamed: 0,year,names,points,penalty,zeros,debt,net_points,high_points
one,2014,,1.5,0.5,0,,1.0,False
two,2015,,1.7,0.5,1,-1.2,1.2,False
three,2016,,3.6,0.5,2,,3.1,True
four,2015,,2.4,0.5,3,-1.5,1.9,False
five,2016,,2.9,0.5,4,-1.7,2.4,True


In [24]:
# Delete columns: `del` removes each column from df itself
del df['high_points']
del df['net_points']
del df['zeros']

In [25]:
# Display the frame after the three columns were deleted.
df

Unnamed: 0,year,names,points,penalty,debt
one,2014,,1.5,0.5,
two,2015,,1.7,0.5,-1.2
three,2016,,3.6,0.5,
four,2015,,2.4,0.5,-1.5
five,2016,,2.9,0.5,-1.7


In [26]:
# The column labels of the frame, as an Index object.
df.columns

Index(['year', 'names', 'points', 'penalty', 'debt'], dtype='object')

In [28]:
# The row labels of the frame, as an Index object.
df.index

Index(['one', 'two', 'three', 'four', 'five'], dtype='object')

In [29]:
# Give the row index and the column index a name; both names appear in the
# header when the frame is displayed.
df.index.name = 'Order'
df.columns.name = 'Info'

In [30]:
# Display the frame with the named axes ('Order' for rows, 'Info' for columns).
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014,,1.5,0.5,
two,2015,,1.7,0.5,-1.2
three,2016,,3.6,0.5,
four,2015,,2.4,0.5,-1.5
five,2016,,2.9,0.5,-1.7


#### 2-2. DataFrame에서 행 선택하고 조작하기

pandas에는 DataFrame의 행을 인덱싱하는 방법이 매우 다양하다.

In [32]:
# Take the rows from position 0 through position 2 (3 - 1)
# The row at the stop position is excluded
df[0:3]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014,,1.5,0.5,
two,2015,,1.7,0.5,-1.2
three,2016,,3.6,0.5,


In [33]:
# Take the rows from the label 'two' through the label 'four'
# The row named by the stop label is NOT excluded
df['two':'four'] # works, but using it this way is not recommended!

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015,,1.7,0.5,-1.2
three,2016,,3.6,0.5,
four,2015,,2.4,0.5,-1.5


In [34]:
# This is the recommended approach:
# use the .loc or .iloc indexers
df.loc['two'] # returns the row as a Series

Info
year       2015
names       NaN
points      1.7
penalty     0.5
debt       -1.2
Name: two, dtype: object

In [35]:
# Label-based row slice; both end labels are included.
df.loc['two':'four']

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015,,1.7,0.5,-1.2
three,2016,,3.6,0.5,
four,2015,,2.4,0.5,-1.5


In [36]:
# Label-based row slice restricted to one column; the result is a Series.
df.loc['two':'four', 'points']

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [37]:
# All rows (':'), a single column label.
df.loc[:,'year'] # equivalent to df['year']

Order
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [40]:
# All rows, a list of column labels; the result is a DataFrame.
df.loc[:, ['year','names']]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,2014,
two,2015,
three,2016,
four,2015,
five,2016,


In [43]:
# Slice rows and columns by label at the same time; both slices include their end label.
df.loc['three':'five', 'year':'penalty']

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
three,2016,,3.6,0.5
four,2015,,2.4,0.5
five,2016,,2.9,0.5


In [44]:
# Insert a new row by assigning to a row label that does not exist yet
# NOTE(review): in the recorded output the 'year' column is shown as float
# (2014.0, ...) after this assignment, i.e. its integer dtype was not kept.
df.loc['six',:] = [2013, 'Jun', 4.0, 0.1,2.1]

In [45]:
# Display the frame including the newly inserted row 'six'.
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,,1.5,0.5,
two,2015.0,,1.7,0.5,-1.2
three,2016.0,,3.6,0.5,
four,2015.0,,2.4,0.5,-1.5
five,2016.0,,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.1,2.1


In [46]:
# Using .iloc: rows are selected by integer position instead of by label
df.iloc[3]

Info
year       2015.0
names         NaN
points        2.4
penalty       0.5
debt         -1.5
Name: four, dtype: object

In [47]:
# Positional slices exclude the stop position: rows 3-4 and columns 0-1.
df.iloc[3:5,0:2]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2015.0,
five,2016.0,


In [48]:
# Non-contiguous selection: pick specific row and column positions with lists
df.iloc[[0,1,3],[1,2]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,,1.5
two,,1.7
four,,2.4


In [49]:
# All rows; columns at positions 1 through 3 (the stop position 4 is excluded)
df.iloc[:, 1:4]

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,,1.5,0.5
two,,1.7,0.5
three,,3.6,0.5
four,,2.4,0.5
five,,2.9,0.5
six,Jun,4.0,0.1


In [50]:
# Extract a single value: row position 1, column position 1
# (this is the 'names' cell of row 'two', which holds NaN)
df.iloc[1,1]

nan