In [1]:
import pandas as pd
import numpy as np

# DataFrame Indexing

> Indexing: 데이터에서 레이블, 위치 또는 조건을 이용해 원하는 원소를 선택하는 방법.

> 전체 DataFrame에서 조건을 만족하는 데이터를 쉽게 찾아 조작할 때 유용하게 사용할 수 있다.

In [2]:
# Build the 6x4 demo frame from standard-normal samples.
# A seeded local generator makes the frame (and every output derived from it)
# identical on each Restart & Run All. The concrete numbers differ from an
# unseeded run; shape, column names and the default 0..5 index are unchanged.
SEED = 42
rng = np.random.default_rng(SEED)
df = pd.DataFrame(data=rng.standard_normal((6, 4)), columns=["A", "B", "C", "D"])
df

Unnamed: 0,A,B,C,D
0,1.671261,1.27128,0.137696,-1.779091
1,0.367693,0.354213,-0.457541,0.645561
2,0.871325,0.764605,1.711951,0.646699
3,0.558165,-2.015445,-0.053304,1.121184
4,0.862613,0.441693,0.881315,0.009566
5,1.117428,0.427348,0.204088,-0.006585


In [3]:
# A DataFrame supports basic indexing by column name.
df["A"] # Same syntax as plain dictionary indexing.
# column name == key; the selected column comes back as a Series.

0    1.671261
1    0.367693
2    0.871325
3    0.558165
4    0.862613
5    1.117428
Name: A, dtype: float64

In [4]:
# Row-wise indexing by index label (the row's name in the index).
df.loc[1] # Returns the row as a pd.Series whose index is the column names.

A    0.367693
B    0.354213
C   -0.457541
D    0.645561
Name: 1, dtype: float64

In [5]:
# Confirm that selecting a single row with .loc yields a Series.
type(df.loc[1])

pandas.core.series.Series

In [6]:
# Indexing by integer position.
# With this frame's default 0..5 index, position 1 and label 1 are the same row,
# so the result matches df.loc[1].
df.iloc[1]

A    0.367693
B    0.354213
C   -0.457541
D    0.645561
Name: 1, dtype: float64

In [7]:
# Slicing a DataFrame with [] cuts along rows; this returns the first three rows.
df[:3]

Unnamed: 0,A,B,C,D
0,1.671261,1.27128,0.137696,-1.779091
1,0.367693,0.354213,-0.457541,0.645561
2,0.871325,0.764605,1.711951,0.646699


In [8]:
# A [] slice with integer bounds also selects rows. The end bound is excluded
# (rows 3 and 4 are returned, not 5), i.e. it behaves like a positional slice
# rather than a label slice such as df.loc[3:5], which would include row 5.
df[3:5]

Unnamed: 0,A,B,C,D
3,0.558165,-2.015445,-0.053304,1.121184
4,0.862613,0.441693,0.881315,0.009566


In [9]:
# df.loc also accepts two-dimensional (row, column) indexing.
df.loc[:, ["A", "B"]] # In 2-D indexing the columns can be passed as a list; `:` keeps every row.

Unnamed: 0,A,B
0,1.671261,1.27128
1,0.367693,0.354213
2,0.871325,0.764605
3,0.558165,-2.015445
4,0.862613,0.441693
5,1.117428,0.427348


In [10]:
# Slice a range of rows by label and keep only columns A and B.
# Unlike a plain Python slice, a .loc label slice includes its end label (row 3 is returned).
df.loc[1:3, ["A", "B"]]

Unnamed: 0,A,B
1,0.367693,0.354213
2,0.871325,0.764605
3,0.558165,-2.015445


In [11]:
# Select a single row by its index label together with a list of columns.
df.loc[1, ["A", "C"]] # Result is a Series

A    0.367693
C   -0.457541
Name: 1, dtype: float64

In [12]:
# Same idea as indexing a 2-D list: [row, column].
df.loc[1, "A"] # The single value at a given row label and a given column.

0.36769304780734846

In [13]:
df.iloc[3:5, 0:2] # Position-based indexing: rows 3-4 and columns 0-1 (end bounds are excluded).

Unnamed: 0,A,B
3,0.558165,-2.015445
4,0.862613,0.441693


In [14]:
# Index with explicit lists of positions instead of slices:
# rows at positions 1, 2, 4 and columns at positions 0, 3 (A and D).
df.iloc[[1,2,4],[0,3]]

Unnamed: 0,A,D
1,0.367693,0.645561
2,0.871325,0.646699
4,0.862613,0.009566


In [15]:
# In 2-D indexing, a bare `:` on an axis selects everything along that axis (here: all rows).
df.iloc[:, 1:3] # Same as 2-D indexing of a NumPy array; columns at positions 1 and 2 (B and C).

Unnamed: 0,B,C
0,1.27128,0.137696
1,0.354213,-0.457541
2,0.764605,1.711951
3,-2.015445,-0.053304
4,0.441693,0.881315
5,0.427348,0.204088


In [16]:
# Show the full frame again as a reference for the boolean-mask examples that follow.
df

Unnamed: 0,A,B,C,D
0,1.671261,1.27128,0.137696,-1.779091
1,0.367693,0.354213,-0.457541,0.645561
2,0.871325,0.764605,1.711951,0.646699
3,0.558165,-2.015445,-0.053304,1.121184
4,0.862613,0.441693,0.881315,0.009566
5,1.117428,0.427348,0.204088,-0.006585


In [17]:
# Boolean-mask indexing (a form of fancy/advanced indexing): comparing the frame
# with a scalar yields a same-shaped frame of True/False values.
df > 0

Unnamed: 0,A,B,C,D
0,True,True,True,False
1,True,True,False,True
2,True,True,True,True
3,True,False,False,True
4,True,True,True,True
5,True,True,True,False


In [18]:
df.A > 0 # Attribute access df.A is equivalent to df["A"]; the comparison gives a boolean Series.

0    True
1    True
2    True
3    True
4    True
5    True
Name: A, dtype: bool

In [20]:
df[df["B"] > 0] # A boolean Series inside [] keeps only the rows where the mask is True; the result is a DataFrame.

Unnamed: 0,A,B,C,D
0,1.671261,1.27128,0.137696,-1.779091
1,0.367693,0.354213,-0.457541,0.645561
2,0.871325,0.764605,1.711951,0.646699
4,0.862613,0.441693,0.881315,0.009566
5,1.117428,0.427348,0.204088,-0.006585


In [21]:
df[df>0] # Boolean DataFrame mask: entries that fail the condition (the negative values) show up as NaN.

Unnamed: 0,A,B,C,D
0,1.671261,1.27128,0.137696,
1,0.367693,0.354213,,0.645561
2,0.871325,0.764605,1.711951,0.646699
3,0.558165,,,1.121184
4,0.862613,0.441693,0.881315,0.009566
5,1.117428,0.427348,0.204088,


In [23]:
# Select column A restricted to the rows where A is positive.
# A single .loc[row_mask, column] call replaces the chained df['A'][mask] lookup:
# it returns the same Series but avoids chained indexing, which pandas discourages.
df.loc[df["A"] > 0, "A"]

0    1.671261
1    0.367693
2    0.871325
3    0.558165
4    0.862613
5    1.117428
Name: A, dtype: float64

In [24]:
# Work on a copy so that the column added to df2 afterwards does not modify df.
df2 = df.copy()

In [26]:
# Add a string column E; the list supplies one value per row (six values for six rows).
df2["E"] = ['one', 'two', 'three', 'four', 'five', 'one']

In [27]:
# Display the copy together with its new column E.
df2

Unnamed: 0,A,B,C,D,E
0,1.671261,1.27128,0.137696,-1.779091,one
1,0.367693,0.354213,-0.457541,0.645561,two
2,0.871325,0.764605,1.711951,0.646699,three
3,0.558165,-2.015445,-0.053304,1.121184,four
4,0.862613,0.441693,0.881315,0.009566,five
5,1.117428,0.427348,0.204088,-0.006585,one


In [28]:
# isin() tests each value of E for membership in the given list and returns a boolean Series.
df2['E'].isin(['two', 'four'])

0    False
1     True
2    False
3     True
4    False
5    False
Name: E, dtype: bool

In [29]:
# Use the isin() mask to keep only the rows whose E is 'two' or 'four'.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
1,0.367693,0.354213,-0.457541,0.645561,two
3,0.558165,-2.015445,-0.053304,1.121184,four
