## 판다스 데이터 필터링

조건을 만족하는 데이터만 추려내는 과정

In [2]:
import pandas as pd
# Read the credit dataset from the local CSV file into a DataFrame,
# then display it as the cell output.
CREDIT_CSV = 'data/credit.csv'
df = pd.read_csv(CREDIT_CSV)
df

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331
...,...,...,...,...,...,...,...,...,...,...,...
395,12.096,4100,307,3,32,13,Male,No,Yes,Caucasian,560
396,13.364,3838,296,5,65,17,Male,No,No,African American,480
397,57.872,4171,321,5,67,12,Female,No,Yes,Caucasian,138
398,37.728,2525,192,1,44,13,Male,No,Yes,Caucasian,0


## 수치형 변수(Numerical Variable)에 대해서 조건을 만족하는 행 추려내기

In [3]:
# Which rows have an income above 100?
# The comparison yields a boolean Series (True where the condition holds).
income_over_100 = df['Income'] > 100
income_over_100

0      False
1       True
2       True
3       True
4      False
       ...  
395    False
396    False
397    False
398    False
399    False
Name: Income, Length: 400, dtype: bool

In [4]:
# Keep only the rows whose income is above 150 by indexing with a boolean mask.
income_over_150 = df['Income'] > 150
df.loc[income_over_150]

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
28,186.634,13414,949,2,41,14,Female,No,Yes,African American,1809
85,152.298,12066,828,4,41,12,Female,No,Yes,Asian,1779
184,158.889,11589,805,1,62,17,Female,No,Yes,Caucasian,1448
209,151.947,9156,642,2,91,11,Female,No,Yes,African American,732
261,180.379,9310,665,3,67,8,Female,Yes,Yes,Asian,1050
275,163.329,8732,636,3,50,14,Male,No,Yes,Caucasian,529
323,182.728,13913,982,4,98,17,Male,No,Yes,Caucasian,1999
347,160.231,10748,754,2,69,17,Male,No,No,Caucasian,1192
355,180.682,11966,832,2,58,8,Female,No,Yes,African American,1405


In [5]:
# The filtered result can be stored in a separate DataFrame.
high_income = df['Income'] > 150
df2 = df[high_income]
df2

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
28,186.634,13414,949,2,41,14,Female,No,Yes,African American,1809
85,152.298,12066,828,4,41,12,Female,No,Yes,Asian,1779
184,158.889,11589,805,1,62,17,Female,No,Yes,Caucasian,1448
209,151.947,9156,642,2,91,11,Female,No,Yes,African American,732
261,180.379,9310,665,3,67,8,Female,Yes,Yes,Asian,1050
275,163.329,8732,636,3,50,14,Male,No,Yes,Caucasian,529
323,182.728,13913,982,4,98,17,Male,No,Yes,Caucasian,1999
347,160.231,10748,754,2,69,17,Male,No,No,Caucasian,1192
355,180.682,11966,832,2,58,8,Female,No,Yes,African American,1405


## 여러 개의 조건에 따른 필터링

In [6]:
# Two or more conditions can be combined with & (and) or | (or).
# Here: income above 150 AND more than 3 cards.
high_income = df['Income'] > 150
many_cards = df['Cards'] > 3
df[high_income & many_cards]

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
85,152.298,12066,828,4,41,12,Female,No,Yes,Asian,1779
323,182.728,13913,982,4,98,17,Male,No,Yes,Caucasian,1999


In [7]:
# OR condition: income above 150 OR exactly 4 cards.
high_income = df['Income'] > 150
four_cards = df['Cards'] == 4
df[high_income | four_cards]

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580
5,80.180,8047,569,4,77,10,Male,No,No,Caucasian,1151
10,63.095,8117,589,4,30,14,Male,No,Yes,Caucasian,1407
20,17.700,2860,235,4,63,16,Female,No,No,Asian,89
28,186.634,13414,949,2,41,14,Female,No,Yes,African American,1809
...,...,...,...,...,...,...,...,...,...,...,...
365,113.772,6442,489,4,69,15,Male,Yes,Yes,Caucasian,790
370,35.610,6135,466,4,40,12,Male,No,No,Caucasian,992
371,39.116,2150,173,4,75,15,Male,No,No,Caucasian,0
386,19.253,3683,287,4,57,10,Male,No,No,African American,371


## 범주형 변수(Categorical Variable)에 대하여 특정 값이 들어있는 행만 추리기

In [10]:
# Boolean mask: True for the rows whose Ethnicity column holds 'Asian'.
is_asian = df['Ethnicity'].isin(['Asian'])
is_asian

0      False
1       True
2       True
3       True
4      False
       ...  
395    False
396    False
397    False
398    False
399     True
Name: Ethnicity, Length: 400, dtype: bool

In [11]:
# Keep only the rows whose Ethnicity is 'Asian'.
asian_rows = df['Ethnicity'].isin(['Asian'])
df[asian_rows]

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964
7,71.408,7114,512,2,87,9,Male,No,No,Asian,872
12,80.616,5308,394,1,57,7,Female,No,Yes,Asian,204
...,...,...,...,...,...,...,...,...,...,...,...
385,26.400,5640,398,3,58,15,Female,No,No,Asian,905
387,16.529,1357,126,3,62,9,Male,No,No,Asian,0
390,135.118,10578,747,3,81,15,Female,No,Yes,Asian,1393
392,25.974,2308,196,2,24,10,Male,No,No,Asian,0


In [12]:
# isin accepts several values at once: True where Ethnicity is any of them.
wanted_ethnicities = ['Asian', 'Caucasian']
ethnicity_mask = df['Ethnicity'].isin(wanted_ethnicities)
ethnicity_mask

0       True
1       True
2       True
3       True
4       True
       ...  
395     True
396    False
397     True
398     True
399     True
Name: Ethnicity, Length: 400, dtype: bool