In [1]:
import pandas as pd

In [2]:
# Sample records: one group label, one score and one age per row.
data = dict(
    group=list("ABAABCCB"),
    score=[10, 9, 10, 20, 7, 8, 10, 6],
    age=[27, 28, 30, 36, 24, 27, 33, 25],
)
# Use string row labels x0..x7 instead of the default integer index.
df = pd.DataFrame(data, index=["x" + str(i) for i in range(8)])

In [3]:
# Display the whole frame to check the columns and the string row labels.
df

Unnamed: 0,group,score,age
x0,A,10,27
x1,B,9,28
x2,A,10,30
x3,A,20,36
x4,B,7,24
x5,C,8,27
x6,C,10,33
x7,B,6,25


In [7]:
# Element-wise comparison: yields one boolean per row (True where score >= 10).
print(df['score'].ge(10))

x0     True
x1    False
x2     True
x3     True
x4    False
x5    False
x6     True
x7    False
Name: score, dtype: bool


In [5]:
# Indexing with a boolean Series keeps only the rows where it is True.
df[df['score'].ge(10)]

Unnamed: 0,group,score,age
x0,A,10,27
x2,A,10,30
x3,A,20,36
x6,C,10,33


- loc을 활용하여 행, 열 조건을 줄 수 있다.

In [8]:
# Same row filter through .loc; the explicit ':' keeps every column.
df.loc[df['score'].ge(10), :]

Unnamed: 0,group,score,age
x0,A,10,27
x2,A,10,30
x3,A,20,36
x6,C,10,33


In [10]:
# .loc takes a row condition and a column label together:
# the ages of the rows whose score is at least 10.
print(df.loc[df['score'].ge(10), 'age'])

x0    27
x2    30
x3    36
x6    33
Name: age, dtype: int64


In [11]:
# Keep only the rows that belong to group 'A'.
df[df['group'].eq('A')]

Unnamed: 0,group,score,age
x0,A,10,27
x2,A,10,30
x3,A,20,36


- 2개 이상의 조건 (비교 연산자 >=, == 등을 쓴 각 조건은 괄호로 묶는다. &, |의 연산자 우선순위가 비교 연산자보다 높기 때문이다.)
    - and 조건: & (shift + 7) — A & B: A, B 둘 다 만족
    - or 조건: | (shift + 원화키(\)) — A | B: A, B 중 하나 이상 만족

In [13]:
# Combine two row conditions with & (element-wise AND):
# not in group 'A' and older than 30.
print(df['group'].ne('A') & df['age'].gt(30))

x0    False
x1    False
x2    False
x3    False
x4    False
x5    False
x6     True
x7    False
dtype: bool


In [14]:
# Select the rows that satisfy both conditions.
df.loc[df['group'].ne('A') & df['age'].gt(30)]

Unnamed: 0,group,score,age
x6,C,10,33


> score가 10점 이상이거나 age가 25살 초과인 행 선택

In [15]:
# | is the element-wise OR: rows with score >= 10 or age > 25.
df.loc[df['score'].ge(10) | df['age'].gt(25)]

Unnamed: 0,group,score,age
x0,A,10,27
x1,B,9,28
x2,A,10,30
x3,A,20,36
x5,C,8,27
x6,C,10,33


> (score가 10점 이상이거나 age가 25살 초과)인 데이터 중 A group에 속하는 행 선택

In [16]:
# The OR part is grouped first, then restricted to group 'A'.
df.loc[(df['score'].ge(10) | df['age'].gt(25)) & df['group'].eq('A')]

Unnamed: 0,group,score,age
x0,A,10,27
x2,A,10,30
x3,A,20,36


# 행, 열 삭제
- drop(columns=, index=, axis=)
    - columns: 삭제할 열 정보
    - index: 삭제할 행 정보
    - axis: 행, 열 기준 정보
        - 0: 행 기준 삭제
        - 1: 열 기준 삭제

In [19]:
# Returns a new frame without the 'age' column; df itself is not modified.
df.drop('age', axis='columns')

Unnamed: 0,group,score
x0,A,10
x1,B,9
x2,A,10
x3,A,20
x4,B,7
x5,C,8
x6,C,10
x7,B,6


In [20]:
# Several columns can be dropped at once by passing a list of labels.
df.drop(['group', 'age'], axis='columns')

Unnamed: 0,score
x0,10
x1,9
x2,10
x3,20
x4,7
x5,8
x6,10
x7,6


In [21]:
# When the `columns=` keyword is not used, pass `axis=` to say the labels are columns.
# Calling df.drop(['age', 'group']) without it raises an error, because the
# default axis=0 looks the labels up among the rows.
df.drop(labels=['age', 'group'], axis='columns')

Unnamed: 0,score
x0,10
x1,9
x2,10
x3,20
x4,7
x5,8
x6,10
x7,6


In [22]:
# The column labels are an Index object, which can be sliced like a list.
df.columns

Index(['group', 'score', 'age'], dtype='object')

In [23]:
# Drop columns by position: slice the column Index and pass the labels to drop.
df.drop(columns=df.columns[:2])

Unnamed: 0,age
x0,27
x1,28
x2,30
x3,36
x4,24
x5,27
x6,33
x7,25


In [24]:
# None of the drop calls above were assigned back, so df still has all three columns.
df

Unnamed: 0,group,score,age
x0,A,10,27
x1,B,9,28
x2,A,10,30
x3,A,20,36
x4,B,7,24
x5,C,8,27
x6,C,10,33
x7,B,6,25


In [26]:
# drop returns a new frame, so the result has to be stored under a name to keep it.
df_temp = df.drop(columns=['age', 'group'])
print(df_temp)

    score
x0     10
x1      9
x2     10
x3     20
x4      7
x5      8
x6     10
x7      6


- 행 삭제

In [28]:
# Equivalent spellings for dropping rows by label:
#   df.drop(['x1', 'x3'])          (axis=0 is the default)
#   df.drop(['x1', 'x3'], axis=0)
df.drop(index=['x1', 'x3'])

Unnamed: 0,group,score,age
x0,A,10,27
x2,A,10,30
x4,B,7,24
x5,C,8,27
x6,C,10,33
x7,B,6,25


In [30]:
# Only the last expression of a cell is displayed, so this first lookup
# (the first row label) is evaluated but its value is not shown.
df.index[0]
# Slicing the index gives the first four row labels.
df.index[:4]

Index(['x0', 'x1', 'x2', 'x3'], dtype='object')

In [31]:
# Drop rows by position: look the labels up in df.index, then drop those labels.
print(df.drop(index=df.index[0]))    # without the first row
print(df.drop(index=df.index[:4]))   # without the first four rows

   group  score  age
x1     B      9   28
x2     A     10   30
x3     A     20   36
x4     B      7   24
x5     C      8   27
x6     C     10   33
x7     B      6   25
   group  score  age
x4     B      7   24
x5     C      8   27
x6     C     10   33
x7     B      6   25


# 행, 열 추가
- 개수에 맞춰서 행 열 추가

In [33]:
# Add a column from a plain list; its length must equal the number of rows (8 here).
df['new1'] = [0, 1] * 4
df

Unnamed: 0,group,score,age,new1
x0,A,10,27,0
x1,B,9,28,1
x2,A,10,30,0
x3,A,20,36,1
x4,B,7,24,0
x5,C,8,27,1
x6,C,10,33,0
x7,B,6,25,1


In [35]:
# Assigning to a row label that does not exist yet appends it as a new row;
# the values are given in column order (group, score, age, new1).
# NOTE(review): the label jumps from 'x7' to 'x9', skipping 'x8' - confirm this is intended.
df.loc['x9'] = ['C', 14, 23, 1]
df

Unnamed: 0,group,score,age,new1
x0,A,10,27,0
x1,B,9,28,1
x2,A,10,30,0
x3,A,20,36,1
x4,B,7,24,0
x5,C,8,27,1
x6,C,10,33,0
x7,B,6,25,1
x9,C,14,23,1


- 조건에 따른 결과값 추가
    > score가 10 이상이면 'good', 미만이면 'bed' 추가

In [39]:
# Boolean mask used by the following cells: True where score is at least 10.
print(df['score'].ge(10))

x0     True
x1    False
x2     True
x3     True
x4    False
x5    False
x6     True
x7    False
x9     True
Name: score, dtype: bool


In [40]:
import numpy as np

In [43]:
# np.where(condition, a, b) picks a where the condition is True and b elsewhere.
print(np.where(df['score'].ge(10), 'good', 'bed'))

['good' 'bed' 'good' 'good' 'bed' 'bed' 'good' 'bed' 'good']


In [45]:
# Store the per-row label in 'new1'; this replaces the values already in that column.
df['new1'] = np.where(df['score'].ge(10), 'good', 'bed')
df

Unnamed: 0,group,score,age,new1
x0,A,10,27,good
x1,B,9,28,bed
x2,A,10,30,good
x3,A,20,36,good
x4,B,7,24,bed
x5,C,8,27,bed
x6,C,10,33,good
x7,B,6,25,bed
x9,C,14,23,good


- score가 10 미만이면 'low', 15 이상이면 'high', 나머지는 'medium'

In [46]:
# Bucket each score: below 10 -> 'low', 15 or above -> 'high', anything else -> 'medium'.
# The upper bound is inclusive (>= 15): with a strict '>' a score of exactly 15
# would fall into 'medium', contradicting the "15 or above is high" rule.
df['group_new'] = df['score'].map(
    lambda x: 'low' if x < 10 else 'high' if x >= 15 else 'medium'
)
df

Unnamed: 0,group,score,age,new1,group_new
x0,A,10,27,good,medium
x1,B,9,28,bed,low
x2,A,10,30,good,medium
x3,A,20,36,good,high
x4,B,7,24,bed,low
x5,C,8,27,bed,low
x6,C,10,33,good,medium
x7,B,6,25,bed,low
x9,C,14,23,good,medium


In [48]:
# Adding two string columns concatenates them row by row.
print(df['new1'].add(df['group_new']))

x0    goodmedium
x1        bedlow
x2    goodmedium
x3      goodhigh
x4        bedlow
x5        bedlow
x6    goodmedium
x7        bedlow
x9    goodmedium
dtype: object


In [49]:
# A literal separator can be appended to every row before the second column is added.
print(df['new1'].add("_").add(df['group_new']))

x0    good_medium
x1        bed_low
x2    good_medium
x3      good_high
x4        bed_low
x5        bed_low
x6    good_medium
x7        bed_low
x9    good_medium
dtype: object


In [51]:
# Keep the combined "<new1>_<group_new>" label as its own column.
df['group_new2'] = df['new1'].add("_").add(df['group_new'])
df

Unnamed: 0,group,score,age,new1,group_new,group_new2
x0,A,10,27,good,medium,good_medium
x1,B,9,28,bed,low,bed_low
x2,A,10,30,good,medium,good_medium
x3,A,20,36,good,high,good_high
x4,B,7,24,bed,low,bed_low
x5,C,8,27,bed,low,bed_low
x6,C,10,33,good,medium,good_medium
x7,B,6,25,bed,low,bed_low
x9,C,14,23,good,medium,good_medium


In [52]:
# Numeric columns add element-wise: total = score + age for every row.
df['total'] = df['score'].add(df['age'])
df

Unnamed: 0,group,score,age,new1,group_new,group_new2,total
x0,A,10,27,good,medium,good_medium,37
x1,B,9,28,bed,low,bed_low,37
x2,A,10,30,good,medium,good_medium,40
x3,A,20,36,good,high,good_high,56
x4,B,7,24,bed,low,bed_low,31
x5,C,8,27,bed,low,bed_low,35
x6,C,10,33,good,medium,good_medium,43
x7,B,6,25,bed,low,bed_low,31
x9,C,14,23,good,medium,good_medium,37


# 특정 열 선택 기타 함수

In [53]:
# Label each age with a band: under 20, under 30, or everything else.
# The conditions are checked in that order.
df['group_age'] = df['age'].map(
    lambda age: '청소년' if age < 20 else ('20대' if age < 30 else '30대 이상')
)
df

Unnamed: 0,group,score,age,new1,group_new,group_new2,total,group_age
x0,A,10,27,good,medium,good_medium,37,20대
x1,B,9,28,bed,low,bed_low,37,20대
x2,A,10,30,good,medium,good_medium,40,30대 이상
x3,A,20,36,good,high,good_high,56,30대 이상
x4,B,7,24,bed,low,bed_low,31,20대
x5,C,8,27,bed,low,bed_low,35,20대
x6,C,10,33,good,medium,good_medium,43,30대 이상
x7,B,6,25,bed,low,bed_low,31,20대
x9,C,14,23,good,medium,good_medium,37,20대


- 참고: difference()
- 특정 컬럼 제외 후 선택

In [55]:
# Column labels with 'group' removed. Note that the result comes back sorted
# alphabetically rather than in the frame's original column order.
df.columns.difference(['group'])

Index(['age', 'group_age', 'group_new', 'group_new2', 'new1', 'score',
       'total'],
      dtype='object')

In [56]:
# Select every column except 'group'; the columns follow the order returned by difference().
df.loc[:, df.columns.difference(['group'])]

Unnamed: 0,age,group_age,group_new,group_new2,new1,score,total
x0,27,20대,medium,good_medium,good,10,37
x1,28,20대,low,bed_low,bed,9,37
x2,30,30대 이상,medium,good_medium,good,10,40
x3,36,30대 이상,high,good_high,good,20,56
x4,24,20대,low,bed_low,bed,7,31
x5,27,20대,low,bed_low,bed,8,35
x6,33,30대 이상,medium,good_medium,good,10,43
x7,25,20대,low,bed_low,bed,6,31
x9,23,20대,medium,good_medium,good,14,37


In [57]:
# Several columns can be excluded at once by listing all of their labels.
df.loc[:, df.columns.difference(['group', 'score'])]

Unnamed: 0,age,group_age,group_new,group_new2,new1,total
x0,27,20대,medium,good_medium,good,37
x1,28,20대,low,bed_low,bed,37
x2,30,30대 이상,medium,good_medium,good,40
x3,36,30대 이상,high,good_high,good,56
x4,24,20대,low,bed_low,bed,31
x5,27,20대,low,bed_low,bed,35
x6,33,30대 이상,medium,good_medium,good,43
x7,25,20대,low,bed_low,bed,31
x9,23,20대,medium,good_medium,good,37


- 특정 이름이 포함된 컬럼 제외 후 선택
- str.contains()

In [58]:
# Preview the first three rows to recall the current columns.
df.iloc[:3]

Unnamed: 0,group,score,age,new1,group_new,group_new2,total,group_age
x0,A,10,27,good,medium,good_medium,37,20대
x1,B,9,28,bed,low,bed_low,37,20대
x2,A,10,30,good,medium,good_medium,40,30대 이상


In [59]:
# Current column labels; the cells below match against these names.
df.columns

Index(['group', 'score', 'age', 'new1', 'group_new', 'group_new2', 'total',
       'group_age'],
      dtype='object')

In [61]:
# One boolean per column label: True where the label contains 'group'.
print(df.columns.str.contains('group'))

[ True False False False  True  True False  True]


In [63]:
# Invert the mask with ~ (element-wise NOT) instead of comparing to False:
# True where the column label does NOT contain 'group'.
print(~df.columns.str.contains(pat='group'))

[False  True  True  True False False  True False]


In [64]:
# Index the column labels with the inverted mask (~ instead of `== False`)
# to keep only the labels that do not contain 'group'.
df.columns[~df.columns.str.contains(pat='group')]

Index(['score', 'age', 'new1', 'total'], dtype='object')

In [65]:
# Select the columns whose label does not contain 'group'.
# The boolean mask is inverted with ~ (instead of `== False`) and applied
# directly to the column axis through .loc.
df.loc[:, ~df.columns.str.contains(pat='group')]

Unnamed: 0,score,age,new1,total
x0,10,27,good,37
x1,9,28,bed,37
x2,10,30,good,40
x3,20,36,good,56
x4,7,24,bed,31
x5,8,27,bed,35
x6,10,33,good,43
x7,6,25,bed,31
x9,14,23,good,37


- 특정 이름이 포함된 컬럼 선택

In [67]:
# Select the columns whose label contains 'group', applying the mask to the column axis.
df.loc[:, df.columns.str.contains(pat='group')]

Unnamed: 0,group,group_new,group_new2,group_age
x0,A,medium,good_medium,20대
x1,B,low,bed_low,20대
x2,A,medium,good_medium,30대 이상
x3,A,high,good_high,30대 이상
x4,B,low,bed_low,20대
x5,C,low,bed_low,20대
x6,C,medium,good_medium,30대 이상
x7,B,low,bed_low,20대
x9,C,medium,good_medium,20대


In [68]:
# The pattern is a regular expression, so '|' means "either":
# columns whose label contains 'group' or 'age'.
df.loc[:, df.columns.str.contains(pat='group|age')]

Unnamed: 0,group,age,group_new,group_new2,group_age
x0,A,27,medium,good_medium,20대
x1,B,28,low,bed_low,20대
x2,A,30,medium,good_medium,30대 이상
x3,A,36,high,good_high,30대 이상
x4,B,24,low,bed_low,20대
x5,C,27,low,bed_low,20대
x6,C,33,medium,good_medium,30대 이상
x7,B,25,low,bed_low,20대
x9,C,23,medium,good_medium,20대
