# Pandas III: 데이터 처리

# 1. 데이터 합치기 

- merge()
- join()
- concat()

### 1.1 DBMS 스타일로 DataFrame 합치기

In [2]:
from pandas import DataFrame, Series 
import pandas as pd 
import numpy as np

#### merge() 함수

In [None]:
# Left table: keys 'a' and 'b' repeat, and 'c' has no partner in the right table.
df1 = pd.DataFrame({
    'key': list('bbacaab'),
    'data1': range(7),
})
# Right table: one row per key; 'd' has no partner in the left table.
df2 = pd.DataFrame({
    'key': list('abd'),
    'data2': range(3),
})

In [None]:
df1

In [None]:
df2

In [None]:
#pd.merge(df1, df2)
pd.merge(df1, df2, on='key')

In [None]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})
                    
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

In [None]:
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

####  1) inner join
####  2) outer join
- left outer join
- right outer join
- full outer join

In [None]:
pd.merge(df1, df2, how='outer')

In [None]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})

df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                    'data2': range(5)})

In [None]:
df1

In [None]:
df2

In [None]:
pd.merge(df1, df2, on='key', how='left')

#### <참고> 데카르트 곱 ( Cartesian product )

In [None]:
pd.merge(df1, df2, how='inner')

In [None]:
left  = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})

right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})
                     

In [None]:
# 여러 key 지정
pd.merge(left, right, on=['key1', 'key2'], how='outer')

In [None]:
# 중복되는 컬럼 이름 자동 변경
pd.merge(left, right, on='key1')

In [None]:
# 중복되는 컬럼 이름 뒤에 붙일 문자열 지정
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

#### merge 함수 인자 목록
인자 | 설명
:---|:---
left | 머지하려는 DataFrame 중 왼쪽에 위치한 DataFrame
right | 머지하려는 DataFrame 중 오른쪽에 위치한 DataFrame
how | 조인 방법. 'inner', 'outer', 'left', 'right' 기본 값은 inner
on | 조인하려는 컬럼 이름. 반드시 두 DataFrame 객체 모두에 있는 이름이어야 한다.
left_on | 조인 키로 사용할 left DataFrame 컬럼
right_on | 조인 키로 사용할 right DataFrame 컬럼
left_index | 조인 키로 사용할 left DataFrame의 색인 로우 (다중 색인일 경우의 키)
right_index | 조인 키로 사용할 right DataFrame의 색인 로우 (다중 색인일 경우의 키)
suffixes | 중복되는 컬럼의 이름 뒤에 붙일 문자열 지정


### 1.2 색인으로 머지하기

In [None]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})

right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

In [None]:
left1

In [None]:
right1

In [None]:
pd.merge(left1, right1, left_on='key', right_index=True)

In [None]:
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

#### 다중 색인

In [None]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',
                               'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})

righth = pd.DataFrame(np.arange(12).reshape((6, 2)),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',
                              'Ohio', 'Ohio'],
                             [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns=['event1', 'event2'])

In [None]:
lefth

In [None]:
righth

In [None]:
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

In [None]:
pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')

In [None]:
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])

In [None]:
left2

In [None]:
right2

In [None]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

#### join() 함수: DataFrame 2개 이상 조인 가능

In [None]:
left2.join(right2, how='outer')

In [None]:
left1.join(right1, on='key')    # Default: how = 'left'

In [None]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])

In [None]:
another

In [None]:
left2.join([right2, another])

In [None]:
left2.join([right2, another], how='outer', sort=True)

### 1.3 축 따라 이어붙이기

#### concat()

In [None]:
arr = np.arange(12).reshape((3, 4))
arr

In [None]:
s1 = pd.Series([0, 1],    index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6],    index=['f', 'g'])

In [None]:
pd.concat([s1, s2, s3])

In [None]:
pd.concat([s1, s2, s3], axis=1, sort=False)

In [None]:
s4 = pd.concat([s1, s3])
s4

In [None]:
pd.concat([s1, s4], axis=1, sort=False)

In [None]:
# Default: 'outer'
pd.concat([s1, s4], axis=1, join='inner')

In [None]:
# Keep only the index labels of interest, in the given order.
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# outer-joined result selects the same rows (missing labels become NaN).
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])

#### 인덱스, 컬럼 명 설정 (계층적 색인 생성)

In [None]:
result = pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
result

In [None]:
result.unstack()

In [None]:
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'], sort=False)

#### DataFrame

In [None]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                   columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                   columns=['three', 'four'])

In [None]:
df1

In [None]:
df2

In [None]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], sort=False)

In [None]:
pd.concat({'level1': df1, 'level2': df2}, axis=1, sort=False)

In [None]:
pd.concat([df1, df2], axis=1, keys =['level1', 'level2'],
          names=['upper', 'lower'], sort=False)

#### 인덱스 삭제

In [None]:
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

In [None]:
df1

In [None]:
df2

In [None]:
pd.concat([df1, df2], ignore_index=True, sort=False)

### 1.4 겹치는 데이터 합치기

#### combine_first()

In [None]:
# Two float Series over the same string index; each has gaps the other can fill.
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])
# Blank out the last element *by position*. Plain `b[-1]` on a string-labelled
# Series relies on a deprecated positional fallback and would be treated as
# the label -1 once that fallback is removed, so use .iloc explicitly.
b.iloc[-1] = np.nan

In [None]:
a

In [None]:
b

In [None]:
np.where(pd.notnull(b), b, a)
#np.where(pd.notnull(a), b, a)

In [None]:
b.combine_first(a)
#b[:-2].combine_first(a[2:])

In [None]:
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})

In [None]:
df1

In [None]:
df2

In [None]:
df1.combine_first(df2)

---

# 2. 재형성과 피벗

### 2.1 계층적 색인으로 재형성하기

- stack(): 데이터의 컬럼을 로우로 피벗시킨다.
- unstack(): 로우를 컬럼으로 피벗시킨다.

In [None]:
data = pd.DataFrame(np.arange(6).reshape((2, 3)),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'],
                    name='number'))
data

In [None]:
result = data.stack()
result

In [None]:
result.unstack()

In [None]:
result.unstack(0)
result.unstack('state')

#### 누락된 데이터 처리

In [None]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2

In [None]:
data2.unstack()

In [None]:
data2.unstack()

In [None]:
data2.unstack().stack()

In [None]:
data2.unstack().stack(dropna=False)

In [None]:
df = pd.DataFrame({'left': result, 'right': result + 5},
                  columns=pd.Index(['left', 'right'], name='side'))
df

In [None]:
df.unstack('state')

In [None]:
df.unstack('state').stack('side')

### 2.2 데이터 나열 형식 변경

#### Pivoting “Long” to “Wide” Format

#### pivot()

In [None]:
data = pd.read_csv('data/macrodata.csv')
data.head()

In [None]:
# Build a PeriodIndex named 'date' from the year and quarter columns.
# NOTE(review): recent pandas releases deprecate the year=/quarter= keyword
# form in favour of PeriodIndex.from_fields — confirm against the installed version.
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')
# Keep only the three indicator columns and name the column axis 'item'.
columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')
data = data.reindex(columns=columns)
# Replace the row index with timestamps taken at the end of each period.
data.index = periods.to_timestamp('D', 'end')

In [None]:
ldata = data.stack().reset_index().rename(columns={0: 'value'})
ldata.head()

In [None]:
# Long -> wide: one row per date, one column per item, cells taken from 'value'.
# pivot() arguments are keyword-only in pandas 2.0+, so name them explicitly.
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head()

In [None]:
ldata['value2'] = np.random.randn(len(ldata))
ldata.head()

In [None]:
# With `values` omitted, every remaining column is pivoted, which yields
# hierarchical columns (value column name on top, item below).
# pivot() arguments are keyword-only in pandas 2.0+, so name them explicitly.
pivoted = ldata.pivot(index='date', columns='item')
pivoted.head()

In [None]:
pivoted['value'].head()

#### <비교>
- pivot()
- set_index().unstack()

In [None]:
unstacked = ldata.set_index(['date', 'item']).unstack('item')
unstacked.head()

#### Pivoting “Wide” to “Long” Format

#### melt()

In [None]:
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
                   'A': [1, 2, 3],
                   'B': [4, 5, 6],
                   'C': [7, 8, 9]})
df

In [None]:
melted = pd.melt(df, ['key'])
melted

In [None]:
# Undo the melt: pivot back to one row per key and one column per variable.
# pivot() arguments are keyword-only in pandas 2.0+, so name them explicitly.
reshaped = melted.pivot(index='key', columns='variable', values='value')
reshaped

In [None]:
reshaped.reset_index()

In [None]:
pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])

In [None]:
pd.melt(df, value_vars=['A', 'B', 'C'])

In [None]:
pd.melt(df, value_vars=['key', 'A', 'B'])

---

# 3. 데이터 변형

### 3.1 중복 제거하기

In [3]:
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [4]:
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [5]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [None]:
data['v1'] = range(7)
data

In [6]:
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
3,two,3


In [7]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2
1,one,1
2,one,2
4,two,3
6,two,4


### 3.2 함수나 매핑 이용해 데이터 변형하기

#### map(), apply()

In [8]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [9]:
# Lookup table from a (lower-case) meat name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

In [10]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [11]:
data['animal'] = lowercased.map(meat_to_animal)
#data['animal'] = lowercased.apply(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [12]:
data['food'].map(lambda x: meat_to_animal[x.lower()])
#data['food'].apply(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### 3.3 값 치환하기

#### replace()

In [None]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

In [None]:
data.replace(-999, np.nan)

In [None]:
data.replace([-999, -1000], np.nan)

In [None]:
data.replace([-999, -1000], [np.nan, 0])

In [None]:
data.replace({-999: np.nan, -1000: 0})

### 3.4 축 색인 이름 바꾸기

In [26]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [27]:
transform = lambda x: x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [28]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [29]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [31]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [32]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### 3.5 데이터 분류(빈도수 세기)

- cut()
- qcut()
- value_counts()

In [33]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [34]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [35]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [36]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [37]:
# Count how many values fall into each bin.
# The top-level pd.value_counts() is deprecated in recent pandas releases;
# wrapping the Categorical in a Series and calling .value_counts() is the
# supported spelling and keeps the same count-descending ordering.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [38]:
# 경계값 포함 여부
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [39]:
# 그룹 이름 설정
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [40]:
# 4개의 그룹으로 분류: 데이터의 최소값과 최대값을 기준으로 4등분
data = np.random.rand(20)
pd.cut(data, 4, precision=2)

[(0.022, 0.23], (0.022, 0.23], (0.44, 0.66], (0.66, 0.87], (0.022, 0.23], ..., (0.022, 0.23], (0.022, 0.23], (0.23, 0.44], (0.66, 0.87], (0.44, 0.66]]
Length: 20
Categories (4, interval[float64]): [(0.022, 0.23] < (0.23, 0.44] < (0.44, 0.66] < (0.66, 0.87]]

In [41]:
data = np.random.randn(1000)

In [42]:
# Split into 4 groups using sample quantiles as the bin edges (quartiles),
# rather than equal-width bins.
cats = pd.qcut(data, 4)  # Cut into quartiles
cats

[(0.698, 2.615], (-0.712, 0.00848], (0.698, 2.615], (0.00848, 0.698], (-2.8649999999999998, -0.712], ..., (-0.712, 0.00848], (0.698, 2.615], (-2.8649999999999998, -0.712], (-2.8649999999999998, -0.712], (-0.712, 0.00848]]
Length: 1000
Categories (4, interval[float64]): [(-2.8649999999999998, -0.712] < (-0.712, 0.00848] < (0.00848, 0.698] < (0.698, 2.615]]

In [43]:
# Count the members of each quantile bin. The top-level pd.value_counts() is
# deprecated in recent pandas releases; use Series.value_counts() instead.
pd.Series(cats).value_counts()

(0.698, 2.615]                   250
(0.00848, 0.698]                 250
(-0.712, 0.00848]                250
(-2.8649999999999998, -0.712]    250
dtype: int64

In [44]:
# 변위치 지정( 0 ~ 1 )
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(0.00848, 1.266], (-1.251, 0.00848], (0.00848, 1.266], (0.00848, 1.266], (-1.251, 0.00848], ..., (-1.251, 0.00848], (1.266, 2.615], (-2.8649999999999998, -1.251], (-1.251, 0.00848], (-1.251, 0.00848]]
Length: 1000
Categories (4, interval[float64]): [(-2.8649999999999998, -1.251] < (-1.251, 0.00848] < (0.00848, 1.266] < (1.266, 2.615]]

### 3.6 이상치(Outliers) 제거

In [57]:
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.024644,-0.028358,0.010902,0.026
std,0.987297,1.00315,1.002112,0.974601
min,-2.759573,-3.328307,-3.823943,-3.332879
25%,-0.679461,-0.688199,-0.660337,-0.610975
50%,0.058974,-0.017124,0.032153,0.020415
75%,0.706663,0.656254,0.658004,0.710771
max,3.176884,3.376695,3.557327,2.705448


In [58]:
col = data[2]
col[np.abs(col) > 3]

92     3.557327
356   -3.823943
384    3.261975
387    3.182531
Name: 2, dtype: float64

In [61]:
# Select every row containing at least one value whose magnitude exceeds 3.
# `axis` must be passed by keyword: DataFrame.any() takes keyword-only
# arguments in pandas 2.0+, so the positional form `.any(1)` raises there.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
92,1.062177,-0.71836,3.557327,-0.876048
356,-0.078953,0.358258,-3.823943,-0.861391
360,-0.357109,-3.328307,-2.425898,0.069372
384,0.517983,0.936391,3.261975,1.136164
387,-0.642692,0.413747,3.182531,-0.325896
802,-1.216551,3.376695,-0.775481,-0.938146
879,3.176884,-0.521256,0.421837,1.249835
882,1.294153,0.834388,0.233127,-3.332879


In [48]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.068702,0.016267,0.056671,0.009507
std,1.001996,1.020993,0.999159,0.990041
min,-3.0,-2.882778,-3.0,-3.0
25%,-0.607227,-0.704601,-0.609302,-0.668309
50%,0.109952,0.020047,0.014855,0.032033
75%,0.746505,0.740368,0.712225,0.699164
max,2.864879,2.923578,3.0,3.0


In [49]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,-1.0
1,1.0,-1.0,-1.0,1.0
2,-1.0,1.0,1.0,1.0
3,-1.0,1.0,-1.0,-1.0
4,1.0,1.0,-1.0,1.0


### 3.7 순열(Permutation)과 임의 샘플링

In [50]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler

array([3, 4, 1, 0, 2])

In [51]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [52]:
df.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
4,16,17,18,19
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11


In [53]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
0,0,1,2,3


In [54]:
choices = pd.Series([5, 7, -1, 6, 4])
choices

0    5
1    7
2   -1
3    6
4    4
dtype: int64

In [55]:
draws = choices.sample(n=10, replace=True)
draws

2   -1
4    4
2   -1
1    7
0    5
4    4
4    4
1    7
4    4
1    7
dtype: int64

### 3.8 더미 변수 ( One-Hot-Encoding )

In [65]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [66]:
# One indicator column per distinct key value.
# dtype=int keeps the 0/1 encoding; pandas 2.0+ otherwise returns
# boolean (True/False) columns.
pd.get_dummies(df['key'], dtype=int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [64]:
# Indicator columns named key_<value>. dtype=int keeps the 0/1 encoding;
# pandas 2.0+ otherwise returns boolean columns.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)
# Attach the indicators to the remaining data column (aligned on the row index).
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [None]:
np.random.seed(12345)
values = np.random.rand(10)
values
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

---

# 4. 문자열 다루기

### 4.1 문자열 객체 메서드

In [67]:
val = 'a,b,  guido'
val.split(',')

['a', 'b', '  guido']

In [68]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [69]:
first, second, third = pieces
first + '::' + second + '::' + third

'a::b::guido'

In [70]:
'::'.join(pieces)

'a::b::guido'

In [None]:
'guido' in val

In [None]:
val.index(',')

In [None]:
# 문자열 찾지 못하면 -1 반환
val.find(':')

In [None]:
# 문자열 찾지 못하면 예외 발생
val.index(':')

In [None]:
val.count(',')

In [None]:
val.replace(',', '::')

In [None]:
val.replace(',', '')

### 4.2 정규표현식

In [71]:
import re

In [72]:
# Split on runs of whitespace (spaces and tabs mixed).
text = "foo    bar\t baz  \tqux"
# Raw string: '\s' in a normal string literal is an invalid escape sequence
# and triggers a DeprecationWarning/SyntaxWarning on current Python versions.
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [73]:
# Compile the whitespace pattern once so it can be reused.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
regex = re.compile(r'\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [None]:
regex.findall(text)

In [None]:
# Sample text: one "name address" pair per line.
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com\n"
)
# E-mail shape: local part, '@', domain, '.', then a 2-4 letter suffix.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# The character classes list upper-case letters only, so the pattern is
# compiled with re.IGNORECASE to match lower-case addresses as well.
regex = re.compile(pattern, re.IGNORECASE)

In [None]:
# findall(): 매칭되는 모든 문자열을 찾아준다.
regex.findall(text)

In [None]:
# search(): 첫 번째 매칭되는 문자열(위치)만 찾아준다.

m = regex.search(text)
m
text[m.start():m.end()]

In [None]:
# match(): 문자열의 시작부분에서 일치하는 것만 찾아준다.
print(regex.match(text))

In [None]:
# sub(): 찾은 패턴을 주어진 문자열로 치환
print(regex.sub('REDACTED', text))

In [None]:
# 각 패턴을 그룹으로 설정
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [None]:
m = regex.match('wesm@bright.net')
m.groups()

In [None]:
# findall(): 그룹이 존재하면 튜플로 반환
regex.findall(text)

In [None]:
# sub(): 각 패턴의 그룹에 접근( 기호: \1, \2 )
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

### 4.3 pandas의 벡터화된 문자열

#### Series의 각 요소에 순차적으로 적용

In [None]:
data = {'1': 'a,b,c,d', '2': 'a b c d',
         '3': '  a_b_c_d  '}
sr = pd.Series(data)
sr

In [None]:
# 문자열을 담고 있는지 검사
sr.str.contains('a')

In [None]:
sr.str[:5]

In [None]:
sr.str.split(',')

In [None]:
sr.str.strip()

In [None]:
sr.str.replace('a', 'kk')

In [None]:
sr.str.len()

In [None]:
#sr.str.lower()
sr.str.upper()

In [None]:
sr.str.join('=')

#### pandas 문자열의 정규표현식

In [None]:
data1 = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
sr1 = pd.Series(data1)
sr1

In [None]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

In [None]:
sr1.str.findall(pattern, flags=re.IGNORECASE)

---

In [None]:
# end of file