In [1]:
import numpy as np
import pandas as pd

### 1) 예시 DataFrame 생성

In [2]:
df = pd.DataFrame(columns = ['a', 'b'])

df['a'] = [1, 2, 3, 4, 5, 6 ]
df['b'] = ['FF CC', "AABB", "DD AA", "AA BB", "AA", "DD" ]

df

Unnamed: 0,a,b
0,1,FF CC
1,2,AABB
2,3,DD AA
3,4,AA BB
4,5,AA
5,6,DD


### 2) 공백(' ') 기준으로 분할 후 고유 값으로 갈 열을 생성
- 하지만 공백으로 분류가 되지 않는 경우에는 분할 되지 않음.

In [3]:
cs_iter = (set(x.split(' ')) for x in df.b)
cs = sorted(set.union(*cs_iter))

dummies = pd.DataFrame(np.zeros((len(df), len(cs))), columns = cs)

dummies


Unnamed: 0,AA,AABB,BB,CC,DD,FF
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0


###  3) DataFrame과 병합 후 'b'열에 해당 문자가 있는 경우 해당 열에 1 할당

In [4]:
for i, gen in enumerate(df.b):
    dummies.ix[i, gen.split(' ')] = 1
    
df = df.join(dummies)

df

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,a,b,AA,AABB,BB,CC,DD,FF
0,1,FF CC,0.0,0.0,0.0,1.0,0.0,1.0
1,2,AABB,0.0,1.0,0.0,0.0,0.0,0.0
2,3,DD AA,1.0,0.0,0.0,0.0,1.0,0.0
3,4,AA BB,1.0,0.0,1.0,0.0,0.0,0.0
4,5,AA,1.0,0.0,0.0,0.0,0.0,0.0
5,6,DD,0.0,0.0,0.0,0.0,1.0,0.0


# 기준 list 생성 후 적용

### 1) 예시 DataFrame 생성

In [12]:
df2 = pd.DataFrame(columns = ['a', 'b'])

df2['a'] = [1, 2, 3, 4, 5, 6 ]
df2['b'] = ['FF CC', "AABB", "DD AA", "AA BB", "AA", "DD" ]

### 2) 기준 list 및 적용 함수 생성

In [6]:

ls = ['AA', 'BB', 'CC', 'DD', 'EE', 'FF']

token = 'AABBCCDDEEFF'

# 함수 생성

def validation(token):
    result = []
    for i in ls:
        if i in token:
            token = token.replace(i, '')
            result.append(i)
    return result

### 3) 위 함수 기준으로 분할 후 고유 값으로 갈 열을 생성

In [8]:
cs_iter = (set(validation(token)) for x in df.b)
cs = sorted(set.union(*cs_iter))

dummies = pd.DataFrame(np.zeros((len(df), len(cs))), columns = cs)

dummies

Unnamed: 0,AA,BB,CC,DD,EE,FF
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0


### 4) DataFrame과 병합 후 'b'열에 해당 문자가 있는 경우 해당 열에 1 할당

In [11]:
for i, gen in enumerate(df2.b):
    dummies.ix[i, validation(gen)] = 1
    
df2 = df2.join(dummies)

df2

Unnamed: 0,a,b,AA,BB,CC,DD,EE,FF
0,1,FF CC,0.0,0.0,1.0,0.0,0.0,1.0
1,2,AABB,1.0,1.0,0.0,0.0,0.0,0.0
2,3,DD AA,1.0,0.0,0.0,1.0,0.0,0.0
3,4,AA BB,1.0,1.0,0.0,0.0,0.0,0.0
4,5,AA,1.0,0.0,0.0,0.0,0.0,0.0
5,6,DD,0.0,0.0,0.0,1.0,0.0,0.0
