In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler

### Scaler

* ~~StandardScaler~~
    - 각 특성의 평균을 0, 분산을 1로 표준화하는 것 (분포의 모양 자체를 정규분포로 바꾸는 것은 아님)
    - 이상치에 민감
    - 회귀보다 분류에 유용
* MinMaxScaler
    - 가장 작은 값은 0, 가장 큰 값은 1로 변환되므로, 모든 특성들은 0 ~ 1 범위를 갖는다
    - 이상치에 민감
    - 분류보다 회귀에 유용
* ~~RobustScaler~~
    - 평균과 분산 대신에 중앙값(median)과 사분위 범위(IQR) 사용
    - 이상치 영향을 최소화
* ~~MaxAbsScaler~~
    - 각 특성의 절대값이 0과 1사이가 되도록 스케일링
    - 모든 값이 -1 ~ 1 사이로 표현되며, 데이터가 양수만 있을 경우 MinMaxScaler 와 유사하게 동작 (최솟값이 0일 때만 결과가 동일)
    - 이상치에 민감

In [12]:
# Read the input CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/완성data/최종사용data/oversampled_data.csv')

In [13]:
# Remove pandas' cap on displayed sequence items so the full column Index is
# printed instead of being elided with "...".
pd.options.display.max_seq_items = None
df.columns

Index(['벤처확인 유형', '기업성장단계', '제조 및 비제조', '대표이사_창업자여부', '창업 당시 창업자 연령',
       '창업 당시 창업자 최종학력', '창업 직전 근무지', '창업방식', '단독창업 여부', '창업 당시 목표시장',
       '창업자 과거창업 경험 여부', '융자(정책)_경험여부', '장기적성장전망성', '자체브랜드보유여부', '영업및마케팅방식',
       '전공과 업종 일치 여부', '연구개발관련전담부서보유여부', '엔젤투자자 혹은 액셀러레이터 투자 경험여부',
       '해외수출 및 진출과정 애로사항', '제품및서비스구조', '기술력수준비교(세계)', '기술력수준비교(국내)',
       '주력제품 국내 시장점유율(퍼센트)', '주력제품 해외 시장점유율(퍼센트)', '정규직비율', '학력별 인력구성비(고졸이하)',
       '학력별 인력구성비(전문대졸)', '학력별 인력구성비(대졸)', '학력별 인력구성비(석사)', '학력별 인력구성비(박사)',
       '부서별_인력구성(관리)', '부서별_인력구성(생산)', '부서별_인력구성(영업)', '부서별_인력구성(RnD)',
       '부서별_인력구성(기타)', '매출구조비율(대기업)', '매출구조비율(12차벤더)', '매출구조비율(중소기업)',
       '매출구조비율(B2C)', '매출구조비율(B2G)', '매출구조비율(글로벌)', '투자액비(국내설비)', '투자액비(RnD)',
       '투자액비(해외)', '투자액비(기타)', '부채비율', '자기자본비율', '순이익률', '총자산회전율', '자기자본회전율',
       'ROE', '영업이익률', '이자보상배율', '종사자생산성', '창업 당시 창업자 실무경험 년수(년)',
       '벤처확인제도 혜택(평균)', '벤처인프라 수준(평균)', '등록보유(합계)', '출원중(합계)', '경영상 애로사항(개수)',
       '벤처캐피털(투자조합포함) 투자 경험여부'],
      dtype='object')

#### 비율형 변수만 분리

In [14]:
# Check the column's position: this label's positional index is used as the
# start of the column slice taken further down.
column_name = '주력제품 국내 시장점유율(퍼센트)'
column_index = list(df.columns).index(column_name)
column_index

22

In [15]:
# Positional index of the last column of the block; the slice further down
# uses this value + 1 as its exclusive stop.
column_name = '경영상 애로사항(개수)'
column_index = list(df.columns).index(column_name)
column_index

59

In [16]:
# Labels of the columns at positions 22..59 (the stop, 60, is exclusive).
df.columns[22:60]

Index(['주력제품 국내 시장점유율(퍼센트)', '주력제품 해외 시장점유율(퍼센트)', '정규직비율', '학력별 인력구성비(고졸이하)',
       '학력별 인력구성비(전문대졸)', '학력별 인력구성비(대졸)', '학력별 인력구성비(석사)', '학력별 인력구성비(박사)',
       '부서별_인력구성(관리)', '부서별_인력구성(생산)', '부서별_인력구성(영업)', '부서별_인력구성(RnD)',
       '부서별_인력구성(기타)', '매출구조비율(대기업)', '매출구조비율(12차벤더)', '매출구조비율(중소기업)',
       '매출구조비율(B2C)', '매출구조비율(B2G)', '매출구조비율(글로벌)', '투자액비(국내설비)', '투자액비(RnD)',
       '투자액비(해외)', '투자액비(기타)', '부채비율', '자기자본비율', '순이익률', '총자산회전율', '자기자본회전율',
       'ROE', '영업이익률', '이자보상배율', '종사자생산성', '창업 당시 창업자 실무경험 년수(년)',
       '벤처확인제도 혜택(평균)', '벤처인프라 수준(평균)', '등록보유(합계)', '출원중(합계)', '경영상 애로사항(개수)'],
      dtype='object')

In [17]:
# Extract the contiguous block of columns that will be scaled.
# The block is addressed by its first and last column labels instead of the
# hard-coded positions 22:60, so it keeps selecting the same columns even if
# columns are added or removed in front of it. `.loc` label slices include
# both endpoints, so this covers the same columns as positions 22..59.
# `.copy()` makes `tran_df` independent of `df`, which is overwritten in place
# further down; without it `tran_df` may alias the frame being modified.
tran_df = df.loc[:, '주력제품 국내 시장점유율(퍼센트)':'경영상 애로사항(개수)'].copy()

#### 분리한 데이터 Scaling

In [18]:
# MinMaxScaler: rescale these columns into a range comparable to the 0/1
# values of the encoded categorical variables.
mm = MinMaxScaler()
# fit_transform returns a bare array, so the labels are re-attached here.
# Both the column labels and the row index of `tran_df` are carried over so
# the scaled frame stays aligned with its source rows.
mm_tran_df = pd.DataFrame(
    mm.fit_transform(tran_df),
    columns=tran_df.columns,
    index=tran_df.index,
)

#### 원본 데이터를 Scaling 된 데이터로 치환

In [19]:
# Replace the original columns with their scaled values.
# Target columns are chosen by the labels carried on `mm_tran_df` rather than
# by repeating the positional slice 22:60, so the write cannot land on the
# wrong columns if positions shift. Whole-column assignment swaps each column
# out instead of writing into the existing storage, so a column that was not
# float-typed simply takes the dtype of the scaled data.
# `.to_numpy()` keeps row matching positional, as the original write did.
df[mm_tran_df.columns.tolist()] = mm_tran_df.to_numpy()

In [20]:
# Display the columns at positions 22..59 of `df` to inspect the values
# written back above.
df.iloc[:, slice(22, 60)]

Unnamed: 0,주력제품 국내 시장점유율(퍼센트),주력제품 해외 시장점유율(퍼센트),정규직비율,학력별 인력구성비(고졸이하),학력별 인력구성비(전문대졸),학력별 인력구성비(대졸),학력별 인력구성비(석사),학력별 인력구성비(박사),부서별_인력구성(관리),부서별_인력구성(생산),...,ROE,영업이익률,이자보상배율,종사자생산성,창업 당시 창업자 실무경험 년수(년),벤처확인제도 혜택(평균),벤처인프라 수준(평균),등록보유(합계),출원중(합계),경영상 애로사항(개수)
0,0.010000,0.000000,1.000000,0.000000,0.133333,0.466667,0.333333,0.066667,0.333330,0.000000,...,0.740643,0.000000,0.197529,0.000066,0.080000,0.416750,0.437500,0.002055,0.000000,0.000000
1,0.010000,0.000000,1.000000,0.600000,0.200000,0.200000,0.000000,0.000000,0.400000,0.000000,...,0.623444,0.949028,0.201182,0.287093,0.200000,0.708250,0.562500,0.005139,0.003584,0.466667
2,0.100000,0.000000,1.000000,0.266667,0.533333,0.133333,0.066667,0.000000,0.266670,0.000000,...,0.618330,0.952751,0.200413,0.057571,0.140000,0.791750,0.562500,0.017472,0.000000,0.600000
3,0.300000,0.050000,1.000000,0.350000,0.100000,0.200000,0.350000,0.000000,0.230770,0.266270,...,0.611608,0.946372,0.198994,0.027605,0.500000,0.958250,0.750000,0.040082,0.000000,0.133333
4,0.300000,0.000000,1.000000,0.300000,0.220000,0.200000,0.280000,0.000000,0.222730,0.281820,...,0.610555,0.941655,0.198357,0.029073,0.500000,0.916750,0.687500,0.002055,0.000000,0.266667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17195,0.424455,0.077336,0.995438,0.034912,0.177066,0.638390,0.124694,0.024939,0.342275,0.368292,...,0.603763,0.907246,0.180521,0.024546,0.000000,0.468523,0.484261,0.005139,0.000000,0.066667
17196,0.052224,0.000098,1.000000,0.239954,0.278206,0.460249,0.020818,0.000773,0.528846,0.240259,...,0.617941,0.952328,0.199532,0.056497,0.360000,0.706826,0.439638,0.006166,0.000000,0.266667
17197,0.112294,0.016888,1.000000,0.016844,0.148670,0.801330,0.033156,0.000000,0.091709,0.433133,...,0.619048,0.946846,0.199731,0.071216,0.112624,0.508074,0.449541,0.001028,0.000000,0.400000
17198,0.377663,0.081631,1.000000,0.317472,0.167377,0.480588,0.027281,0.007281,0.430578,0.288449,...,0.609463,0.946281,0.199301,0.117957,0.320382,0.662985,0.591018,0.008222,0.000000,0.400000


In [21]:
# Export is intentionally disabled: uncomment the line below to write `df`
# to 'mmscaled_data.csv' in the current working directory (UTF-8, no index column).
# df.to_csv('mmscaled_data.csv', index=False, encoding='utf-8')