# 데이터 정규화

데이터를 특정 범위나 척도로 변환하여 처리하거나 분석할 때 사용되는 기술

데이터 정규화의 목표는 서로 다른 단위나 범위를 가진 데이터를 동일한 기준으로 맞춤으로써, 데이터 분석이나 머신러닝 모델의 성능을 향상시키는 것

## #01. 준비과정

### [1] 패키지 참조

`scikit-learn` 패키지의 설치가 필요하다.

`scikit-learn`의 `preprocessing` 패키지 안에는 머신러닝에 적합한 데이터 전처리 기능을 제공하는 각종 클래스가 포함되어 있다.

In [2]:
from pandas import read_excel, DataFrame
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

### [2] 샘플 데이터 가져오기

어느 간호학과 대학원에 지원한 학생들에 대한 합격/불합격 여부를 조사한 가상의 데이터이다.

In [3]:
origin = read_excel('https://data.hossam.kr/pydata/gradeuate.xlsx')
origin

Unnamed: 0,합격여부,필기점수,학부성적,병원경력
0,0,380,3.61,3
1,1,660,3.67,3
2,1,800,4.00,1
3,1,640,3.19,4
4,0,520,2.93,4
...,...,...,...,...
395,0,620,4.00,2
396,0,560,3.04,3
397,0,460,2.63,2
398,0,700,3.65,2


## #02. Min-Max Scaler (Normalization, 정규화)

모든 데이터의 범위를 `0~1`로 변환하는 것.

데이터에서 최소값을 0으로, 최대값을 1로 매핑

$정규화된 값 = (X - Xmin) / (Xmax - Xmin)$

이 방법은 데이터의 분포를 유지하면서 데이터를 특정 범위로 축소시키는 데에 유용

### [1] 직접 계산하기

In [4]:
df = origin.copy()

xmin = df['필기점수'].min()
xmax = df['필기점수'].max()

df['필기점수_MinMax(1)'] = (df['필기점수'] - xmin) / (xmax - xmin)
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력,필기점수_MinMax(1)
0,0,380,3.61,3,0.275862
1,1,660,3.67,3,0.758621
2,1,800,4.00,1,1.000000
3,1,640,3.19,4,0.724138
4,0,520,2.93,4,0.517241
...,...,...,...,...,...
395,0,620,4.00,2,0.689655
396,0,560,3.04,3,0.586207
397,0,460,2.63,2,0.413793
398,0,700,3.65,2,0.827586


### [2] sklearn 활용

`sklearn`은 데이터 표준화를 위한 `MinMaxScaler` 클래스를 제공한다.

이 클래스에 데이터를 전달할 때는 `n행1열`로 구성된 2차 배열 형식이거나 표준화를 처리할 필드만 갖는 데이터 프레임이어야 한다.

#### (1) 2차 배열로 처리하는 경우

##### 특정 필드의 값만 추출

`1차 배열`형태로 추출된다.

In [5]:
n = df['필기점수'].values
n

array([380, 660, 800, 640, 520, 760, 560, 400, 540, 700, 800, 440, 760,
       700, 700, 480, 780, 360, 800, 540, 500, 660, 600, 680, 760, 800,
       620, 520, 780, 520, 540, 760, 600, 800, 360, 400, 580, 520, 500,
       520, 560, 580, 600, 500, 700, 460, 580, 500, 440, 400, 640, 440,
       740, 680, 660, 740, 560, 380, 400, 600, 620, 560, 640, 680, 580,
       600, 740, 620, 580, 800, 640, 300, 480, 580, 720, 720, 560, 800,
       540, 620, 700, 620, 500, 380, 500, 520, 600, 600, 700, 660, 700,
       720, 800, 580, 660, 660, 640, 480, 700, 400, 340, 580, 380, 540,
       660, 740, 700, 480, 400, 480, 680, 420, 360, 600, 720, 620, 440,
       700, 800, 340, 520, 480, 520, 500, 720, 540, 600, 740, 540, 460,
       620, 640, 580, 500, 560, 500, 560, 700, 620, 600, 640, 700, 620,
       580, 580, 380, 480, 560, 480, 740, 800, 400, 640, 580, 620, 580,
       560, 480, 660, 700, 600, 640, 700, 520, 580, 700, 440, 720, 500,
       600, 400, 540, 680, 800, 500, 620, 520, 620, 620, 300, 62

##### 추출된 값의 차원 변환

In [6]:
n_re = n.reshape(-1, 1)
n_re

array([[380],
       [660],
       [800],
       [640],
       [520],
       [760],
       [560],
       [400],
       [540],
       [700],
       [800],
       [440],
       [760],
       [700],
       [700],
       [480],
       [780],
       [360],
       [800],
       [540],
       [500],
       [660],
       [600],
       [680],
       [760],
       [800],
       [620],
       [520],
       [780],
       [520],
       [540],
       [760],
       [600],
       [800],
       [360],
       [400],
       [580],
       [520],
       [500],
       [520],
       [560],
       [580],
       [600],
       [500],
       [700],
       [460],
       [580],
       [500],
       [440],
       [400],
       [640],
       [440],
       [740],
       [680],
       [660],
       [740],
       [560],
       [380],
       [400],
       [600],
       [620],
       [560],
       [640],
       [680],
       [580],
       [600],
       [740],
       [620],
       [580],
       [800],
       [640],
      

In [18]:
# 전치행렬

# transpose는 2차원에만 쓸 수 있다.

# n_re1 = DataFrame.t(n)
# n_re1

AttributeError: type object 'DataFrame' has no attribute 't'

In [19]:
# # 표준화 기능을 제공하는 객체를 생성
scaler = MinMaxScaler()

# 표준화를 적용할 필드를 scaler 객체에게 알려준다.
scaler.fit(n_re)

# 표준화 적용
df['필기점수_MinMax(2)'] = scaler.transform(n_re)

df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력,필기점수_MinMax(1),필기점수_MinMax(2)
0,0,380,3.61,3,0.275862,0.275862
1,1,660,3.67,3,0.758621,0.758621
2,1,800,4.00,1,1.000000,1.000000
3,1,640,3.19,4,0.724138,0.724138
4,0,520,2.93,4,0.517241,0.517241
...,...,...,...,...,...,...
395,0,620,4.00,2,0.689655,0.689655
396,0,560,3.04,3,0.586207,0.586207
397,0,460,2.63,2,0.413793,0.413793
398,0,700,3.65,2,0.827586,0.827586


#### (2) 표준화를 적용할 필드만 갖는 데이터 프레임을 사용하는 경우

##### 특정 필드만 추출

In [20]:
tmp = df.filter(['필기점수'])
tmp

Unnamed: 0,필기점수
0,380
1,660
2,800
3,640
4,520
...,...
395,620
396,560
397,460
398,700


##### 해당 필드에 대한 데이터 표준화

In [21]:
scaler = MinMaxScaler()
scaler.fit(tmp)
df['필기점수_MinMax(3)'] = scaler.transform(tmp)
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력,필기점수_MinMax(1),필기점수_MinMax(2),필기점수_MinMax(3)
0,0,380,3.61,3,0.275862,0.275862,0.275862
1,1,660,3.67,3,0.758621,0.758621,0.758621
2,1,800,4.00,1,1.000000,1.000000,1.000000
3,1,640,3.19,4,0.724138,0.724138,0.724138
4,0,520,2.93,4,0.517241,0.517241,0.517241
...,...,...,...,...,...,...,...
395,0,620,4.00,2,0.689655,0.689655,0.689655
396,0,560,3.04,3,0.586207,0.586207,0.586207
397,0,460,2.63,2,0.413793,0.413793,0.413793
398,0,700,3.65,2,0.827586,0.827586,0.827586


#### (3) 데이터 프레임 전체에 대한 처리

##### 표준화 처리

시리즈 객체로 반환되기 때문에 이 결과를 다시 데이터프레임으로 재구성해야 한다.

`scaler.fit(origin)`: MinMaxScaler를 데이터에 맞추기 위해 사용됩니다. 이는 각 특성(변수)에 대해 최소값과 최대값을 계산하는 등의 작업을 수행합니다.

`scaler.transform(origin)`: fit 단계에서 계산한 최소값과 최대값을 사용하여 데이터를 변환합니다. 각 변수에 대해 최소값을 0으로, 최대값을 1로 스케일링합니다.

In [22]:
scaler = MinMaxScaler()
scaler.fit(origin)
mms = scaler.transform(origin)
mms

array([[0.        , 0.27586207, 0.77586201, 0.66666667],
       [1.        , 0.75862069, 0.81034487, 0.66666667],
       [1.        , 1.        , 1.        , 0.        ],
       ...,
       [0.        , 0.4137931 , 0.21264375, 0.33333333],
       [0.        , 0.82758621, 0.79885063, 0.33333333],
       [0.        , 0.65517241, 0.93678167, 0.66666667]])

##### DataFrame 으로 재구성

In [23]:
minmax_df = DataFrame(mms, index=origin.index, columns=origin.columns)
minmax_df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력
0,0.0,0.275862,0.775862,0.666667
1,1.0,0.758621,0.810345,0.666667
2,1.0,1.000000,1.000000,0.000000
3,1.0,0.724138,0.534483,1.000000
4,0.0,0.517241,0.385058,1.000000
...,...,...,...,...
395,0.0,0.689655,1.000000,0.333333
396,0.0,0.586207,0.448276,0.666667
397,0.0,0.413793,0.212644,0.333333
398,0.0,0.827586,0.798851,0.333333


## #03. Standard Scaler (z-score, 표준화)

데이터를 평균이 `0`, 표준편차가 `1`인 표준정규분포를 따르도록 변환

$ 정규화된 값 = (X - 평균) / 표준편차 $

데이터를 정규분포에 근사시켜서 이상치에 덜 민감하게 만들어 줌

- 값들의 단위가 비슷하다면 MinMax
- 값들의 단위가 상이하다면 Standard
- 잘 모르겠으면 Standard

> 분류 문제에서는 종속변수가 범주형(0, 1)이므로 종속변수는 표준화를 적용하지 않는다.

### [1] 직접 계산

In [24]:
평균 = df['학부성적'].mean()
표준편차 = df['학부성적'].std()
df['학부성적_Standard(1)'] = (df['학부성적'] - 평균) / 표준편차
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력,필기점수_MinMax(1),필기점수_MinMax(2),필기점수_MinMax(3),학부성적_Standard(1)
0,0,380,3.61,3,0.275862,0.275862,0.275862,0.578348
1,1,660,3.67,3,0.758621,0.758621,0.758621,0.736008
2,1,800,4.00,1,1.000000,1.000000,1.000000,1.603135
3,1,640,3.19,4,0.724138,0.724138,0.724138,-0.525269
4,0,520,2.93,4,0.517241,0.517241,0.517241,-1.208461
...,...,...,...,...,...,...,...,...
395,0,620,4.00,2,0.689655,0.689655,0.689655,1.603135
396,0,560,3.04,3,0.586207,0.586207,0.586207,-0.919418
397,0,460,2.63,2,0.413793,0.413793,0.413793,-1.996758
398,0,700,3.65,2,0.827586,0.827586,0.827586,0.683455


### [2] sklearn 활용

In [25]:
n = df['학부성적'].values.reshape(-1, 1)
n

array([[3.6099999 ],
       [3.67000008],
       [4.        ],
       [3.19000006],
       [2.93000007],
       [3.        ],
       [2.98000002],
       [3.07999992],
       [3.3900001 ],
       [3.92000008],
       [4.        ],
       [3.22000003],
       [4.        ],
       [3.07999992],
       [4.        ],
       [3.44000006],
       [3.86999989],
       [2.55999994],
       [3.75      ],
       [3.80999994],
       [3.17000008],
       [3.63000011],
       [2.81999993],
       [3.19000006],
       [3.3499999 ],
       [3.66000009],
       [3.6099999 ],
       [3.74000001],
       [3.22000003],
       [3.28999996],
       [3.77999997],
       [3.3499999 ],
       [3.4000001 ],
       [4.        ],
       [3.1400001 ],
       [3.04999995],
       [3.25      ],
       [2.9000001 ],
       [3.13000011],
       [2.68000007],
       [2.42000008],
       [3.31999993],
       [3.1500001 ],
       [3.30999994],
       [2.94000006],
       [3.45000005],
       [3.46000004],
       [2.970

In [26]:
scaler = StandardScaler()
scaler.fit(n)
df['학부성적_Standard(2)'] = scaler.transform(n)
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력,필기점수_MinMax(1),필기점수_MinMax(2),필기점수_MinMax(3),학부성적_Standard(1),학부성적_Standard(2)
0,0,380,3.61,3,0.275862,0.275862,0.275862,0.578348,0.579072
1,1,660,3.67,3,0.758621,0.758621,0.758621,0.736008,0.736929
2,1,800,4.00,1,1.000000,1.000000,1.000000,1.603135,1.605143
3,1,640,3.19,4,0.724138,0.724138,0.724138,-0.525269,-0.525927
4,0,520,2.93,4,0.517241,0.517241,0.517241,-1.208461,-1.209974
...,...,...,...,...,...,...,...,...,...
395,0,620,4.00,2,0.689655,0.689655,0.689655,1.603135,1.605143
396,0,560,3.04,3,0.586207,0.586207,0.586207,-0.919418,-0.920570
397,0,460,2.63,2,0.413793,0.413793,0.413793,-1.996758,-1.999259
398,0,700,3.65,2,0.827586,0.827586,0.827586,0.683455,0.684310


## #04. RobustScaler

이상치가 존재할 경우 사용하는 방법.

이상치(outliers)에 영향을 최소화하여 데이터를 스케일링하는 방법

이상치가 포함된 데이터를 표준화(Standardization)하거나 정규화(Normalization)할 때, 이상치의 영향으로 전체 데이터의 분포가 왜곡됨

RobustScaler는 이 문제를 해결하기 위해 중앙값과 사분위수를 사용하여 데이터를 스케일링 함

$ 정규화된 값 = (X - median) / iqr $

$ iqr = Q3 - Q1 $

### [1] 직접계산

In [27]:
중앙값 = df['병원경력'].median()
iqr = df['병원경력'].quantile(0.75) - df['병원경력'].quantile(0.25)
df['병원경력_Robust(1)'] = (df['병원경력'] - 중앙값) / iqr
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력,필기점수_MinMax(1),필기점수_MinMax(2),필기점수_MinMax(3),학부성적_Standard(1),학부성적_Standard(2),병원경력_Robust(1)
0,0,380,3.61,3,0.275862,0.275862,0.275862,0.578348,0.579072,1.0
1,1,660,3.67,3,0.758621,0.758621,0.758621,0.736008,0.736929,1.0
2,1,800,4.00,1,1.000000,1.000000,1.000000,1.603135,1.605143,-1.0
3,1,640,3.19,4,0.724138,0.724138,0.724138,-0.525269,-0.525927,2.0
4,0,520,2.93,4,0.517241,0.517241,0.517241,-1.208461,-1.209974,2.0
...,...,...,...,...,...,...,...,...,...,...
395,0,620,4.00,2,0.689655,0.689655,0.689655,1.603135,1.605143,0.0
396,0,560,3.04,3,0.586207,0.586207,0.586207,-0.919418,-0.920570,1.0
397,0,460,2.63,2,0.413793,0.413793,0.413793,-1.996758,-1.999259,0.0
398,0,700,3.65,2,0.827586,0.827586,0.827586,0.683455,0.684310,0.0


### [2] sklearn 활용

In [30]:
n = df['병원경력'].values.reshape(-1, 1)
scaler = RobustScaler()
scaler.fit(n)
df['병원경력_Robust(2)'] = scaler.transform(n)
df

Unnamed: 0,합격여부,필기점수,학부성적,병원경력,필기점수_MinMax(1),필기점수_MinMax(2),필기점수_MinMax(3),학부성적_Standard(1),학부성적_Standard(2),병원경력_Robust(1),병원경력_Robust(2)
0,0,380,3.61,3,0.275862,0.275862,0.275862,0.578348,0.579072,1.0,1.0
1,1,660,3.67,3,0.758621,0.758621,0.758621,0.736008,0.736929,1.0,1.0
2,1,800,4.00,1,1.000000,1.000000,1.000000,1.603135,1.605143,-1.0,-1.0
3,1,640,3.19,4,0.724138,0.724138,0.724138,-0.525269,-0.525927,2.0,2.0
4,0,520,2.93,4,0.517241,0.517241,0.517241,-1.208461,-1.209974,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
395,0,620,4.00,2,0.689655,0.689655,0.689655,1.603135,1.605143,0.0,0.0
396,0,560,3.04,3,0.586207,0.586207,0.586207,-0.919418,-0.920570,1.0,1.0
397,0,460,2.63,2,0.413793,0.413793,0.413793,-1.996758,-1.999259,0.0,0.0
398,0,700,3.65,2,0.827586,0.827586,0.827586,0.683455,0.684310,0.0,0.0
