In [1]:
import numpy as np
from scipy.stats import *
import pandas as pd

#### 분산 계산

In [1]:
x = [1, 2, 3, 4, 5]
print(np.var(x, ddof = 1)) # 분모 = n - 1 ( 5 - 1)
print(np.array(x).var()) # 분모 = n
print(pd.Series(x).var(ddof = 0)) # 분모 = n

2.5
2.0
2.0


#### 표준편차 계산

In [3]:
x = [1, 2, 3, 4, 5]
print(np.std(x, ddof = 1))
print(np.array(x).std())
print(pd.Series(x).std(ddof = 1))

1.5811388300841898
1.4142135623730951
1.5811388300841898


#### 변동계수의 필요성

- 스케일에 따라, 분산과 표준편차가 달라진다. 
- 변동계수 = 표준편차 / 평균
- `variation()` command

In [4]:
x1 = np.array([1, 2, 3, 4, 5])
x2 = x1 * 10

print(np.std(x1, ddof = 1))
print(np.std(x2, ddof = 1))

1.5811388300841898
15.811388300841896


In [5]:
print(variation(x1)) # 변동 계수
print(variation(x2))

0.47140452079103173
0.4714045207910317


In [6]:
print(np.std(x1, ddof = 1) / np.mean(x1))
print(np.std(x2, ddof = 1) / np.mean(x2))

0.5270462766947299
0.5270462766947299


#### 스케일링

In [7]:
x1

array([1, 2, 3, 4, 5])

In [8]:
x2

array([10, 20, 30, 40, 50])

##### standardize

In [10]:
z1 = (x1 - x1.mean()) / x1.std()
z2 = (x2 - x2.mean()) / x2.std()

print(z1)
print(z2)

[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]
[-1.41421356 -0.70710678  0.          0.70710678  1.41421356]


##### minmax scaling

In [11]:
z1 = (x1 - x1.min()) / (x1.max() - x1.min())
z2 = (x2 - x2.min()) / (x2.max() - x2.min())

print(z1)
print(z2)

[0.   0.25 0.5  0.75 1.  ]
[0.   0.25 0.5  0.75 1.  ]


In [2]:
# sklearn을 이용한 스케일링
X = pd.DataFrame({"X1":[1, 2, 3, 4, 5],
    "X2": [10, 20, 30, 40, 50]})

X

Unnamed: 0,X1,X2
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50


scaling by column

In [3]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() # 인스턴스화
Z = scaler.fit_transform(X) # fit_transform => ndarray
pd.DataFrame(Z)

Unnamed: 0,0,1
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,0.75,0.75
4,1.0,1.0


#### 범위와 사분위 범위 계산 

- IQR(interquartile range) : Q3 - Q1

In [14]:
x = np.random.normal(100, 20, size = 1000)
x

array([ 70.96459463, 101.01216801, 113.47455271,  85.19206086,
       107.83666542, 106.21259467,  91.73459433, 114.22671722,
       100.57812341, 128.36490531,  80.09741638, 135.92623035,
       115.32291185, 122.61493258,  73.1540519 , 159.18452585,
        90.95070299, 107.74752861,  85.57363019, 128.52772935,
       130.45628863, 108.87458366, 135.60638303, 101.30697981,
       118.31910967, 106.85727853,  80.36190618, 117.37992076,
        57.53314358, 142.22889463, 101.71350713,  96.78461184,
       107.25224602, 108.75099507,  89.39275047, 102.5020707 ,
       113.60448258, 125.05971259,  92.95119655, 119.97252069,
        77.60940574,  89.41444237, 120.57865792, 101.50274537,
        81.15994575,  81.29100162,  76.99149417,  75.06221369,
       125.4805894 ,  93.73368021, 109.70593352,  72.1738864 ,
       110.8150474 , 138.43847726,  80.93031071, 122.72270359,
        95.78941479,  89.98753779,  86.61935052,  62.23953215,
       101.16322715, 108.23501516, 118.85070362,  76.43

##### range

In [15]:
print(np.ptp(x))
print(np.max(x) - np.min(x))

134.0708987286601
134.0708987286601


##### IQR

In [17]:
print(np.quantile(x, 0.75) - np.quantile(x, 0.25))
print(iqr(x))

27.182785510295744
27.182785510295744
