# 중복/누락 데이터 처리

## 1. 누락 데이터 제거

In [2]:
import pandas as pd
df=pd.read_csv('datacleaning.csv')

In [7]:
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,117.0,145,479.0
1,60,103.0,135,340.0
2,45,109.0,175,282.4
3,45,,148,406.0
4,60,100.0,120,250.7
5,450,104.0,134,253.3
6,30,109.0,133,195.1
7,60,,124,269.0
8,60,103.0,147,329.3
9,60,100.0,120,250.7


### 1-1. 누락 데이터 확인하기

In [8]:
# 열단위로 누락 데이터 확인하기
df.isnull().sum()

Duration    0
Pulse       3
Maxpulse    0
Calories    2
dtype: int64

### 1-2. 결측치가 있는 행 날리기

In [11]:
df2=df.dropna(axis=0) # 행단위로 제거
df2

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,117.0,145,479.0
1,60,103.0,135,340.0
2,45,109.0,175,282.4
4,60,100.0,120,250.7
5,450,104.0,134,253.3
6,30,109.0,133,195.1
8,60,103.0,147,329.3
9,60,100.0,120,250.7
10,60,100.0,120,250.7
11,60,98.0,120,215.2


### 1-3. 결측치가 있는 열 날리기

In [29]:
df2=df.dropna(axis=1) #열단위로 제거
df2

Unnamed: 0,Duration,Maxpulse
0,60,145
1,60,135
2,45,175
3,45,148
4,60,120
5,450,134
6,30,133
7,60,124
8,60,147
9,60,120


### 1-4. 결측치 메꾸기

In [30]:
df.fillna(130)

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,117.0,145,479.0
1,60,103.0,135,340.0
2,45,109.0,175,282.4
3,45,130.0,148,406.0
4,60,100.0,120,250.7
5,450,104.0,134,253.3
6,30,109.0,133,195.1
7,60,130.0,124,269.0
8,60,103.0,147,329.3
9,60,100.0,120,250.7


In [31]:
df['Calories'].fillna(130)

0     479.0
1     340.0
2     282.4
3     406.0
4     250.7
5     253.3
6     195.1
7     269.0
8     329.3
9     250.7
10    250.7
11    215.2
12    130.0
13    323.0
14    250.7
15    246.0
16    334.5
17    250.0
18    130.0
Name: Calories, dtype: float64

### 1-5. 평균값으로 결측치 채우기

In [32]:
m =df['Calories'].mean()
m

289.74117647058824

In [33]:
df['Calories'].fillna(m)

0     479.000000
1     340.000000
2     282.400000
3     406.000000
4     250.700000
5     253.300000
6     195.100000
7     269.000000
8     329.300000
9     250.700000
10    250.700000
11    215.200000
12    289.741176
13    323.000000
14    250.700000
15    246.000000
16    334.500000
17    250.000000
18    289.741176
Name: Calories, dtype: float64

### 1-6. 중복값 제거

In [34]:
df.drop_duplicates()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,117.0,145,479.0
1,60,103.0,135,340.0
2,45,109.0,175,282.4
3,45,,148,406.0
4,60,100.0,120,250.7
5,450,104.0,134,253.3
6,30,109.0,133,195.1
7,60,,124,269.0
8,60,103.0,147,329.3
11,60,98.0,120,215.2


In [35]:
# keep 패러미터를 통해 중복값중 처음값을 살릴지 마지막값을 살릴지 선택 가능
df.drop_duplicates(keep='last')

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,117.0,145,479.0
1,60,103.0,135,340.0
2,45,109.0,175,282.4
3,45,,148,406.0
5,450,104.0,134,253.3
6,30,109.0,133,195.1
7,60,,124,269.0
8,60,103.0,147,329.3
11,60,98.0,120,215.2
12,45,90.0,112,


### 1-7. 연쇄적 명령

In [36]:
# 여려 명령어를 다음과 같이 연쇄적으로 사용할 수 있음
df.dropna().drop_duplicates()
# df.dropna().drop_duplicates().describe()
# df.dropna().drop_duplicates().head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,117.0,145,479.0
1,60,103.0,135,340.0
2,45,109.0,175,282.4
4,60,100.0,120,250.7
5,450,104.0,134,253.3
6,30,109.0,133,195.1
8,60,103.0,147,329.3
11,60,98.0,120,215.2
13,60,103.0,123,323.0
16,60,102.0,126,334.5


## MinMax Scaling

In [41]:
df2=df.dropna(axis=0)

In [43]:
(df2.max()-df2)/(df2.max()-df2.min())

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,0.928571,0.0,0.545455,0.0
1,0.928571,0.736842,0.727273,0.489609
2,0.964286,0.421053,0.0,0.692497
4,0.928571,0.894737,1.0,0.804156
5,0.0,0.684211,0.745455,0.794998
6,1.0,0.421053,0.763636,1.0
8,0.928571,0.736842,0.509091,0.527298
9,0.928571,0.894737,1.0,0.804156
10,0.928571,0.894737,1.0,0.804156
11,0.928571,1.0,1.0,0.9292


## Standard Scaling

In [45]:
(df2-df2.mean())/df2.std()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,-0.233557,2.663829,0.869933,2.699922
1,-0.233557,-0.084121,0.235607,0.754987
2,-0.375723,1.093572,2.772913,-0.050972
4,-0.233557,-0.672967,-0.715883,-0.494529
5,3.462742,0.112161,0.172174,-0.458149
6,-0.517888,1.093572,0.108742,-1.272503
8,-0.233557,-0.084121,0.996799,0.605269
9,-0.233557,-0.672967,-0.715883,-0.494529
10,-0.233557,-0.672967,-0.715883,-0.494529
11,-0.233557,-1.065532,-0.715883,-0.991257
