## 머신러닝 2주차

### 전처리 개념

In [2]:
import pandas as pd

In [6]:
# Load the Titanic training split from a path relative to the current
# working directory.
df = pd.read_csv('./titanic/train.csv')

In [7]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Structural overview in one call: dimensions, column labels, per-column
# dtypes, then summary statistics, each on its own line.
print(df.shape, df.columns, df.dtypes, df.describe(), sep='\n')

In [None]:

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

### 전처리 실습

In [14]:
import numpy as np

# Synthetic 20-row frame used by the preprocessing exercises, laid out in
# groups of five rows:
#   rows 0-9   : category_column, value_column and target are all missing
#   rows 10-14 : category_column/value_column are filled; A and B are strings
#   rows 15-19 : the only rows that carry a target label
# Rows 7 and 8 are identical on purpose so the duplicate-removal step has
# something to find.
data = {
    'A': [1, 2, np.nan, 4, 5]
         + [100, 1, 2, 2, 4]
         + ['1', '2', '3', '4', '5']
         + [10, 20, 30, 40, 50],
    'B': [5, np.nan, np.nan, 8, 10]
         + [60, 10, 20, 20, 40]
         + ['10', '20', '30', '40', '50']
         + [5, 4, 3, 2, 1],
    'C': [1, 2, 3, 4, 5]
         + [5, 100, 200, 200, 400]
         + [100, 200, 300, 400, 500]
         + [1, 2, 3, 4, 5],
    'D': [np.nan] * 2 + [3] * 3
         + [5] * 5
         + [np.nan] * 5
         + [2, 3, 4, 5, 6],
    'category_column': [np.nan] * 10 + list('ABACB') + [np.nan] * 5,
    'value_column': [np.nan] * 10 + list(range(1, 6)) + [np.nan] * 5,
    'target': [np.nan] * 15 + [1, 0, 1, 0, 1],
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D,category_column,value_column,target
0,1.0,5.0,1,,,,
1,2.0,,2,,,,
2,,,3,3.0,,,
3,4.0,8.0,4,3.0,,,
4,5.0,10.0,5,3.0,,,
5,100.0,60.0,5,5.0,,,
6,1.0,10.0,100,5.0,,,
7,2.0,20.0,200,5.0,,,
8,2.0,20.0,200,5.0,,,
9,4.0,40.0,400,5.0,,,


#### 결측값 처리

##### 제거

In [15]:
# Discard every row that has at least one missing value.
df_dropped_rows = df.dropna(axis='index', how='any')

# Discard every column that has at least one missing value.
df_dropped_cols = df.dropna(axis='columns', how='any')

In [16]:
# Show the row-wise result; it is empty because every row of the toy frame
# has a NaN in at least one column.
df_dropped_rows

Unnamed: 0,A,B,C,D,category_column,value_column,target


In [17]:
# Show the column-wise result; only C survives because it is the sole column
# without any NaN.
df_dropped_cols

Unnamed: 0,C
0,1
1,2
2,3
3,4
4,5
5,5
6,100
7,200
8,200
9,400


##### 대체

In [19]:
# Replace every missing value with the constant 0.
df_filled = df.fillna(0)

# Replace missing values with each column's mean. numeric_only=True is
# required here: A, B and category_column are object dtype (they hold
# strings), and averaging them raises a TypeError in pandas 2.x. Columns that
# are skipped this way simply keep their NaNs.
df_filled_mean = df.fillna(df.mean(numeric_only=True))

# Replace missing values with each column's median (same numeric-only
# restriction as the mean).
df_filled_median = df.fillna(df.median(numeric_only=True))

# Replace missing values with each column's most frequent value. mode()
# returns one row per tied value, so the first row is used as the fill value.
df_filled_mode = df.fillna(df.mode().iloc[0])

In [20]:
# Show the zero-filled frame.
df_filled

Unnamed: 0,A,B,C,D,category_column,value_column,target
0,1,5,1,0.0,0,0.0,0.0
1,2,0,2,0.0,0,0.0,0.0
2,0,0,3,3.0,0,0.0,0.0
3,4,8,4,3.0,0,0.0,0.0
4,5,10,5,3.0,0,0.0,0.0
5,100,60,5,5.0,0,0.0,0.0
6,1,10,100,5.0,0,0.0,0.0
7,2,20,200,5.0,0,0.0,0.0
8,2,20,200,5.0,0,0.0,0.0
9,4,40,400,5.0,0,0.0,0.0


#### 이상값 처리

In [24]:
# Outlier check on column C using the interquartile-range (IQR) rule.
c_values = df['C']
Q1 = c_values.quantile(0.25)
Q3 = c_values.quantile(0.75)
IQR = Q3 - Q1

# Anything further than 1.5 * IQR outside the quartiles counts as an outlier.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows whose C value falls strictly outside the fences.
is_outlier = c_values.lt(lower_bound) | c_values.gt(upper_bound)
outliers = df[is_outlier]
print(outliers)

print('\n')

# Rows whose C value lies within the fences (both ends inclusive).
df_no_outliers = df[c_values.between(lower_bound, upper_bound)]
print(df_no_outliers)

    A   B    C   D category_column  value_column  target
14  5  50  500 NaN               B           5.0     NaN


      A    B    C    D category_column  value_column  target
0     1    5    1  NaN             NaN           NaN     NaN
1     2  NaN    2  NaN             NaN           NaN     NaN
2   NaN  NaN    3  3.0             NaN           NaN     NaN
3     4    8    4  3.0             NaN           NaN     NaN
4     5   10    5  3.0             NaN           NaN     NaN
5   100   60    5  5.0             NaN           NaN     NaN
6     1   10  100  5.0             NaN           NaN     NaN
7     2   20  200  5.0             NaN           NaN     NaN
8     2   20  200  5.0             NaN           NaN     NaN
9     4   40  400  5.0             NaN           NaN     NaN
10    1   10  100  NaN               A           1.0     NaN
11    2   20  200  NaN               B           2.0     NaN
12    3   30  300  NaN               A           3.0     NaN
13    4   40  400  NaN        

#### 중복값 제거

In [26]:
# Flag rows that exactly repeat an earlier row (the first occurrence stays
# False).
is_repeat = df.duplicated()
print(is_repeat)

# Keep only the rows that were not flagged as repeats.
df_no_duplicates = df[~is_repeat]
print(df_no_duplicates)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8      True
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
dtype: bool
      A    B    C    D category_column  value_column  target
0     1    5    1  NaN             NaN           NaN     NaN
1     2  NaN    2  NaN             NaN           NaN     NaN
2   NaN  NaN    3  3.0             NaN           NaN     NaN
3     4    8    4  3.0             NaN           NaN     NaN
4     5   10    5  3.0             NaN           NaN     NaN
5   100   60    5  5.0             NaN           NaN     NaN
6     1   10  100  5.0             NaN           NaN     NaN
7     2   20  200  5.0             NaN           NaN     NaN
9     4   40  400  5.0             NaN           NaN     NaN
10    1   10  100  NaN               A           1.0     NaN
11    2   20  200  NaN               B           2.0     NaN
12    3   30  30

#### 데이터 타입 변환

In [27]:
# Cast column C to the platform's default integer dtype.
df['C'] = df['C'].astype(int)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   A                19 non-null     object 
 1   B                18 non-null     object 
 2   C                20 non-null     int32  
 3   D                13 non-null     float64
 4   category_column  5 non-null      object 
 5   value_column     5 non-null      float64
 6   target           5 non-null      float64
dtypes: float64(3), int32(1), object(3)
memory usage: 1.1+ KB
None


#### 인코딩

In [28]:
# One-hot encode category_column: it is replaced by one boolean indicator
# column per observed category. Rows whose category is NaN get False in every
# indicator column.
encode_cols = ['category_column']
df_encoded = pd.get_dummies(df, columns=encode_cols)
df_encoded

Unnamed: 0,A,B,C,D,value_column,target,category_column_A,category_column_B,category_column_C
0,1.0,5.0,1,,,,False,False,False
1,2.0,,2,,,,False,False,False
2,,,3,3.0,,,False,False,False
3,4.0,8.0,4,3.0,,,False,False,False
4,5.0,10.0,5,3.0,,,False,False,False
5,100.0,60.0,5,5.0,,,False,False,False
6,1.0,10.0,100,5.0,,,False,False,False
7,2.0,20.0,200,5.0,,,False,False,False
8,2.0,20.0,200,5.0,,,False,False,False
9,4.0,40.0,400,5.0,,,False,False,False


#### 샘플링

In [30]:
print(df)
# Draw a random 50% of the rows. No random_state is passed, so every run
# selects a different subset.
sample_fraction = 0.5
df_sampled = df.sample(frac=sample_fraction)
# Alternative: df_sampled = df.sample(n=100) draws a fixed number of rows
# instead of a fraction. This frame has only 20 rows, so n=100 would also
# need replace=True.

      A    B    C    D category_column  value_column  target
0     1    5    1  NaN             NaN           NaN     NaN
1     2  NaN    2  NaN             NaN           NaN     NaN
2   NaN  NaN    3  3.0             NaN           NaN     NaN
3     4    8    4  3.0             NaN           NaN     NaN
4     5   10    5  3.0             NaN           NaN     NaN
5   100   60    5  5.0             NaN           NaN     NaN
6     1   10  100  5.0             NaN           NaN     NaN
7     2   20  200  5.0             NaN           NaN     NaN
8     2   20  200  5.0             NaN           NaN     NaN
9     4   40  400  5.0             NaN           NaN     NaN
10    1   10  100  NaN               A           1.0     NaN
11    2   20  200  NaN               B           2.0     NaN
12    3   30  300  NaN               A           3.0     NaN
13    4   40  400  NaN               C           4.0     NaN
14    5   50  500  NaN               B           5.0     NaN
15   10    5    1  2.0  

In [31]:
# Show the sampled rows; they appear in the order they were drawn, not in
# index order.
df_sampled

Unnamed: 0,A,B,C,D,category_column,value_column,target
13,4.0,40.0,400,,C,4.0,
16,20.0,4.0,2,3.0,,,0.0
19,50.0,1.0,5,6.0,,,1.0
6,1.0,10.0,100,5.0,,,
8,2.0,20.0,200,5.0,,,
12,3.0,30.0,300,,A,3.0,
11,2.0,20.0,200,,B,2.0,
2,,,3,3.0,,,
1,2.0,,2,,,,
0,1.0,5.0,1,,,,
