# auto_mpg 데이터 전처리
- 1. 단위 변환 => 컬럼 추가
- 2. 자료형 검사 및 변환
- 3. origin 컬럼에 대한 라벨 => 컬럼추가

## (1) 파일 불러오기

In [204]:
import pandas as pd
import numpy as np 

In [205]:
DIR = '../Data/'
FILE = 'auto_mpg.csv'

mpg = pd.read_csv(DIR+FILE)

## (2) 데이터 확인

In [206]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [207]:
mpg.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite


In [208]:
mpg.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [209]:
mpg.duplicated().sum()

0

## (3) 데이터 전처리

### (3-1) 데이터 정제

In [210]:
mpg['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [211]:
mpg = mpg.replace({'horsepower': {'?': np.nan}})

In [212]:
mpg['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', nan, '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [213]:
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [214]:
mpg = mpg.dropna(axis=0, how='all',subset='horsepower').reset_index(drop=True)
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    object 
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 27.7+ KB


### (3-2) 데이터 표준화

In [215]:
# horsepower 컬럼 int 로 타입 변경
mpg.horsepower = mpg.horsepower.astype(int)
mpg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    int32  
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(3), int32(1), int64(4), object(1)
memory usage: 26.2+ KB


In [216]:
# mpg 를 kpl로 변환 후 컬럼 추가
mpg['kpl'] = mpg['mpg'] * 0.45
mpg.head(3)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,kpl
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,8.1
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,6.75
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,8.1


In [217]:
# origin 컬럼을 지역으로 바꾸고 컬럼 추가
mpg['origin'].unique()

oriList = []
for i in range(1,4):
    oriList.append(mpg[mpg.origin == i])

for i in oriList:
    print(i.head(2), '\n')

    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0         130    3504          12.0   
1  15.0          8         350.0         165    3693          11.5   

   model year  origin                   car name   kpl  
0          70       1  chevrolet chevelle malibu  8.10  
1          70       1          buick skylark 320  6.75   

     mpg  cylinders  displacement  horsepower  weight  acceleration  \
19  26.0          4          97.0          46    1835          20.5   
20  25.0          4         110.0          87    2672          17.5   

    model year  origin                      car name    kpl  
19          70       2  volkswagen 1131 deluxe sedan  11.70  
20          70       2                   peugeot 504  11.25   

     mpg  cylinders  displacement  horsepower  weight  acceleration  \
14  24.0          4         113.0          95    2372          15.0   
18  27.0          4          97.0          88    2130          14.5   



In [218]:
# origin 1 => usa, 2 => eur, 3 => jpy
mpg['region'] = mpg['origin'].replace({1: 'usa', 2: 'eur', 3: 'jpy'})
mpg.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,kpl,region
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu,8.1,usa
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320,6.75,usa
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite,8.1,usa
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst,7.2,usa
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino,7.65,usa


### (3-2) 데이터 정규화

## (4) 데이터 추출