In [212]:
from urllib.request import urlopen, urlretrieve
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# [1] 데이터 로딩

In [213]:
# Source file for the Auto MPG data: read below as whitespace-delimited text
# with no header row.
data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
# Companion file with the column descriptions:
# https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.names
# (Optional) urlretrieve(data, '../data/mpg_data.csv') can be used to cache a local copy.

# The separator is a regex, so it is written as a raw string: '\s' inside a
# normal string literal is an invalid escape sequence and triggers a warning.
df = pd.read_csv(data, header=None, sep=r'\s+')
df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model year', 'origin',
              'car name']
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


# [2] 데이터 전처리

In [214]:
# Inspect column dtypes and non-null counts
df.info()

# Print the distinct values found in each column
for column_name, column_values in df.items():
    print(column_name)
    print(column_values.unique())

# Count missing values per column
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
mpg
[18.  15.  16.  17.  14.  24.  22.  21.  27.  26.  25.  10.  11.   9.
 28.  19.  12.  13.  23.  30.  31.  35.  20.  29.  32.  33.  17.5 15.5
 14.5 22.5 24.5 18.5 29.5 26.5 16.5 31.5 36.  25.5 33.5 20.5 30.5 21.5
 43.1 36.1 32.8 39.4 19.9 19.4 20.2 19.2 25.1 20.6 20.8 18.6 18.1 17.7
 27.5 27.2 30.9 21.1 23.2 23.8 23.9 20.3 21.6 16.2 19.8 22.3 17.6 18.2
 1

In [215]:
# horsepower uses '?' as a placeholder for missing values: turn those into NaN,
# impute them with the column mean, then cast the column to int.
# The result is assigned back to df explicitly; calling replace(..., inplace=True)
# on df['horsepower'] is a chained in-place modification that pandas warns about
# and that no longer updates df when Copy-on-Write is enabled.
horsepower = df['horsepower'].replace('?', np.nan).astype(float)
df['horsepower'] = horsepower.fillna(horsepower.mean()).astype(int)

In [216]:
# Re-check the column dtypes after the horsepower cleanup
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int32  
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(4), int32(1), int64(3), object(1)
memory usage: 26.6+ KB


In [217]:
# Preview the first rows after the horsepower cleanup
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449.0,10.5,70,1,ford torino


In [218]:
# Preprocessing plan per column:
# mpg => scaling (NOTE: the cells below convert it to km/L instead of scaling it)
# cylinders => one-hot encoding
# displacement => scaling
# horsepower => scaling
# weight => scaling
# acceleration => scaling
# model year => one-hot encoding
# origin => one-hot encoding

In [219]:
# Min-max scale the continuous feature columns.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split further down, so test-set min/max values influence the scaling.
# Consider fitting it on the training rows only.
numeric_features = ['displacement', 'horsepower', 'weight', 'acceleration']
scaler = MinMaxScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

In [220]:
# Preview the first rows after min-max scaling
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,0.617571,0.456522,0.53615,0.238095,70,1,chevrolet chevelle malibu
1,15.0,8,0.728682,0.646739,0.589736,0.208333,70,1,buick skylark 320
2,18.0,8,0.645995,0.565217,0.51687,0.178571,70,1,plymouth satellite
3,16.0,8,0.609819,0.565217,0.516019,0.238095,70,1,amc rebel sst
4,17.0,8,0.604651,0.51087,0.520556,0.14881,70,1,ford torino


In [221]:
# Convert fuel economy from miles per gallon to kilometres per litre (km/L)
# and rename the column to match the new unit.
MPG_TO_KM_PER_L = 0.425144
df['mpg'] = df['mpg'].mul(MPG_TO_KM_PER_L)
df = df.rename(columns={'mpg': 'km/L'})

In [222]:
# One-hot encode the categorical columns in a single pass; the dummy columns are
# appended in the listed order and prefixed with their source column name.
categorical_features = ['cylinders', 'model year', 'origin']
df = pd.get_dummies(data=df, columns=categorical_features, prefix=categorical_features)

In [223]:
# Preview the first rows after one-hot encoding
df.head(5)

Unnamed: 0,km/L,displacement,horsepower,weight,acceleration,car name,cylinders_3,cylinders_4,cylinders_5,cylinders_6,...,model year_76,model year_77,model year_78,model year_79,model year_80,model year_81,model year_82,origin_1,origin_2,origin_3
0,7.652592,0.617571,0.456522,0.53615,0.238095,chevrolet chevelle malibu,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,6.37716,0.728682,0.646739,0.589736,0.208333,buick skylark 320,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,7.652592,0.645995,0.565217,0.51687,0.178571,plymouth satellite,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,6.802304,0.609819,0.565217,0.516019,0.238095,amc rebel sst,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,7.227448,0.604651,0.51087,0.520556,0.14881,ford torino,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [224]:
# Remove the car name column, which is not used as a model feature
df = df.drop(columns=['car name'])

# [3] 학습 / 테스트 데이터 분리

In [225]:
# Separate the feature matrix from the target column (km/L).
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the train/test split cell below refers to it.
input = df.drop(columns='km/L')
target = df['km/L']

In [226]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
split = train_test_split(input, target, test_size=0.3, random_state=42)
train_input, test_input, train_target, test_target = split

# [4] 모델 생성 및 확인

In [227]:
# Create the linear regression model (it is fitted in the next cell)
model = LinearRegression()

In [230]:
# Fit on the training split, then report the model's score on both splits
model.fit(train_input, train_target)
train_score = model.score(train_input, train_target)
test_score = model.score(test_input, test_target)
print(f'train 스코어 = {train_score}')
print(f'test 스코어 = {test_score}')

train 스코어 = 0.8720288034390645
test 스코어 = 0.8624547665276223


In [229]:
# Predict the target (km/L) for the held-out test rows
model.predict(test_input)

array([13.49172979, 13.14041942,  9.23473646,  6.96826989,  5.29420127,
       11.62001872, 12.42355408,  3.35486097,  7.7040154 ,  8.50941824,
        5.90623609, 15.80138787, 11.65358434,  6.37539467, 10.1988342 ,
        4.25068385, 13.11497588,  9.20304933,  6.39549864, 15.80596049,
       10.70369152,  8.29773333, 11.68975618, 11.67814589,  7.49893087,
       15.5332512 , 10.57612898, 10.24223874,  7.99414424,  3.68363605,
       12.66957252, 15.41949171,  8.91087474, 11.13857422, 16.11583261,
        4.03352762,  8.45748444,  8.69547331,  5.34085554, 11.57001718,
       10.26897348, 12.92802831,  9.18963296,  3.87956705, 10.61226779,
       15.09228082, 11.63352833,  9.933002  , 10.6459001 , 11.57345732,
       10.69852598, 14.25100083, 15.45952809,  4.75531626, 10.89100945,
        5.41883309,  8.4000871 , 14.29198034, 10.69028059,  7.67092134,
        6.07957091, 13.05970762,  9.70324202,  9.53503638,  7.79712667,
       11.76324302, 10.21478504, 13.60009092, 11.51810144,  6.26