In [1]:
from math import sqrt

import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Path to the insurance data file
INSURANCE_FILE_PATH = './insurance.csv'

# Load the data into a pandas DataFrame (try insurance_df.head() to take a quick look at it!)
insurance_df = pd.read_csv(INSURANCE_FILE_PATH)
# One-hot encode the categorical columns
insurance_df = pd.get_dummies(data=insurance_df, columns=['sex', 'smoker', 'region'])

# Keep the input variables in their own DataFrame
X = insurance_df.drop(['charges'], axis=1)

polynomial_transformer = PolynomialFeatures(4)  # degree-4 polynomial transformer
polynomial_features = polynomial_transformer.fit_transform(X.values)  # expand inputs into degree-4 terms

# Build the names of the generated polynomial columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and only fall back to the old method on older installations.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    # get_feature_names_out() returns an ndarray; convert to a list as the old API did
    features = list(polynomial_transformer.get_feature_names_out(X.columns))
else:
    features = polynomial_transformer.get_feature_names(X.columns)

X = pd.DataFrame(polynomial_features, columns=features)  # polynomial inputs as a DataFrame
y = insurance_df[['charges']]  # target variable

# Write your code here
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

# Lasso(normalize=True) was deprecated in scikit-learn 1.0 and removed in 1.2.
# The documented replacement is to scale inside a pipeline with
# StandardScaler(with_mean=False) and multiply alpha by sqrt(n_samples), which
# corresponds to the old normalize=True objective (old alpha was 1 here).
# The fitted Lasso itself remains reachable as model[-1].
lasso_alpha = 1 * sqrt(len(X_train))
model = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=lasso_alpha, max_iter=2000),
)
model.fit(X_train, y_train)

y_test_predict = model.predict(X_test)
y_train_predict = model.predict(X_train)

# Grading code: print the root-mean-squared error for the training split,
# then for the testing split (same order and text as the two-stanza version).
for split_heading, y_actual, y_predicted in (
    ("training set에서 성능", y_train, y_train_predict),
    ("testing set에서 성능", y_test, y_test_predict),
):
    mse = mean_squared_error(y_actual, y_predicted)

    print(split_heading)
    print("-----------------------")
    print(f'오차: {sqrt(mse)}')


training set에서 성능
-----------------------
오차: 4726.636439607449
testing set에서 성능
-----------------------
오차: 4692.232442526968


In [2]:
# Display the polynomial-feature DataFrame built in the first cell
X

Unnamed: 0,1,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,...,region_northwest^2 region_southwest^2,region_northwest region_southeast^3,region_northwest region_southeast^2 region_southwest,region_northwest region_southeast region_southwest^2,region_northwest region_southwest^3,region_southeast^4,region_southeast^3 region_southwest,region_southeast^2 region_southwest^2,region_southeast region_southwest^3,region_southwest^4
0,1.0,19.0,27.900,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,18.0,33.770,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,28.0,33.000,3.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,33.0,22.705,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,32.0,28.880,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,1.0,50.0,30.970,3.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1334,1.0,18.0,31.920,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1335,1.0,18.0,36.850,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1336,1.0,21.0,25.800,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# Show the column names generated for the polynomial features
features

['1',
 'age',
 'bmi',
 'children',
 'sex_female',
 'sex_male',
 'smoker_no',
 'smoker_yes',
 'region_northeast',
 'region_northwest',
 'region_southeast',
 'region_southwest',
 'age^2',
 'age bmi',
 'age children',
 'age sex_female',
 'age sex_male',
 'age smoker_no',
 'age smoker_yes',
 'age region_northeast',
 'age region_northwest',
 'age region_southeast',
 'age region_southwest',
 'bmi^2',
 'bmi children',
 'bmi sex_female',
 'bmi sex_male',
 'bmi smoker_no',
 'bmi smoker_yes',
 'bmi region_northeast',
 'bmi region_northwest',
 'bmi region_southeast',
 'bmi region_southwest',
 'children^2',
 'children sex_female',
 'children sex_male',
 'children smoker_no',
 'children smoker_yes',
 'children region_northeast',
 'children region_northwest',
 'children region_southeast',
 'children region_southwest',
 'sex_female^2',
 'sex_female sex_male',
 'sex_female smoker_no',
 'sex_female smoker_yes',
 'sex_female region_northeast',
 'sex_female region_northwest',
 'sex_female region_southeas

In [4]:
# View the polynomial feature matrix as a raw NumPy array
X.values

array([[ 1.  , 19.  , 27.9 , ...,  0.  ,  0.  ,  1.  ],
       [ 1.  , 18.  , 33.77, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  , 28.  , 33.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 1.  , 18.  , 36.85, ...,  0.  ,  0.  ,  0.  ],
       [ 1.  , 21.  , 25.8 , ...,  0.  ,  0.  ,  1.  ],
       [ 1.  , 61.  , 29.07, ...,  0.  ,  0.  ,  0.  ]])