In [1]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt

import numpy as np
import pandas as pd

In [2]:
# Location of the insurance dataset (a relative path, resolved against the
# kernel's current working directory).
INSURANCE_FILE_PATH = 'Downloads/insurance.csv'

# Read the CSV into a DataFrame and preview its first five rows.
insurance_df = pd.read_csv(filepath_or_buffer=INSURANCE_FILE_PATH)
insurance_df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# One-hot encode the categorical columns so every feature is numeric.
# NOTE: this rebinds `insurance_df`, so re-running this cell without first
# re-running the load cell fails (the original categorical columns are gone).
insurance_df = pd.get_dummies(
    insurance_df,
    columns=['sex', 'smoker', 'region'],
)

# Feature matrix: every column except the regression target `charges`.
# NOTE(review): the displayed head shows an `Unnamed: 0` column; if that is a
# real CSV column (a saved row index) rather than just the rendered DataFrame
# index, it is kept as a feature here -- confirm and drop it if unintended.
X = insurance_df.drop(columns=['charges'])

In [4]:
# Expand the features into all polynomial/interaction terms up to degree 4.
polynomial_transformer = PolynomialFeatures(4)
polynomial_features = polynomial_transformer.fit_transform(X.values)

# Human-readable names for the generated columns.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; prefer the new method and fall back to the
# legacy one so the cell runs on both old and current scikit-learn releases.
if hasattr(polynomial_transformer, "get_feature_names_out"):
    features = polynomial_transformer.get_feature_names_out(X.columns)
else:
    features = polynomial_transformer.get_feature_names(X.columns)
# The legacy method returned a list; keep that type regardless of the branch.
features = list(features)

In [5]:
# Wrap the expanded feature array in a DataFrame labelled with the generated
# polynomial feature names.
# NOTE: `X` is rebound here, so re-running the polynomial cell afterwards
# would expand the already-expanded features.
X = pd.DataFrame(data=polynomial_features, columns=features)

# Target kept as a single-column DataFrame (2-D), not a Series.
y = insurance_df.loc[:, ['charges']]

In [6]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=5,
)

In [7]:
# L1-regularised linear regression on the polynomial features.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so this cell needs an older release (or a migration to
# an explicit scaling step, which may change the fitted result) -- confirm the
# pinned scikit-learn version.
model = Lasso(
    alpha=1,
    max_iter=2000,
    normalize=True,
)
# Left as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)

Lasso(alpha=1, max_iter=2000, normalize=True)

In [8]:
# Predict on the training rows first, then on the held-out test rows.
y_train_predict, y_test_predict = (
    model.predict(X_train),
    model.predict(X_test),
)

In [9]:
# Report the root-mean-squared error for the training split and then the test
# split, in the same order and format for both.
for heading, actual, predicted in (
    ("training set에서의 성능", y_train, y_train_predict),
    ("test set에서의 성능", y_test, y_test_predict),
):
    mse = mean_squared_error(actual, predicted)
    print(heading)
    print("-----------------------")
    # RMSE is the square root of the mean squared error.
    print(sqrt(mse))

training set에서의 성능
-----------------------
4726.636439607449
test set에서의 성능
-----------------------
4692.232442526966
