In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.ensemble import VotingRegressor

%matplotlib inline

In [None]:
# Read the insurance dataset and preview its first 20 rows.
DATA_PATH = '../input/insurance/insurance.csv'
data = pd.read_csv(filepath_or_buffer=DATA_PATH)
data.head(n=20)

In [None]:
# Summary statistics produced by DataFrame.describe() for the loaded frame.
data.describe()

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

Проверим, есть ли в данных пропущенные значения

In [None]:
# Count missing values per column (isna is the same check as isnull).
data.isna().sum()

In [None]:
# Correlation heatmap over the numeric columns only. At this point in the
# notebook 'sex', 'smoker' and 'region' still hold strings (they are encoded
# in later cells), and DataFrame.corr() raises on non-numeric columns in
# pandas >= 2.0 instead of silently skipping them.
numeric_data = data.select_dtypes(include='number')
sns.heatmap(numeric_data.corr(), annot=True)

In [None]:
# Per-column means of the numeric features. np.mean(data) on a frame that
# still contains string columns fails on pandas >= 2.0, so restrict the
# reduction to numeric columns explicitly.
data.mean(numeric_only=True)

In [None]:
# Frequency table for each column. 'sex' was missing from the original list
# even though it is plotted and encoded further down, so it is included here.
for column in ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']:
    print(data[column].value_counts())

In [None]:
# Bar plots of charges against each categorical feature on a 2x2 grid.
# Axes are passed to seaborn explicitly instead of relying on the implicit
# "current axes" set by fig.add_subplot, and each panel gets a title so the
# figure can be read on its own.
fig, axes = plt.subplots(2, 2, figsize=(15, 15))
for ax, feature in zip(axes.flat, ['sex', 'smoker', 'children', 'region']):
    sns.barplot(data=data, x=feature, y='charges', ax=ax)
    ax.set_title('charges by ' + feature)

Преобразование категориальных признаков в числовые значения

In [None]:
# Encode the two binary categorical columns as 0/1.
# 'male' -> 1, anything else -> 0; 'yes' -> 1, anything else -> 0.
# The already-encoded value 1 is accepted as well, so re-running this cell
# leaves the columns unchanged instead of resetting every row to 0 (which is
# what a plain `x == 'male'` comparison does on the second run).
data['sex'] = data['sex'].isin(['male', 1]).astype(int)
data['smoker'] = data['smoker'].isin(['yes', 1]).astype(int)
data.head()

Кодирование названий регионов с помощью one-hot encoding

In [None]:
# One-hot encode the remaining categorical columns; pd.get_dummies leaves
# numeric columns untouched, so running the cell again is harmless.
# NOTE(review): presumably only 'region' is still non-numeric here — confirm.
data = pd.get_dummies(data)
data.head()

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal)
# for the three continuous columns.
sns.pairplot(data, vars = ['age', 'bmi', 'charges'])

Построение графиков boxplot для выявления возможных выбросов

In [None]:
# Box plots of charges by number of children and by smoker flag.
# Only two panels are drawn, so use a 1x2 grid; the original 2x2 layout left
# the lower half of the 15x15 figure empty. Each panel keeps roughly the same
# size as before.
fig, (ax_children, ax_smoker) = plt.subplots(1, 2, figsize=(15, 7.5))
sns.boxplot(data=data, x='children', y='charges', ax=ax_children)
sns.boxplot(data=data, x='smoker', y='charges', ax=ax_smoker);

In [None]:
# Rows with exactly three children and charges above 60000.
data.query('children == 3 and charges > 60000')

In [None]:
# Rows with no children and charges above 60000.
data.loc[data['children'].eq(0) & data['charges'].gt(60000)]

In [None]:
# Feature matrix: every column except the target.
# The positional `axis` argument of DataFrame.drop was removed in pandas 2.0,
# so name the column via the `columns=` keyword.
X = data.drop(columns='charges')
# Target vector: the insurance charges.
Y = data['charges']

In [None]:
# Hold out half of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=0,
)

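Значение веса у признака 'smoker' (см. вывод ячейки ниже) подозрительно большое относительно остальных весов. Это может быть показателем переобучения модели, однако признаки не приведены к одному масштабу, поэтому сравнивать веса напрямую нужно с осторожностью.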

In [None]:
# Fit ordinary least squares on the training split (fit returns the
# estimator itself), then show the weight assigned to each feature.
linear_regressor = LinearRegression().fit(X_train, Y_train)
[(feature, weight) for feature, weight in zip(X.columns, linear_regressor.coef_)]

Качество модели (коэффициент детерминации R²) на train

In [None]:
# Score of the fitted linear model on the training split
# (scikit-learn regressors report R^2 from `score`).
linear_regressor.score(X_train, Y_train)

In [None]:
# Predict on the held-out split and print the first ten predictions next to
# the first ten true targets.
predictions = linear_regressor.predict(X_test)
print(predictions[:10], Y_test.head(10), sep='\n')

Оценка качества модели с помощью кросс-валидации

In [None]:
# 5-fold cross-validated scores of the linear model on the full dataset.
scores = cross_val_score(estimator=linear_regressor, X=X, y=Y, cv=5)
scores

In [None]:
# Average of the cross-validation fold scores.
np.mean(scores)

In [None]:
# Fit an L1-regularised linear model (alpha=1) on the training split and
# list the weight assigned to each feature.
lasso_regressor = Lasso(alpha=1).fit(X_train, Y_train)
[(feature, weight) for feature, weight in zip(X.columns, lasso_regressor.coef_)]

In [None]:
# Score of the fitted Lasso model on the training split
# (scikit-learn regressors report R^2 from `score`).
lasso_regressor.score(X_train, Y_train)

In [None]:
# Lasso predictions on the held-out split: first ten predictions followed by
# the first ten true targets.
predictions_1 = lasso_regressor.predict(X_test)
print(predictions_1[:10], Y_test.head(10), sep='\n')

In [None]:
# Mean 5-fold cross-validated score of the Lasso model on the full dataset.
scores_1 = cross_val_score(estimator=lasso_regressor, X=X, y=Y, cv=5)
np.mean(scores_1)

In [None]:
# Fit an L2-regularised linear model with default settings on the training
# split and list the weight assigned to each feature.
ridge_regressor = Ridge().fit(X_train, Y_train)
[(feature, weight) for feature, weight in zip(X.columns, ridge_regressor.coef_)]

In [None]:
# Score of the fitted Ridge model on the training split
# (scikit-learn regressors report R^2 from `score`).
ridge_regressor.score(X_train, Y_train)

In [None]:
# Ridge predictions on the held-out split: first ten predictions followed by
# the first ten true targets.
predictions_2 = ridge_regressor.predict(X_test)
print(predictions_2[:10], Y_test.head(10), sep='\n')

In [None]:
# Mean 5-fold cross-validated score of the Ridge model on the full dataset.
scores_2 = cross_val_score(estimator=ridge_regressor, X=X, y=Y, cv=5)
np.mean(scores_2)

In [None]:
# Fit ElasticNet on the training split and list the weight per feature.
# NOTE(review): per the scikit-learn docs, l1_ratio=1 makes the penalty pure
# L1, i.e. the same objective as Lasso(alpha=1) — confirm this is intended.
elast_net = ElasticNet(alpha=1, l1_ratio=1).fit(X_train, Y_train)
[(feature, weight) for feature, weight in zip(X.columns, elast_net.coef_)]

In [None]:
# Score of the fitted ElasticNet model on the training split
# (scikit-learn regressors report R^2 from `score`).
elast_net.score(X_train, Y_train)

In [None]:
# ElasticNet predictions on the held-out split: first ten predictions
# followed by the first ten true targets.
predictions_3 = elast_net.predict(X_test)
print(predictions_3[:10], Y_test.head(10), sep='\n')

In [None]:
# Mean cross-validated score of the ElasticNet model on the full dataset.
# Use 5 folds like every other model in this notebook (the original used 10
# here only), so the mean scores are computed on the same protocol and can be
# compared directly.
scores_3 = cross_val_score(elast_net, X, Y, cv=5)
scores_3.mean()

In [None]:
# Weighted-average ensemble of the three regularised linear models, fitted
# on the training split.
voting_members = [
    ('Lasso', lasso_regressor),
    ('Ridge', ridge_regressor),
    ('ElasticNet', elast_net),
]
ensemble = VotingRegressor(estimators=voting_members, weights=[0.7, 0.7, 0.8])

ensemble.fit(X_train, Y_train)

In [None]:
# Score of the fitted ensemble on the training split
# (scikit-learn regressors report R^2 from `score`).
ensemble.score(X_train, Y_train)

In [None]:
# Ensemble predictions on the held-out split: first ten predictions followed
# by the first ten true targets.
predictions_ensemble = ensemble.predict(X_test)
print(predictions_ensemble[:10], Y_test.head(10), sep='\n')

In [None]:
# Mean 5-fold cross-validated score of the ensemble on the full dataset.
scores_ensemble = cross_val_score(estimator=ensemble, X=X, y=Y, cv=5)
np.mean(scores_ensemble)