In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from statsmodels.regression import linear_model
import statsmodels.formula.api as smf
from statsmodels.api import qqplot
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 100)

# EDA

Dataset - https://www.kaggle.com/mirichoi0218/insurance

In [None]:
# Location of the insurance CSV inside the Kaggle input directory.
INSURANCE_CSV = "/kaggle/input/insurance/insurance.csv"
insur_data = pd.read_csv(INSURANCE_CSV)

In [None]:
# Display the rows flagged by DataFrame.duplicated(), i.e. rows whose values
# repeat an earlier row in every column.
insur_data[insur_data.duplicated()]

One of the observations is a full duplicate. We can remove it.

In [None]:
# Remove exact duplicate rows by content instead of by a hard-coded index
# label: this keeps the first occurrence, is safe to re-run (dropping a label
# that is already gone raises KeyError), and does not depend on row order.
insur_data = insur_data.drop_duplicates()

In [None]:
# 3x2 overview figure of the raw data.
plt.rc("figure", figsize=(30, 15))
fig, axes = plt.subplots(3, 2)

# Charges by sex, split by smoking status.
sns.violinplot(x="sex", y="charges", hue="smoker", data=insur_data, ax=axes[0][0])

# BMI histogram; the title reports the Shapiro-Wilk normality test.
sns.histplot(insur_data.bmi, ax=axes[0][1], kde=True)
norm_test = stats.shapiro(insur_data.bmi)
axes[0][1].set_title("Statistic = %.3f, p-value = %.3g" % norm_test)

# Charges histogram with the same normality test in the title.
sns.histplot(insur_data.charges, ax=axes[1][0], kde=True, color="forestgreen")
norm_test = stats.shapiro(insur_data.charges)
axes[1][0].set_title("Statistic = %.3f, p-value = %.3g" % norm_test)

# Mean charges per age, one stacked bar segment per region.
# numeric_only=True keeps the string columns (sex, smoker, region) out of the
# mean; recent pandas raises instead of silently dropping them.
(
    insur_data.pivot(columns="region", values="charges")
    .join(insur_data)
    .groupby("age")
    .mean(numeric_only=True)
    .drop(columns=["bmi", "children", "charges"])
    .plot(kind="bar", stacked=True, ax=axes[1][1])
)

# Charges by region, split by number of children.
sns.boxplot(x="region", y="charges", hue="children", data=insur_data, ax=axes[2][0])

# Correlation heatmap of the numeric columns (children excluded).
# select_dtypes restricts corr() to numeric data so it also runs on pandas
# versions that no longer skip non-numeric columns automatically.
cmap = sns.diverging_palette(6.8, 129.9, as_cmap=True)
cor = insur_data.select_dtypes("number").corr().drop(index="children", columns="children")

sns.heatmap(cor, cmap=cmap, center=0, linewidths=.5, cbar_kws={"shrink": .5}, ax=axes[2][1], annot=True)

Smokers pay more for insurance, and this does not depend on age. The distributions of body mass index (**bmi**) and **charges** differ significantly from normal. There is no strong correlation between the variables; however, there is a weak positive correlation between **charges** and **age**. There are no great differences in insurance price between regions (**region**), but there are some deviations in price depending on **age** and **region**, which may be caused by an insufficient number of observations.

In [None]:
# Pairwise relationships of every column except `children`, coloured by smoker.
pairplot_data = insur_data.drop(columns="children")
sns.pairplot(pairplot_data, hue="smoker", palette="colorblind")

Here we can observe an unusual pattern in the **age - charges** plot. There are 3 distinct groups. Two of them depend on the **smoker** variable, and the origin of the third remains unknown.

# Multiple Linear Regression

## Building full model

Here I define several functions that are needed to check linear regression assumptions and to evaluate model.

In [None]:
def qqPlot(data, **kwargs):
    """Draw a Q-Q plot of `data` with hollow black-edged circle markers.

    The plot is produced by statsmodels' `qqplot` with `line="s"`; any extra
    keyword arguments are forwarded to it unchanged. Nothing is returned.
    """
    marker_style = {
        "marker": 'o',
        "markerfacecolor": 'w',
        "markeredgecolor": "k",
    }
    # Two separate ** unpackings keep the original behaviour of raising
    # TypeError if a caller passes one of the fixed marker options again.
    qqplot(data, **marker_style, line="s", **kwargs)


def resid_distribution(fitted, resid, scale=False, **kwargs):
    """Scatter residuals against fitted values with red guide lines at -2 and 2.

    Parameters
    ----------
    fitted : fitted values, used as x coordinates.
    resid : residuals, used as y coordinates.
    scale : when True, residuals are centred on their mean and divided by
        their standard deviation before plotting.
    **kwargs : forwarded to `plt.scatter`.
    """
    if scale:
        centred = resid - resid.mean()
        resid = centred / resid.std()
    plt.scatter(fitted, resid, **kwargs)

    # The guide lines span the full range of fitted values.
    left, right = min(fitted), max(fitted)
    for level in (-2, 2):
        plt.hlines(level, left, right, color="red")

    
def cooks_distances_stat_models(model, return_cooks=False, **kwargs):
    """Show a bar plot of Cook's distance for every observation of `model`.

    Parameters
    ----------
    model : fitted results object exposing `get_influence()`.
    return_cooks : when True, return the object obtained from
        `influence.cooks_distance` (its first element holds the plotted values).
    **kwargs : forwarded to `sns.barplot`.

    Returns
    -------
    The `cooks_distance` object if `return_cooks` is True, otherwise None.
    """
    cooks = model.get_influence().cooks_distance
    distances = cooks[0]
    positions = list(range(len(distances)))

    bars = sns.barplot(x=positions, y=distances, **kwargs)
    # One tick label per observation would be unreadable, so hide them all.
    bars.axes.xaxis.set_ticklabels([])
    plt.xlabel('Observation')
    plt.ylabel("Cook's distance")
    plt.show()

    return cooks if return_cooks else None
    
def adj_r2(r2, X, p):
    """Return the adjusted R^2 for `r2` given the sample in `X` and `p` predictors.

    Parameters
    ----------
    r2 : ordinary coefficient of determination.
    X : array-like with a `.shape`; only its number of rows is used.
    p : number of predictors.

    Returns
    -------
    1 - (1 - r2) * (n - 1) / (n - p - 1), where n is the number of rows of X.
    """
    n_obs = X.shape[0]
    unexplained = (1 - r2) * (n_obs - 1) / (n_obs - p - 1)
    return 1 - unexplained

The distribution of our dependent variable (**charges**) is very skewed, so we need to transform it. The log transformation does not make the distribution normal, but it significantly reduces the skewness. I also standardized the numerical predictors in order to compare their impact on the dependent variable.

In [None]:
# Response: natural log of charges. Predictors: every other column.
y = np.log(insur_data["charges"])
X = insur_data.drop(columns="charges")

# Replace the string categories with integer codes (one encoder, refitted per column).
le = LabelEncoder()
for factor in ("sex", "smoker", "region"):
    X[factor] = le.fit_transform(X[factor])

# Standardize the continuous predictors to zero mean and unit standard deviation.
for numeric in ("bmi", "age"):
    X[numeric] = (X[numeric] - X[numeric].mean()) / X[numeric].std()

In [None]:
# Hold out 20% of the observations for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=223,
)

Here I build the full model. **Sex**, **smoker**, **region** and **children** are treated as factors. We have split our data; however, **statsmodels** linear regression with a formula requires combined data (both dependent and independent variables in one frame).

In [None]:
# The formula interface reads the response from the same frame as the
# predictors, so the (log) charges are attached to the training frame first.
X_train["charges"] = y_train
full_formula = "charges ~ age + bmi + C(sex) + C(smoker) + C(children) + C(region)"
model_train = smf.ols(formula=full_formula, data=X_train).fit()

In [None]:
# Display the summary of the fitted full model.
model_train.summary()

We see that $R^2_{train}$ = **0.766**, which is not very high. We also need to check it on the test set.

In [None]:
# Training-set fit quality.
print("Model adj.R^2 = {:.3f} (Train)".format(model_train.rsquared_adj))
mse_train = mean_squared_error(y_train, model_train.fittedvalues)
print("Total MSE = {:.3f} (Train)".format(mse_train))

# Test-set fit quality; predict once and reuse the result for both metrics.
y_test_pred = model_train.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
# df_model is the number of estimated slope coefficients (every dummy column
# of the C(...) factors counts, the intercept does not), which is the `p`
# the adjusted-R^2 formula expects. A hand-counted constant undercounts here.
test_r2_adj = adj_r2(test_r2, X_test, model_train.df_model)
# The value printed is the adjusted R^2, so label it as such (matches the train line).
print("Model adj.R^2 = {:.3f} (Test)".format(test_r2_adj))
print("Total MSE = {:.3f} (Test)".format(mse_test))

### Linear regression assumptions

The residual distribution differs significantly from normal. However, we have a lot of observations, so this may not be critical.

In [None]:
# Q-Q plot of the training residuals, titled with the Shapiro-Wilk test result.
plt.rc("figure", figsize=(10, 10))
train_resid = model_train.resid
qqPlot(train_resid)
norm_test = stats.shapiro(train_resid)
plt.title("Statistic = %.3f, p-value = %.3g" % norm_test)

In [None]:
# Bar plot of Cook's distance for each observation used to fit the model.
cooks_distances_stat_models(model_train)

We see strange patterns in the residual distribution, and many of the residuals deviate by more than 2 sigma.

In [None]:
# Standardized residuals against fitted values, with guide lines at -2 and 2.
resid_distribution(model_train.fittedvalues, model_train.resid, scale=True)

## Selecting best model

Here I tried to build the model "charges ~ (age + C(smoker) + children + bmi + C(sex))**2".

In [None]:
# Rebuild the design data from a fresh copy of the cleaned dataset.
subset = insur_data.copy()
y = np.log(subset["charges"])
X = subset.drop(columns="charges")

# Integer-encode the string categories (one encoder, refitted per column).
le = LabelEncoder()
for factor in ("sex", "smoker", "region"):
    X[factor] = le.fit_transform(X[factor])

# Standardize the continuous predictors to zero mean and unit standard deviation.
for numeric in ("age", "bmi"):
    X[numeric] = (X[numeric] - X[numeric].mean()) / X[numeric].std()

$R^2$ was calculated as mean adjusted $R^2$ for 50 different test populations. This yielded **0.815**.

In [None]:
# Average the adjusted test R^2 over several random train/test splits.
N_SPLITS = 50  # number of different random seeds / test populations

r2 = 0
for seed in range(N_SPLITS):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = seed)
    # The formula interface needs the response in the same frame as the predictors.
    X_train.loc[:, "charges"] = y_train
    model_train = smf.ols(formula="charges ~ (age + C(smoker) + children + bmi + C(sex))**2", data=X_train).fit()
    test_r2 = r2_score(y_test, model_train.predict(X_test))
    # df_model counts the estimated slope coefficients without the intercept,
    # which is the `p` of the adjusted-R^2 formula; a hand-counted constant
    # that includes the intercept overstates the penalty.
    test_r2_adj = adj_r2(test_r2, X_test, model_train.df_model)
    r2 += test_r2_adj
# After the loop, model_train and the train/test frames belong to the last seed.
print("adj.R^2 =", r2 / N_SPLITS)

In [None]:
# Display the summary of the model fitted in the last iteration of the loop above.
model_train.summary()

In [None]:
# Mean squared error of the current `model_train` on its training data ...
train_fitted = model_train.fittedvalues
mse_train = mean_squared_error(y_train, train_fitted)
print("Total MSE = {:.3f} (Train)".format(mse_train))

# ... and on the matching held-out test data.
test_predicted = model_train.predict(X_test)
mse_test = mean_squared_error(y_test, test_predicted)
print("Total MSE = {:.3f} (Test)".format(mse_test))

### Linear regression assumptions

In [None]:
# Q-Q plot of the training residuals, titled with the Shapiro-Wilk test result.
train_resid = model_train.resid
qqPlot(train_resid)
norm_test = stats.shapiro(train_resid)
plt.title("Statistic = %.3f, p-value = %.3g" % norm_test)

In [None]:
# Bar plot of Cook's distance for each observation used to fit the model.
cooks_distances_stat_models(model_train)

Now we see that only a few residuals deviate significantly. Patterns in the residuals are still observed, but the distribution is a bit better than in the previous case.

In [None]:
# Standardized residuals against fitted values, with guide lines at -2 and 2.
resid_distribution(model_train.fittedvalues, model_train.resid, scale=True)

I suggest that the model described above can be used for prediction, but with caution. The problem is that our data appear to contain a hidden variable with roughly 3 levels. Without knowing it, we cannot build a reliable model.