In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder,PolynomialFeatures, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNetCV, LinearRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
import warnings
# Install a global "ignore" filter: no category is given, so every warning raised
# after this point is silenced (including deprecation notices from the plotting
# and modelling libraries used below).
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, entry) for entry in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the insurance CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/insurance/insurance.csv")

In [None]:
# Preview of the data: first rows of the DataFrame (head() with its default row count)
df.head()

In [None]:
# Overview of the data: summary printed by DataFrame.info()
df.info()

In [None]:
# Kernel density estimate of the "charges" column
sns.kdeplot(df["charges"]);

In [None]:
# Histogram of the "charges" column with a KDE curve drawn on top
sns.histplot(df["charges"], kde=True);

In [None]:
# Number of rows per sex, split by smoking status.
# The columns are passed by name together with `data=`: the first positional
# parameter of countplot is not `x` in every seaborn release, and a positional
# Series combined with `hue` is rejected when it is interpreted as `data`.
sns.countplot(x='sex', hue='smoker', data=df);

In [None]:
# Charges against age, one line per sex; ci=None disables the confidence band.
fig, ax = plt.subplots(figsize=(15, 15))
sns.lineplot(data=df, x='age', y='charges', hue='sex', ci=None, ax=ax);

In [None]:
# Distribution of charges per sex, split by smoking status.
# x / y / hue are given as column names with `data=`; passing the two Series
# positionally fails on seaborn releases where boxplot accepts only `data`
# as a positional parameter.
plt.figure(figsize = (15, 15))
sns.boxplot(x = 'sex', y = 'charges', hue = 'smoker', data = df);
# Observation: being a smoker has an impact on medical charges

In [None]:
# Distribution of charges per region.
# x / y are given as column names with `data=`; passing the two Series
# positionally fails on seaborn releases where boxplot accepts only `data`
# as a positional parameter.
plt.figure(figsize = (15, 15))
sns.boxplot(x = 'region', y = 'charges', data = df);
# Observation: medical charges are higher in the northeast

In [None]:
# Smokers only: number of smokers in each region.
fig, ax = plt.subplots(figsize=(10, 10))
smokers_only = df[df['smoker'] == 'yes']
sns.countplot(data=smokers_only, x='smoker', hue='region', ax=ax);

In [None]:
# Charges against number of children, one line per sex; ci=None disables the confidence band.
fig, ax = plt.subplots(figsize=(15, 15))
sns.lineplot(data=df, x='children', y='charges', hue='sex', ci=None, ax=ax);
# Observation: from 0 to 4 children, medical charges grow with the number of children.
# These charges decrease from 5 children onwards.

In [None]:
# Regression of charges on BMI, fitted separately for each smoker group (no confidence band).
# lmplot is a figure-level function: it creates its own figure, sized through `height`,
# so no plt.figure() call precedes it (one would only leave an empty extra figure behind).
sns.lmplot(x = 'bmi', y = 'charges', hue = 'smoker', data = df, ci = None, height = 15);


Feature engineering

In [None]:
# Isolate the target column from the predictors
target = df['charges']
data = df.drop(columns = 'charges')
# Numeric predictors: every column with a numeric dtype
num = data.select_dtypes(include = 'number').columns.tolist()
# Categorical predictors: every remaining column. Selecting by "not numeric" rather
# than by `dtype == 'O'` keeps category / string / bool columns out of the numeric
# list, where the scaler applied later could not handle them.
cat = data.select_dtypes(exclude = 'number').columns.tolist()
print("Les variables de type catégorielle sont: ", cat)
print("Les variables de type numériques sont: ", num)

In [None]:
# Column preparation
#   - numeric columns are rescaled with MinMaxScaler
#   - categorical columns are one-hot encoded
t = [
    ('num', MinMaxScaler(), num),
    ('cat', OneHotEncoder(), cat),
]
cols_transform = ColumnTransformer(transformers=t)

# Data-preparation pipeline
#   1. column transformation defined above
#   2. degree-2 polynomial features, computed on ALL transformed columns
#      (scaled numeric and one-hot encoded alike)
#   3. PCA built with default arguments (no explicit number of components requested)
pca = PCA()
pipe = Pipeline(steps=[
    ('cols_transform', cols_transform),
    ('poly', PolynomialFeatures(degree=2)),
    ('pca', pca),
])

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training rows only, then apply it unchanged to the test rows
X_train_pipe = pipe.fit_transform(X_train)
X_test_pipe = pipe.transform(X_test)

In [None]:
# Linear regression model
reg_lin = LinearRegression()
# Search over the single tunable option: fit an intercept or not (5-fold CV)
params = dict(fit_intercept=[True, False])
grid = GridSearchCV(reg_lin, params, cv=5)
grid.fit(X_train_pipe, y_train)
print("[LINEAR REGRESSION]Les paramètres optimaux sont:", grid.best_params_)
# R2 score of the refitted best estimator on the test data
print(
    "[LINEAR REGRESSION] Le score R2 pour les données de test est: ",
    grid.score(X_test_pipe, y_test),
)

In [None]:
# ElasticNetCV model
# Candidate values for l1_ratio and alpha explored by the model's built-in 5-fold CV
l1_ratio = np.logspace(-1, 0, 12, endpoint = False)
alphas = np.logspace(-2, 0, 10)
# Model instantiation
reg_en = ElasticNetCV(l1_ratio = l1_ratio, alphas = alphas, cv = 5)
# Fit on the prepared training data
reg_en.fit(X_train_pipe, y_train)
# R2 score on the test data. It must come from reg_en itself: scoring with `grid`
# would report the linear-regression grid search again. X_test_pipe is the test set
# already transformed by the fitted pipeline, so it is not recomputed here.
print("[ELASTICNETCV] Le score R2 pour les données de test est: ", reg_en.score(X_test_pipe, y_test))


In [None]:
# SVR model
svr = SVR()
# Hyper-parameter search (5-fold CV).
# `degree` only affects the polynomial kernel, so it is searched for 'poly' alone;
# crossing it with 'linear' and 'rbf' would refit identical models for every degree.
C_values = np.logspace(-1, 2, 5)
params = [
    {'kernel': ['linear', 'rbf'], 'C': C_values},
    {'kernel': ['poly'], 'degree': [1, 2, 3, 4], 'C': C_values},
]
grid_svr = GridSearchCV(estimator = svr, param_grid = params, cv = 5)
grid_svr.fit(X_train_pipe, y_train)
print("[SVR] Les paramètres optimaux sont:" ,grid_svr.best_params_)
# R2 score of the refitted best estimator on the test data
print("[SVR] Le score R2 pour les données de test est: ", grid_svr.score(X_test_pipe, y_test))
