In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

**Introduction**

This kernel aims to predict medical insurance charges from the collected data.


<font color= "red">
Content:

[1-Load and Check](#1)

[2-Variable Description](#2)
    
*    [Categorical Variable Analysis](#9)
    
*    [Numerical Variable Analysis](#10)
    

[3-Basic Data Analysis](#3)
    
[4-Outlier Data Detection](#4)
    
[5-Missing Value](#5)
    
[6-Visualization](#6)
    
*     [Heatmap](#11)
     
*     [Bar plot](#12)
     
*     [Pair plot](#13)
     
*     [Joint plot](#14)
     
*     [Scatter plot](#15)
    
[7-Feature Engineering](#7)
    
[8-Modelling](#8)
    
*     [Linear regression](#16)
     
*     [Ridge Regression](#17)
     
*     [Lasso Regression](#18)

<a id= "1"></a><br>
# 1-LOAD AND CHECK DATA

In [None]:
# Load the insurance dataset from the Kaggle input directory and preview the first rows.
data=pd.read_csv("/kaggle/input/insurance/insurance.csv")
data.head()

In [None]:
data.shape  # (rows, columns): the dataset has 1338 rows and 7 columns

In [None]:
data.describe()  # summary statistics; the mean age of the participants is about 39.2

In [None]:
data.info()  # the columns have int, object and float dtypes; they are examined in the following sections

<a id= "2"></a><br>
# 2-VARIABLE DESCRIPTION

Columns

age: age of primary beneficiary

sex: insurance contractor gender, female, male

bmi: Body mass index, an objective index of body weight relative to height (kg / m ^ 2), computed as weight divided by height squared;
it indicates whether weight is relatively high or low for a given height, ideally 18.5 to 24.9

children: Number of children covered by health insurance / Number of dependents

smoker: whether the beneficiary smokes

region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

charges: Individual medical costs billed by health insurance

<a id= "9"></a><br>
# categorical variable analysis

In [None]:

# Count the records in each "sex" category and draw the counts as bars.
sex_counts = data["sex"].value_counts()
plt.figure(figsize=(3, 5))
plt.bar(sex_counts.index, sex_counts)

In [None]:
def bar(variable):
    """Draw a bar chart of category frequencies for one column of ``data``.

    Parameters
    ----------
    variable : str
        Name of a column in the module-level ``data`` frame whose distinct
        values are counted and plotted.

    The figure is shown immediately; nothing is returned.
    """
    counts = data[variable].value_counts()
    plt.figure(figsize=(4, 5))
    # Title comes from the argument, not from a caller's loop variable `i`,
    # so the function also works when it is called outside that loop.
    plt.title(variable)
    plt.bar(counts.index, counts)
    plt.xticks(counts.index, counts.index.values)
    plt.ylabel("Frequency")
    plt.show()


In [None]:
variables=["sex","smoker","region"]   # bar plots are used for the categorical variables

# Draw one frequency bar chart per categorical column.
for i in variables:
        
    bar(i)
    

<a id= "10"></a><br>
# numerical variable analysis

In [None]:

    
    
def histogram(variables):
    """Plot a histogram of one column of the module-level ``data`` frame.

    Parameters
    ----------
    variables : str
        Name of the column to plot (a single column name, despite the plural).

    The figure is shown immediately; nothing is returned.
    """
    values = data[variables]

    plt.figure(figsize=(10, 8))
    plt.title("{} distribution with histogram".format(variables))
    plt.hist(values)
    # The tick locations are left to matplotlib: forcing one tick per distinct
    # value makes the axis unreadable for columns with many distinct values.
    plt.xlabel(variables)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Histograms for the numerical columns.
variable = ["age", "bmi", "children", "charges"]

for column in variable:
    histogram(column)

<a id= "3"></a><br>
# 3-BASIC DATA ANALYSIS

In [None]:

# Mean charges per age, with the highest-charging ages first.
data_x = (
    data[["age", "charges"]]
    .groupby(["age"], as_index=False)
    .mean()
    .sort_values(by="charges", ascending=False)
)
data_x.head()

In [None]:
# Mean charges per gender, highest first.
data_y = (
    data[["sex", "charges"]]
    .groupby(["sex"], as_index=False)
    .mean()
    .sort_values(by="charges", ascending=False)
)
data_y.head()

In [None]:
# Mean charges per number of children, ordered by number of children (descending).
data_z = (
    data[["children", "charges"]]
    .groupby(["children"], as_index=False)
    .mean()
    .sort_values(by="children", ascending=False)
)
data_z.head()

<a id= "4"></a><br>
# 4-OUTLIER DATA DETECTION

In [None]:
# One box plot per numerical column; the plots show that the bmi and charges
# columns contain outlier values.
variable = ["age", "bmi", "charges", "children"]
for column in variable:
    sns.boxplot(x=data[column])
    plt.show()

In [None]:
# Pull out the bmi column on its own and check how many values it holds.
bmi=data["bmi"]
bmi.shape

In [None]:
# Outlier fences for bmi: anything further than 1.5 * IQR below the first
# quartile or above the third quartile is flagged.
bmi = data["bmi"]

Q1, Q3 = bmi.quantile(0.25), bmi.quantile(0.75)
IQR = Q3 - Q1

up_limit = Q3 + 1.5 * IQR
down_limit = Q1 - 1.5 * IQR

# Boolean mask of the flagged rows ("aykiri" is Turkish for "outlier").
aykiri = (bmi > up_limit) | (bmi < down_limit)
                  

    

In [None]:
bmi_aykiri=bmi[aykiri]  # the bmi values selected by the outlier mask ("aykiri" = outlier)
bmi_aykiri

In [None]:
# Keep only the bmi values that are not above up_limit and not below down_limit.
clear_bmi=bmi[~((bmi>(up_limit))|(bmi<(down_limit)))]     # 9 bmi values are outliers (per the original run)
clear_bmi

In [None]:
# Outlier fences for charges, using the same 1.5 * IQR rule.
# Note that Q1, Q3, IQR, up_limit, down_limit and aykiri are overwritten here.
charges = data["charges"]

Q1, Q3 = charges.quantile(0.25), charges.quantile(0.75)
IQR = Q3 - Q1

up_limit = Q3 + 1.5 * IQR
down_limit = Q1 - 1.5 * IQR

# Boolean mask of the flagged rows ("aykiri" is Turkish for "outlier").
aykiri = (charges > up_limit) | (charges < down_limit)

    

In [None]:
charges_aykiri=charges[aykiri]  # 139 values are outliers (per the original run)
charges_aykiri

In [None]:
# Keep only the charges that are not above up_limit and not below down_limit.
clear_charges=charges[~((charges>(up_limit))|(charges<(down_limit)))]  # 139 values are outliers (per the original run)
clear_charges

<a id= "5"></a><br>
# 5-MISSING VALUE

In [None]:
# MISSING VALUES

data.isnull().sum()    # per-column count of nulls; the dataset does not contain missing values

<a id= "6"></a><br>
# 6-VISUALIZATION 

<a id= "11"></a><br>
# heatmap

In [None]:

# Correlation heatmap of the numerical columns, annotated with the coefficients.
plt.figure(figsize=(8,6))
variable=["age","bmi","charges","children"]

sns.heatmap(data[variable].corr(),cmap="BuPu",annot=True,fmt=".2f");    # other cmap options: Reds, Oranges, BuGn, BuPu, ...


# The heatmap suggests a positive correlation between age and charges, i.e. insurance charges tend to rise with age.
# NOTE(review): check the annotated coefficient before describing the correlation as "strong".

<a id= "12"></a><br>
# bar plot

In [None]:
from matplotlib import rcParams
# Mean charges for each age, drawn as a bar chart.
# `catplot` is the current name of the long-deprecated `factorplot`.
sns.catplot(x="age",y="charges",data=data,kind="bar")
plt.xlabel("observation participant ages")
plt.ylabel("Medical Insurance Prices")

# plt.show() renders the figure; plt.plot() with no arguments only adds an empty line plot.
plt.show()

In [None]:
# Compare the average charge per gender.
# Passing the raw rows to plt.bar draws one bar per record on top of each other,
# so only the largest charge per gender would be visible; aggregate first instead.
mean_charges_by_sex = data.groupby("sex")["charges"].mean()

plt.figure(figsize=(3,4))
plt.bar(x=mean_charges_by_sex.index,height=mean_charges_by_sex.values)
plt.xlabel("sex")
plt.ylabel("Mean charges")
plt.show()

In [None]:
# Same comparison with seaborn, which produces a better-looking chart than plain matplotlib here.
plt.figure(figsize=(3,4))
sns.barplot(x="sex",y="charges",data=data);

In [None]:
# Bar plot of charges against bmi.
# NOTE(review): bmi is presumably a continuous column; if so this draws one bar per
# distinct bmi value and the x axis is unreadable — consider binning bmi first.
plt.figure(figsize=(6,8))
sns.barplot(x="bmi",y="charges",data=data);

<a id= "13"></a><br>
# pair plot

In [None]:
# Pairwise relationships between the numerical columns, coloured by number of children.
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made beforehand (it would only leave an empty figure behind).
sns.pairplot(data,hue="children")
plt.show()

<a id= "14"></a><br>
# joint plot

In [None]:
# Joint distribution of age and charges with a fitted regression line.
sns.jointplot(
    data=data,
    x="age",
    y="charges",
    kind="reg",
    truncate=False,
    color="blue",
)

In [None]:
# Joint distribution of bmi and charges with a fitted regression line.
sns.jointplot(
    data=data,
    x="bmi",
    y="charges",
    kind="reg",
    truncate=False,
    color="orange",
)

<a id= "15"></a><br>
# scatter plot

In [None]:

# Charges against bmi, coloured by smoking status.
plt.figure(figsize=(8,8))
# `marker` (singular) sets the glyph for every point; `markers` only takes
# effect together with a `style` variable, so the star would not be applied.
sns.scatterplot(x="bmi",y="charges",hue="smoker",data=data,marker="*",s=100);

In [None]:
# Distribution of charges for each number of children, split by region.
plt.figure(figsize=(8,8))
sns.boxplot(x="children",y="charges",hue="region",data=data);

<a id= "7"></a><br>
# 7-FEATURE ENGINEERING

In [None]:
from sklearn.preprocessing import LabelEncoder
lbe=LabelEncoder()
# Replace the gender labels with integer codes.
# No pd.get_dummies() call follows: it returns one column per category, and
# assigning that multi-column frame to the single "sex" column either raises or
# silently keeps only the first dummy, depending on the pandas version.
data["sex"]=lbe.fit_transform(data["sex"])

In [None]:
# Replace the smoker labels with integer codes (refits the shared LabelEncoder `lbe`).
data["smoker"]=lbe.fit_transform(data["smoker"])


In [None]:
# Replace the region labels with integer codes (refits the shared LabelEncoder `lbe`).
# NOTE(review): integer codes impose an arbitrary order on the regions, which the
# linear models below treat as a numeric scale — consider one-hot encoding instead.
data["region"]=lbe.fit_transform(data["region"])

In [None]:
# Re-check the column dtypes after encoding the categorical columns.
data.info()

<a id= "8"></a><br>
# 8-MODELLING

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold,cross_val_score

from sklearn.linear_model import LinearRegression ,Ridge ,LassoCV

from sklearn.svm import SVC

from sklearn.ensemble import RandomForestClassifier,VotingClassifier


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree  import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix,mean_squared_error,r2_score



In [None]:
# Target: the insurance charges. Features: every other column.
y = data["charges"]
X = data.drop(columns="charges")


In [None]:

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [None]:
# Report the number of rows in each part of the split.
split_parts = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for label, part in split_parts:
    print(label + "= ", len(part))

<a id= "16"></a><br>
# linear regression

In [None]:
# Fit ordinary least squares on the training split, then predict the held-out rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
linreg = LinearRegression().fit(X_train, y_train)
linreg_pred = linreg.predict(X_test)

In [None]:
# Root mean squared error of the linear model on the test set.
# Arguments follow the (y_true, y_pred) convention; MSE is symmetric, so the value is unchanged.
MSE = mean_squared_error(y_test, linreg_pred)
np.sqrt(MSE)

In [None]:
# r2_score expects (y_true, y_pred); R^2 is not symmetric, so swapping the
# arguments reports a different (wrong) score.
r2_score(y_test,linreg_pred)  # r2_score is used for regression problems, whereas accuracy_score is used for classification problems


<a id= "17"></a><br>
# ridge regression

In [None]:
# Ridge regression with default settings: fit on the training split,
# then predict the held-out rows.
ridge = Ridge().fit(X_train, y_train)
y_pred = ridge.predict(X_test)

In [None]:
# R^2 of the ridge predictions on the test set. r2_score expects (y_true, y_pred),
# and R^2 is not symmetric, so the true values must come first.
r2_score(y_test,y_pred)

<a id= "18"></a><br>
# lasso regression

In [None]:
# Lasso regression with built-in cross-validation over the regularisation path
# (default settings): fit on the training split, then predict the held-out rows.
lasso = LassoCV().fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)

In [None]:
# R^2 of the LassoCV predictions on the held-out test set (arguments are y_true, y_pred).
r2_score(y_test,lasso_pred)

In [None]:
# LASSO MODEL TUNING
# The hyperparameters tuned in the next cells are LassoCV's `n_alphas` and `max_iter`.
# (The interactive `?LassoCV` help lookup was removed; it is a development aid,
# not part of the analysis.)

In [None]:
# Exhaustive search over LassoCV's `n_alphas` (90..149) and `max_iter`.
# That is 60 * 4 = 240 candidate settings, each evaluated with 10-fold CV,
# i.e. 2400 fits of `lasso` plus the final refit — expect this cell to be slow.
params = [
    {
        "n_alphas": np.arange(90, 150),
        "max_iter": [100, 500, 1000, 3000],
    }
]

clf = GridSearchCV(estimator=lasso, param_grid=params, cv=10, n_jobs=-1, verbose=2)
clf.fit(X_train, y_train)

In [None]:
# Parameter setting selected by the grid search.
clf.best_params_

In [None]:
# Refit LassoCV with the parameters selected by the grid search. Reading them from
# `clf.best_params_` keeps this cell in sync with the search result instead of
# relying on values copied by hand from an earlier run.
lasso_tuned=LassoCV(**clf.best_params_).fit(X_train,y_train)
lasso_tuned_pred=lasso_tuned.predict(X_test)

In [None]:
# R^2 of the tuned lasso on the test set. r2_score expects (y_true, y_pred),
# and R^2 is not symmetric, so the true values must come first.
r2_score(y_test,lasso_tuned_pred)