In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import warnings 
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="1"></a><br>
## 1.Load and Check Data

In [None]:
data = pd.read_csv("/kaggle/input/insurance/insurance.csv")

In [None]:
data.head()

In [None]:
data.columns

In [None]:
data.describe()

<a id ="2"></a><br>
# Variable Description

* age: age of primary beneficiary

* sex: insurance contractor gender, female, male

* bmi: Body mass index, an objective measure of whether body weight is relatively high or low for a given height,
computed as weight divided by height squared (kg / m ^ 2); ideally 18.5 to 24.9

* children: Number of children covered by health insurance / Number of dependents

* smoker: whether the beneficiary smokes (yes / no)

* region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

* charges: Individual medical costs billed by health insurance

In [None]:
data.info()

In [None]:
# Encode the two binary columns as 0/1 (male -> 1, smoker "yes" -> 1).
# The already-encoded value 1 is accepted as well, so re-running this cell
# leaves the columns unchanged instead of silently resetting every row to 0.
data["sex"] = np.where(data["sex"].isin(["male", 1]),1,0)
data["smoker"] = np.where(data["smoker"].isin(["yes", 1]),1,0)

In [None]:
# Store the 0/1 indicator columns with the pandas "category" dtype.
data["sex"] = data["sex"].astype("category")
data["smoker"] = data["smoker"].astype("category")

In [None]:
data.head()

In [None]:
data.info()

* float64(2): bmi, charges
* int64(2): age, children
* object(1): region
* category(2): sex, smoker

<a id = "3"></a><br>
# Univariate Variable Analysis

*     Categorical Variable: sex,smoker and region
    
*     Numerical Variable:age,children,bmi and charges

In [None]:
def bar_plot(variable):
    """
    Draw a bar chart of how often each value occurs in one column of the
    global ``data`` frame, then print the same counts.

    input: variable - column name, ex: "sex"
    output: bar plot & value count
    """
    # frequency of every distinct value in the column
    counts = data[variable].value_counts()
    labels = counts.index

    # visualize: one bar per distinct value, ticks labelled with the values
    plt.figure(figsize=(9, 3))
    plt.bar(labels, counts)
    plt.xticks(labels, labels.values)
    plt.ylabel("Count")
    plt.title(variable)
    plt.show()

    print(f"{variable}:\n{counts}")

In [None]:
category1 = ["sex","smoker","region"]

for i in category1:
    bar_plot(i)

<a id = "5"></a><br>
## Numerical Variable

In [None]:
def hist_plot(variable):
    """
    Draw a 50-bin histogram of one column of the global ``data`` frame.

    input: variable - column name, ex: "age"
    output: histogram plot
    """
    values = data[variable]

    plt.figure(figsize=(9, 3))
    plt.hist(values, bins=50)
    plt.xlabel(variable)
    plt.ylabel("Count")
    plt.title(f"{variable} distribution with hist")
    plt.show()
    

In [None]:
category2 = ["age","children","bmi","charges"]
for i in category2:
    hist_plot(i)

 <a id="6"></a><br>
# Basic Data Analysis
* Sex - Charges
* Sex - bmi
* region - charges
* children - charges 
* smoker - charges

In [None]:
# Sex - Charges
data.groupby("sex")["charges"].agg("mean").reset_index()

In [None]:
# Sex - bmi
data.groupby("sex")["bmi"].agg("mean").reset_index()

In [None]:
# region - charges
data.groupby("region")["charges"].agg("mean").reset_index()

In [None]:
# children - charges
data.groupby("children")["charges"].agg("mean").reset_index()

In [None]:
# smoker - charges
data.groupby("smoker")["charges"].agg("mean").reset_index()

<a id="7"></a><br>
# Outlier Detection

In [None]:
def detect_outliers(data, features, threshold=2, iqr_factor=1.5):
    """
    Find rows that are IQR outliers in several features at once.

    A value is an outlier for a feature when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR``, where Q1/Q3
    are the 25th/75th percentiles of that feature and ``IQR = Q3 - Q1``.

    input:
        data       - DataFrame holding the columns named in ``features``
        features   - iterable of column names to check
        threshold  - a row is returned when it is an outlier in strictly more
                     than this many features (default 2, the original rule)
        iqr_factor - multiplier applied to the IQR to get the outlier step
                     (default 1.5, the original rule)
    output: list of index labels of the flagged rows, in order of first detection
    """
    # index label -> number of features in which that row is an outlier
    outlier_counts = Counter()

    for c in features:
        # 1st and 3rd quartile
        Q1, Q3 = np.percentile(data[c], [25, 75])
        # outlier step derived from the interquartile range
        outlier_step = (Q3 - Q1) * iqr_factor
        # detect outliers and their indices
        is_outlier = (data[c] < Q1 - outlier_step) | (data[c] > Q3 + outlier_step)
        # store indices
        outlier_counts.update(data[is_outlier].index)

    return [idx for idx, count in outlier_counts.items() if count > threshold]

In [None]:
data.loc[detect_outliers(data,["age","charges","bmi","children"])]

<a id="8"></a><br>
# Missing Value
* Find Missing Value

In [None]:
data.isnull().any()

<a id = "10"></a><br>
# Visualization

In [None]:
# Correlate the numeric columns only: "region" holds strings and "sex"/"smoker"
# are categoricals, and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
sns.heatmap(data.select_dtypes(include="number").corr(),annot = True,fmt=".2f");

In [None]:
# Sex - Charges
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x="sex",y="charges",data=data)
plt.show()

In [None]:
# Sex - bmi
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.barplot(x="sex",y="bmi",data=data)
plt.show()

In [None]:
# region - charges
sns.barplot(x ="region", y ="charges",data = data)
plt.show()

In [None]:
# smoker - charges
sns.barplot(x="smoker",y="charges",data=data)
plt.show()

In [None]:
dms = pd.get_dummies(data["region"])
dms

In [None]:
# train-test split: build the feature matrix X and the target y
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict
X = data.drop(["charges","region"],axis=1)
# Cast every feature to float: "sex"/"smoker" are categoricals and the region
# dummies may be bool, so the mixed frame would turn into an object-dtype array
# when converted to NumPy, which statsmodels' OLS refuses to fit.
X = pd.concat([X,dms],axis = 1).astype(float)
y = data[["charges"]]

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.20,random_state = 42)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
training = data.copy()
training.head()

# Building the Model with statsmodels

In [None]:
import statsmodels.api as sm
lm = sm.OLS(y_train,X_train)

In [None]:
model = lm.fit()

In [None]:
model.summary()

# Building the Model with scikit-learn

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
reg = LinearRegression()
model = reg.fit(X_train,y_train)

In [None]:
model.intercept_

In [None]:
model.coef_

# Prediction

In [None]:
y_pred = model.predict(X_train)

In [None]:
y_pred[0:10]

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# our training-set error: root of the mean squared error on the training rows
rmse = np.sqrt(mean_squared_error(y_train,model.predict(X_train)))
rmse

In [None]:
# our test-set error: root of the mean squared error on the held-out rows
rmse = np.sqrt(mean_squared_error(y_test,model.predict(X_test)))
rmse

# Model Tuning (Model Validation)

In [None]:
X

In [None]:
y

In [None]:
# Re-split with a different seed. All four names are rebound together so that
# X_test and y_test always refer to the same rows.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state = 144)

In [None]:
lm = LinearRegression()
model = lm.fit(X_train,y_train)

In [None]:
np.sqrt(mean_squared_error(y_train,model.predict(X_train)))

In [None]:
np.sqrt(mean_squared_error(y_test,model.predict(X_test)))

In [None]:
model.score(X_train, y_train)

## Cross validation

In [None]:
cross_val_score(model, X, y, cv =10,scoring="r2").mean()

In [None]:
cross_val_score(model, X_train, y_train, cv=10, scoring="r2").mean()

In [None]:
# my true training error: mean of the per-fold RMSEs from 10-fold
# cross-validation on the training set (the scorer returns negated MSE, hence the minus)
np.sqrt(-cross_val_score(model, 
                X_train, 
                y_train, 
                cv = 10, 
                scoring="neg_mean_squared_error")).mean()

In [None]:
np.sqrt(-cross_val_score(model, 
                X_test, 
                y_test, 
                cv = 10, 
                scoring="neg_mean_squared_error")).mean()