# Medical Cost Insurance Prediction




## Dataset

**Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12.0,9.0)

**Importing Dataset**

In [None]:
# Load the insurance CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the notebook.
df = pd.read_csv('../input/insurance/insurance.csv')
# Bare expression: as the last statement of the cell it is the cell's output.
df

In [None]:
df.tail()

**Describe**

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Frequency table of the distinct values in each column, one column at a
# time, with a row of asterisks printed after every table.
for column in df:
    counts = df[column].value_counts()
    print(counts, '*' * 20, sep='\n')

In [None]:
df.describe()

## Data Visualization

In [None]:
# Draw seaborn's pair plot for df with default settings, as a first visual
# overview of how the columns relate to each other.
sns.pairplot(df)

**Finding correlation between Age and Charges by plotting a scatter plot.**

In [None]:
# Scatter of charges against age; the title is set on the Axes that seaborn
# returns instead of through the pyplot state machine.
age_scatter_ax = sns.scatterplot(data=df, x='age', y='charges')
age_scatter_ax.set_title('Relation Between Age and Charges')
plt.show()

In [None]:
# Same age/charges pairing, this time with seaborn's fitted regression line.
age_reg_ax = sns.regplot(data=df, x='age', y='charges')
age_reg_ax.set_title('Regression line Between Age and Charges')
plt.show()

There is a positive relation between Age and Charges

**Finding correlation between BMI and Charges by plotting a scatter plot**


In [None]:
# Two separate figures for bmi vs charges: a plain scatter, then a scatter
# with a regression line. Each gets its own title and its own show() call.
for draw, heading in (
    (sns.scatterplot, 'Relation Between BMI and Charges'),
    (sns.regplot, 'Regression line Between BMI and Charges'),
):
    plt.title(heading)
    draw(x=df['bmi'], y=df['charges'])
    plt.show()

There is a positive relation between BMI and Charges, as we can see from the regression line.


**Finding correlation between Smokers and Charges by plotting a scatter plot and a swarm plot**

In [None]:
# Charges split by smoker status, drawn as two separate figures: a scatter
# plot followed by a swarm plot. The x variable here is 'smoker', so the
# titles name smoker status (the previous title mistakenly said "BMI").
plt.title('Relation Between Smoker and Charges')
sns.scatterplot(x=df['smoker'], y=df['charges'])
plt.show()

# Give the swarm plot its own title and flush it explicitly so it does not
# depend on the notebook's implicit end-of-cell rendering.
plt.title('Distribution of Charges by Smoker Status')
sns.swarmplot(x=df['smoker'], y=df['charges'])
plt.show()

In [None]:
# bmi vs charges again, with points coloured by the smoker column.
bmi_by_smoker_ax = sns.scatterplot(data=df, x='bmi', y='charges', hue='smoker')
bmi_by_smoker_ax.set_title('Relation Between BMI and Charges')
plt.show()

In [None]:
# Regression plot of charges against bmi, grouped (hue) by the smoker column
# so the fitted lines of the groups can be compared.
sns.lmplot(x = 'bmi',y = 'charges', hue = 'smoker' , data=df)

As we can see from the plots, smokers tend to pay higher charges than non-smokers.
The regression line for smokers is also steeper than the one for non-smokers.

**Finding correlation between Children and Charges by plotting a bar plot**


In [None]:
# Bar plot of charges for each value of the children column.
plt.title('Relation between Children and Charges')
sns.barplot(data=df, x='children', y='charges')

From the plot we can see that people with 2 or 3 children have higher charges.

**Finding correlation between Sex and Charges**

In [None]:
# Two figures for sex vs charges: a swarm plot, then a bar plot. The title is
# set once up front, so it lands on the first figure only.
plt.title('Relation between Sex and Charges')
for draw in (sns.swarmplot, sns.barplot):
    draw(data=df, x='sex', y='charges')
    plt.show()

There is not much difference between the costs paid by males and females. Thus the sex of a person does not necessarily determine the insurance charges they pay, so this feature could either be dropped or kept.

**Finding correlation between Region and Charges by plotting a bar plot**

In [None]:
# Bar plot of charges for each region.
region_bar_ax = sns.barplot(data=df, x='region', y='charges')
region_bar_ax.set_title('Relation between Region and Charges')
plt.show()

**Final Correlation plot**


In [None]:
# Annotated correlation heatmap. At this point sex, smoker and region have
# not been label-encoded yet (that happens in the Label Encoding section), so
# restrict the correlation to numeric columns explicitly: recent pandas
# versions raise on non-numeric columns instead of silently dropping them.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), cmap='Wistia', annot=True)

## Data Preprocessing

**Checking the missing values**

In [None]:
df.isnull().sum()

There are no missing values in the dataset

**Seeing the missing values in the heatmap**

In [None]:
# Heatmap of the boolean null mask of df (True where a cell is missing).
sns.heatmap(df.isnull())

**Seeing the percentage of missing values**

In [None]:
# Percentage of null entries per column: null count divided by the number of
# rows, then scaled to a percentage.
row_total = df.shape[0]
missing_val_per = df.isnull().sum().div(row_total).mul(100)
missing_val_per

**Outlier Detection using Z-Score Method**

In [None]:
def outlier_zscore(data, threshold=3.5):
    """Find the values in *data* whose absolute z-score exceeds *threshold*.

    The z-score of a value is ``(value - mean) / std`` where ``std`` is the
    population standard deviation (NumPy's default for ``np.std``).

    Args:
        data: One-dimensional numeric sequence (list, NumPy array or pandas
            Series).
        threshold: Absolute z-score above which a value counts as an
            outlier. Defaults to 3.5, the cut-off that used to be hard-coded.

    Returns:
        A ``(count, values)`` tuple: the number of outliers and the list of
        outlying values in their original order.

    Side effects:
        Prints the outlier list, and rebinds the module-level names
        ``outliers`` and ``zscore`` (kept for backward compatibility with
        code that reads them after the call).
    """
    global outliers, zscore

    values = np.asarray(data)

    # Empty input: there is nothing to score, and np.mean would emit a
    # warning and return NaN.
    if values.size == 0:
        outliers, zscore = [], []
        print(outliers)
        return 0, outliers

    spread = np.std(values)
    if spread == 0:
        # All values are identical, so none can be an outlier; avoid the
        # division by zero and report a z-score of 0 for each of them.
        scores = np.zeros(values.shape, dtype=float)
    else:
        scores = (values - np.mean(values)) / spread

    zscore = scores.tolist()
    outliers = values[np.abs(scores) > threshold].tolist()
    print(outliers)
    return len(outliers), outliers

In [None]:
## Checking whether age column has any outlier

In [None]:
age_outliers_number, age_outliers = outlier_zscore(df.age)

In [None]:
print(f"The number of outliers are {age_outliers_number} and the outliers are {age_outliers}")

In [None]:
df['age'].describe()

In [None]:
sns.boxplot(y = 'age', data = df)

Thus there is no outlier in the age column

In [None]:
## Checking whether bmi column has any outlier

In [None]:
df['bmi'].describe()

In [None]:
bmi_outliers_number, bmi_outliers = outlier_zscore(df.bmi)

In [None]:
print(f"The number of outliers are {bmi_outliers_number} and the outliers are {bmi_outliers}")

In [None]:
sns.boxplot(y = 'bmi', data = df)

In [None]:
# Cap the bmi outliers: every row whose bmi value was flagged by
# outlier_zscore is overwritten with 48.5 (the rows themselves are kept).
# A single boolean-mask .loc assignment writes into df itself; the previous
# chained form df['bmi'][num] = ... may write to a temporary copy and relied
# on the index labels matching the positional counter.
df.loc[df['bmi'].isin(bmi_outliers), 'bmi'] = 48.5


In [None]:
bmi_outliers_number, bmi_outliers = outlier_zscore(df.bmi)

In [None]:
print(f"And now the number of outliers are {bmi_outliers_number}")

In [None]:
df['bmi'].describe()

In [None]:
## Checking whether children column has any outlier

In [None]:
df['children'].describe()

In [None]:
chil_outliers_number, chil_outliers = outlier_zscore(df.children)


In [None]:
# Report the result for the children column (chil_* comes from the
# outlier_zscore call on df.children), not the earlier bmi result.
print(f"The number of outliers are {chil_outliers_number} and the outliers are {chil_outliers}")

In [None]:
sns.boxplot(y = 'children', data = df)

In [None]:
## Checking whether charges column has any outlier

In [None]:
df['charges'].describe()

In [None]:
charges_outliers_number, charges_outliers = outlier_zscore(df.charges)

In [None]:
print(f"The number of outliers are {charges_outliers_number} and the outliers are {charges_outliers}")

In [None]:
sns.boxplot(y = 'charges', data = df)

In [None]:
# Cap the charges outliers: every row whose charges value was flagged by
# outlier_zscore is overwritten with 55000.0 (the rows themselves are kept).
# A single boolean-mask .loc assignment writes into df itself; the previous
# chained form df['charges'][num] = ... may write to a temporary copy and
# relied on the index labels matching the positional counter.
df.loc[df['charges'].isin(charges_outliers), 'charges'] = 55000.0

In [None]:
charges_outliers_number, charges_outliers = outlier_zscore(df.charges)

In [None]:
print(f"And now the number of outliers are {charges_outliers_number}")

In [None]:
df['charges'].describe()

**Label Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
# Replace each categorical column with integer codes. The same encoder object
# is refitted for every column, so afterwards it only holds the mapping of
# the last column processed ('region').
for categorical_col in ('sex', 'smoker', 'region'):
    df[categorical_col] = le.fit_transform(df[categorical_col])

In [None]:
df.head()

## Multiple Linear Regression

**Dividing into features and target variable**

In [None]:
# Positional split: the last column is the target, everything before it is
# the feature matrix.
target_pos = df.shape[1] - 1
x = df.iloc[:, :target_pos]
y = df.iloc[:, target_pos]

In [None]:
x

In [None]:
y

**Spliting Dataset into train and test data**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing (test_size = 0.2); random_state is
# fixed at 51 so the split is the same on every run.
x_train , x_test , y_train , y_test = train_test_split(x, y, test_size = 0.2 , random_state = 51)

In [None]:
# Sanity check: print the shape of each of the four pieces of the split.
print(" Shape of x_train = ", x_train.shape)
print(" Shape of x_test = ", x_test.shape)
print(" Shape of y_train = ", y_train.shape)
print(" Shape of y_test = ", y_test.shape)

**Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to the test features.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [None]:
x_train

In [None]:
x_test

**Linear Regression Model Training**

In [None]:
from sklearn.linear_model import LinearRegression

**Create Model & Fit**

In [None]:
# Create the linear regression model and fit it on the scaled training
# features and the training targets.
lr = LinearRegression() 
lr.fit(x_train, y_train)

**Get Results**

In [None]:
# Fitted parameters. coef_ holds one coefficient per feature column; because
# the model was fitted on x_train after StandardScaler, the coefficients
# refer to the scaled features.
print("Intercept : " , lr.intercept_)
print("Slope : " , lr.coef_)

**Prediction**

In [None]:
# Predict on the held-out features and line the predictions up next to the
# true targets, one row per test sample.
y_pred = lr.predict(x_test)
y_test_df = pd.DataFrame({"Real Values": np.array(y_test)})
y_pred_df = pd.DataFrame({"Predicted Values": y_pred})
pd.concat([y_test_df, y_pred_df], axis="columns")

**Metrics**

In [None]:
from sklearn.metrics import r2_score , mean_squared_error

# Evaluate on the held-out split. R2 is a goodness-of-fit measure for
# regression, not a classification accuracy, so it is labelled as such. The
# RMSE is reported as well (in the units of the target) using the
# mean_squared_error import that was previously unused.
score = r2_score(y_test , y_pred)
rmse = np.sqrt(mean_squared_error(y_test , y_pred))
print("R2 Score : {}".format(score))
print("R2 Score (as percentage): {}%".format(score * 100))
print("RMSE : {}".format(rmse))