#  Medical Cost Personal Dataset

## Importing Libraries


In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

## Importing and Preprocessing Dataset

In [None]:
# Load the insurance CSV into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='../input/insurance/insurance.csv')

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Show the (rows, columns) dimensions of the DataFrame.
df.shape

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Count the missing values in every column.
df.isna().sum()

# Visualizing Dataset

## Analysis on basis of sex

In [None]:
# One scatter panel per sex value: age on the x-axis, charges on the y-axis.
fig, ax = plt.subplots(1, 2, sharex='col', sharey='row', figsize = (20, 8))
for panel, sex_value in enumerate(df.sex.unique()):
    subset = df[df["sex"] == sex_value]
    ax[panel].scatter(subset.age, subset.charges)
    ax[panel].set_title(sex_value)
    # Only the first (left-most) panel gets the y-axis label.
    if panel == 0:
        ax[panel].set_ylabel('Charges')
    ax[panel].set_xlabel('Age')
fig.suptitle("Scatter Plot of Age vs Charges depending Upon Sex")
fig.show()

In [None]:
# Pie chart of the share of male vs. female records.
fx = plt.subplots(figsize=(5, 5))
male_count = len(df[df["sex"] == "male"])
female_count = len(df[df["sex"] == "female"])
plt.pie(
    [male_count, female_count],
    labels=["Male", "Female"],
    autopct="%f",
    explode=[0, 0.1],  # pull the "Female" wedge slightly out of the pie
    radius=1,
    colors=["yellow", "royalblue"],
)
plt.title("Pie plot Describing Sex Percentages")
plt.show()

## Analysis on the basis of Age

### Adding a new age group Column

**Young Adults:** AGE between 18 and 30 (inclusive)

**Middle-Aged Adults:** AGE between 31 and 45 (inclusive)

**Old-Aged Adults:** AGE greater than 45


In [None]:
# Smallest age present in the dataset.
min(df["age"].unique())

In [None]:
# Largest age present in the dataset.
max(df["age"].unique())

In [None]:
def _age_group(age):
    """Return the age-group label for one age value (bounds are inclusive)."""
    if 18 <= age <= 30:
        return "Young Adults"
    if 31 <= age <= 45:
        return "Middle-Aged Adults"
    # Every value outside the two ranges above gets this label.
    return "Old-Aged Adults"


# Label every row and store the result as a new `age_group` column.
l = [_age_group(age) for age in df["age"]]
df["age_group"] = l

In [None]:
# Preview the first five rows, now including the `age_group` column.
df.head(n=5)

In [None]:
# Print how many rows fall into each age group.
print(df["age_group"].value_counts())

In [None]:
# Pie chart of how many records fall into each age group.
age_groups = df.age_group.unique()
# Count rows per group with a boolean mask. The previous `value_counts()[0]`
# used an integer key on a label-indexed Series, a positional fallback that
# pandas has deprecated.
l = [int((df["age_group"] == group).sum()) for group in age_groups]
fig = plt.subplots(figsize = (8, 8))
plt.pie(l,labels=age_groups,autopct="%f",explode=[0,0.1,0.1],radius=1,colors=["red","royalblue","yellow"])
plt.title("Pie plot Describing Age Percentages")
# Render the figure; the original bare plt.plot() call added nothing to it.
plt.show()

In [None]:
# Grouped bar chart: smoker vs. non-smoker counts for every age group.
fig, ax = plt.subplots(figsize = (15, 7))
age_groups = df.age_group.unique()
# Each group gets a 2-unit slot; the two 0.5-wide bars sit side by side in it.
slot_edges = np.arange(2, 2 * len(age_groups) + 2, 2)
x1 = slot_edges - 0.75
x2 = slot_edges - 0.25
# Count rows with boolean masks. The previous `value_counts()[0]` used an
# integer key on a label-indexed Series (a positional fallback pandas has
# deprecated) and raised when a combination had no rows; a mask sum yields 0.
y1 = [int(((df["smoker"] == "yes") & (df["age_group"] == group)).sum()) for group in age_groups]
y2 = [int(((df["smoker"] == "no") & (df["age_group"] == group)).sum()) for group in age_groups]
ax.bar(x1,y1,width=0.5,label='Smokers',color="Blue")
ax.bar(x2,y2,width=0.5,label='Non Smokers',color="Green")
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
ax.legend()
# The tick for each group sits midway between its two bars.
ax.set_xticks(x2-0.25)
ax.set_xticklabels(age_groups)
ax.set_ylabel('Count of Smokers and Non Smokers')
ax.set_xlabel('AGE GROUP')
ax.set_title('Count of Smokers and Non Smokers depending Upon Age Group')
# Render the figure; the original bare plt.plot() call added nothing to it.
plt.show()

In [None]:
# Distribution of BMI within each age group.
sns.boxplot(data=df, x="age_group", y="bmi")

In [None]:
# Distribution of charges within each age group.
sns.boxplot(data=df, x="age_group", y="charges")

## Analysis on the basis of BMI

### Adding a new bmi group Column

**Underweight:** BMI is less than 18.5

**Normal weight:** BMI is 18.5 to 24.9

**Overweight:** BMI is 25 to 29.9

**Obese:** BMI is 30 or more

In [None]:
def _bmi_group(bmi):
    """Return the BMI category label for a single BMI value.

    Thresholds are half-open (< 18.5, < 25, < 30, otherwise "Obese") so no
    value can fall into a gap between two categories.
    """
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal weight"
    # The original test here was `i>=55 and i<=29.9`, which can never be
    # true, so every BMI from 25 up was labelled "Obese".
    if bmi < 30:
        return "Overweight"
    return "Obese"


# Label every row and store the result as a new `bmi_group` column.
l = [_bmi_group(bmi) for bmi in df["bmi"]]
df["bmi_group"] = l

In [None]:
# Preview the first five rows, now including the `bmi_group` column.
df.head(n=5)

In [None]:
# Age vs. charges, with each point coloured by its BMI group.
sns.scatterplot(data=df, x="age", y="charges", hue="bmi_group")

In [None]:
# Grouped bar chart: smoker vs. non-smoker counts for every BMI group.
fig, ax = plt.subplots(figsize = (15, 7))
bmi_groups = df.bmi_group.unique()
# Each group gets a 2-unit slot; the two 0.5-wide bars sit side by side in it.
slot_edges = np.arange(2, 2 * len(bmi_groups) + 2, 2)
x1 = slot_edges - 0.75
x2 = slot_edges - 0.25
# Count rows with boolean masks. The previous `value_counts()[0]` used an
# integer key on a label-indexed Series (a positional fallback pandas has
# deprecated) and raised when a combination had no rows; a mask sum yields 0.
y1 = [int(((df["smoker"] == "yes") & (df["bmi_group"] == group)).sum()) for group in bmi_groups]
y2 = [int(((df["smoker"] == "no") & (df["bmi_group"] == group)).sum()) for group in bmi_groups]
ax.bar(x1,y1,width=0.5,label='Smokers',color="Blue")
ax.bar(x2,y2,width=0.5,label='Non Smokers',color="Green")
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
ax.legend()
# The tick for each group sits midway between its two bars.
ax.set_xticks(x2-0.25)
ax.set_xticklabels(bmi_groups)
ax.set_ylabel('Count of Smokers and Non Smokers')
ax.set_xlabel('BMI GROUPS')
ax.set_title('Count of Smokers and Non Smokers depending Upon BMI Group')
# Render the figure; the original bare plt.plot() call added nothing to it.
plt.show()

## Analysis on the basis of Region

In [None]:
# Number of rows recorded for each region.
df["region"].value_counts()

In [None]:
# Grouped bar chart: male vs. female counts for every region.
fig, ax = plt.subplots(figsize = (10, 6))
regions = df.region.unique()
# Each region gets a 2-unit slot; the two 0.5-wide bars sit side by side in it.
slot_edges = np.arange(2, 2 * len(regions) + 2, 2)
x1 = slot_edges - 0.75
x2 = slot_edges - 0.25
# Count rows with boolean masks. The previous `value_counts()[0]` used an
# integer key on a label-indexed Series (a positional fallback pandas has
# deprecated) and raised when a combination had no rows; a mask sum yields 0.
y1 = [int(((df["sex"] == "male") & (df["region"] == region)).sum()) for region in regions]
y2 = [int(((df["sex"] == "female") & (df["region"] == region)).sum()) for region in regions]
ax.bar(x1,y1,width=0.5,label='Male',color="Blue")
ax.bar(x2,y2,width=0.5,label='Female',color="Green")
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
ax.legend()
# The tick for each region sits midway between its two bars.
ax.set_xticks(x2-0.25)
ax.set_xticklabels(regions)
ax.set_ylabel('Count of Males and Females')
ax.set_xlabel('Regions')
ax.set_title('Count of Males and Females depending Upon Regions')
# Render the figure; the original bare plt.plot() call added nothing to it.
plt.show()

In [None]:
# One scatter panel per region: age on the x-axis, charges on the y-axis.
fig, ax = plt.subplots(1, 4, sharex='col', sharey='row', figsize = (20, 5))
for panel, region_name in enumerate(df.region.unique()):
    subset = df[df["region"] == region_name]
    ax[panel].scatter(subset.age, subset.charges)
    ax[panel].set_title(region_name)
    # Only the first (left-most) panel gets the y-axis label.
    if panel == 0:
        ax[panel].set_ylabel('Charges')
    ax[panel].set_xlabel('Age')
fig.suptitle("Scatter Plot of Age vs Charges depending Upon Regions")
fig.show()

In [None]:
# Grouped bar chart: smoker vs. non-smoker counts for every region.
fig, ax = plt.subplots(figsize = (15, 7))
regions = df.region.unique()
# Each region gets a 2-unit slot; the two 0.5-wide bars sit side by side in it.
slot_edges = np.arange(2, 2 * len(regions) + 2, 2)
x1 = slot_edges - 0.75
x2 = slot_edges - 0.25
# Count rows with boolean masks. The previous `value_counts()[0]` used an
# integer key on a label-indexed Series (a positional fallback pandas has
# deprecated) and raised when a combination had no rows; a mask sum yields 0.
y1 = [int(((df["smoker"] == "yes") & (df["region"] == region)).sum()) for region in regions]
y2 = [int(((df["smoker"] == "no") & (df["region"] == region)).sum()) for region in regions]
ax.bar(x1,y1,width=0.5,label='Smokers',color="Blue")
ax.bar(x2,y2,width=0.5,label='Non Smokers',color="Green")
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
ax.legend()
# The tick for each region sits midway between its two bars.
ax.set_xticks(x2-0.25)
ax.set_xticklabels(regions)
ax.set_ylabel('Count of Smokers and Non Smokers')
ax.set_xlabel('Regions')
ax.set_title('Count of Smokers and Non Smokers depending Upon Regions')
# Render the figure; the original bare plt.plot() call added nothing to it.
plt.show()

## Other Charts

In [None]:
# Histogram of charges with a KDE curve overlaid.
sns.displot(data=df, x="charges", kind="hist", kde=True, height=6, aspect=1)

In [None]:
# Draw seaborn's pairwise-relationship grid for the DataFrame.
sns.pairplot(df)

# Regression Models

## Preparing Dataset

In [None]:
# NOTE: `sex` is not referenced by any other cell visible in this notebook.
sex = ('male','female')
# Build the feature matrix by dropping the target and the two derived group
# columns by name. The previous positional slice `iloc[:, :-3]` only removed
# the right columns when exactly these three happened to be the last ones.
features = df.drop(columns=["charges", "age_group", "bmi_group"])
# One-hot encode the categorical columns.
x = pd.get_dummies(features, columns=["sex","region","smoker"])
x.head()

In [None]:
# Target: charges divided by the largest absolute charge in the dataset.
charges = df["charges"]
y = charges.div(charges.abs().max())
y

In [None]:
# Hold out 33% of the rows for testing. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training features only.
x_train = scaler.fit_transform(x_train)
# Reuse the training statistics for the test set. Calling fit_transform here
# would re-fit on the test data and scale it differently from the training data.
x_test = scaler.transform(x_test)

## Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split, then predict the test split.
reg = LinearRegression()
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

In [None]:
# R^2 of the predictions against the held-out targets.
score = r2_score(y_true=y_test, y_pred=y_pred)
print(score)

In [None]:
# Actual targets (x-axis) against predicted targets (y-axis).
plt.scatter(x=y_test, y=y_pred)