# **1. Introduction:**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Loading the necessary libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import seaborn as sns # plotting
sns.set()
import warnings # turn off warnings for final notebook
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input 
# directory.

import os # accessing directory structure

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when 
# you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current 
# session

There is 1 csv file in the current version of the dataset:

In [None]:
# Walk the Kaggle input directory tree and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# **2. Loading the Data:**

In [None]:
# Read the insurance CSV into a DataFrame, then preview its first rows
csv_path = '/kaggle/input/ushealthinsurancedataset/insurance.csv'
data = pd.read_csv(csv_path)
data.head()

# **3. Exploratory Data Analysis:**

# 3.1 Shape:
Let's start with the shape of the data.

In [None]:
# data.shape is a (rows, columns) tuple; unpack it once for the second message
n_rows, n_cols = data.shape
print(f'Shape of the data: {data.shape}')
print(f'There are {n_rows} rows and {n_cols} columns in the data.')

# 3.2 Data Types:                                
Let's understand the data types of various attributes.

In [None]:
# Print a summary of the DataFrame: one line per column with its non-null count and dtype
data.info()

In [None]:
# Collect the dtype of every column, then show each distinct dtype once:
column_dtypes = data.dtypes
column_dtypes.unique()

In [None]:
# Let's check out the individual columns:
# (`data.columns` is the Index holding the column labels of the DataFrame.)
data.columns

In [None]:
# Turn the column Index into a Series so it can be grouped, then bucket the
# column labels by the dtype of the column they name:
column_labels = data.columns.to_series()
g = column_labels.groupby(data.dtypes).groups
g

In [None]:
# Map each dtype name (e.g. 'int64', 'float64', 'object') to the column labels
# that have this dtype:
dt = {k.name: v for k, v in g.items()}

# Size the table from the data instead of assuming exactly three columns per
# dtype. A fixed ['Attr 1', 'Attr 2', 'Attr 3'] header makes the DataFrame
# constructor raise as soon as the widest dtype group has a different number
# of columns (for example when this cell is re-run after the categorical
# columns have been encoded as integers).
widest_group = max((len(cols) for cols in dt.values()), default=0)
attr_labels = [f'Attr {i}' for i in range(1, widest_group + 1)]

# One row per dtype; groups shorter than the widest one are padded with
# missing values by pandas.
attributes_by_datatype = pd.DataFrame(
    [list(cols) for cols in dt.values()],
    index = list(dt.keys()),
    columns = attr_labels,
)
attributes_by_datatype

We observe that 'age'(int64), 'bmi'(float64) and 'charges'(float64) are numerical attributes; and 'sex', 'smoker', 'region' and 'children' are categorical attributes.

We observe that the 'children' column is being treated as an integer datatype, even though it contains categorical information.

In [None]:
# Distinct values of 'children', listed in ascending order:
children_values = list(data['children'].unique())
children_values.sort()
children_values

In [None]:
# Distinct values of 'region', listed in ascending order:
region_values = list(data['region'].unique())
region_values.sort()
region_values

In [None]:
# Distinct values of 'smoker' (smoking status), listed in ascending order:
smoker_values = list(data['smoker'].unique())
smoker_values.sort()
smoker_values

In [None]:
# Distinct values of 'sex' (i.e. gender), listed in ascending order:
sex_values = list(data['sex'].unique())
sex_values.sort()
sex_values

# 3.3. Data Cleaning:
Let's check if the DataFrame contains any missing or null values.

In [None]:
# Flag, per column, whether it holds at least one missing value;
# columns that do (True) are sorted to the top.
has_missing = data.isna().any()
has_missing.sort_values(ascending=False)

There are no missing values in the DataFrame.

# 3.4. Summary and Distribution of attributes:

Before summarising the data, we need to encode the string-valued columns ('sex', 'smoker' and 'region') as numbers.

In [None]:
# Integer code for every label of each string-valued column. The codes are
# arbitrary identifiers; for 'region' in particular their order has no meaning.
category_codes = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'yes': 1, 'no': 0},
    'region': {'northwest': 1, 'southeast': 2, 'southwest': 3, 'northeast': 4},
}

for column, codes in category_codes.items():
    # A numeric column means this cell has already run. Looking the integer
    # codes up in the label dictionary again would replace every value with a
    # missing value, so leave the column untouched to keep the cell re-runnable.
    if pd.api.types.is_numeric_dtype(data[column]):
        continue

    encoded = data[column].map(codes)

    # Series.map yields NaN for labels that are not in the dictionary. Fail
    # loudly on such labels instead of letting missing values leak into the
    # statistics and plots below. Values that were already missing stay missing.
    unmapped = data.loc[encoded.isna() & data[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected values in column '{column}': {list(unmapped)}")

    data[column] = encoded

data.head()

We can get an overview of the important statistics of the dataset through the describe() method:

In [None]:
# Summary statistics, transposed so that each attribute is a row
summary_stats = data.describe()
summary_stats.T

# 3.5. Correlation:
Let's study the correlation between different attributes in our dataset.

In [None]:
# Open a wide figure first, then draw the pairwise correlation matrix of all
# columns on it as an annotated heatmap.
plt.figure(figsize=(16, 8))
corr = data.corr()
sns.heatmap(data=corr, cmap='viridis', annot=True)

**Observation:**
* From the correlation heatmap, we can conclude that smoking habit is strongly positively correlated with premium charges, while the age and BMI of the insured show only a weak positive correlation with premium charges.

Let's study the correlation between different attributes with the premium charges in our dataset.

In [None]:
# Keep only the 'charges' column of the correlation matrix and order the
# attributes from the most positive to the most negative correlation.
charges_column = data.corr()[['charges']]
corr = charges_column.sort_values(by='charges', ascending=False)

# Tall figure: one annotated row per attribute, colour scale fixed to [-1, 1].
plt.figure(figsize=(8, 12))
sns.heatmap(data=corr, annot=True, cmap='BrBG', vmin=-1, vmax=1)

**Observation:**
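* From this correlation heatmap, we can conclude that smoking habit is strongly positively correlated with premium charges, that sex and children have a very weak positive correlation with premium charges, and that region has the weakest correlation with premium charges. Note that region is a nominal attribute encoded with arbitrary integer codes, so its correlation coefficient should not be read as a meaningful linear relationship.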

An interesting relationship between premium charges, age, BMI and smoking status (smoker / non-smoker) can be seen in this graph, which plots charges against age, colours each point by BMI and sizes it by smoking status:

In [None]:
# Marker area encodes smoking status. Multiplying the 0/1 smoker code by 25
# alone gives non-smokers an area of zero, which hides them from the plot, so
# add a small base size: smokers keep area 25, non-smokers get area 5.
SMOKER_SIZE, NON_SMOKER_SIZE = 25, 5
marker_sizes = data["smoker"] * (SMOKER_SIZE - NON_SMOKER_SIZE) + NON_SMOKER_SIZE

# Charges against age, coloured by BMI.
# NOTE(review): sharex=False is presumably the usual workaround that keeps the
# x-axis label visible when pandas adds a colorbar - confirm before removing.
ax = data.plot(kind="scatter", x="age", y="charges",
    s=marker_sizes, figsize=(14,10),
    c='bmi', cmap=plt.get_cmap("jet"), colorbar=True,
    sharex=False)

# Empty proxy scatters give the legend one entry per marker size without
# adding any data points to the plot.
for size, status in ((SMOKER_SIZE, "smoker"), (NON_SMOKER_SIZE, "non-smoker")):
    ax.scatter([], [], s=size, color="gray", label=status)

ax.set_title("Premium charges by age, coloured by BMI and sized by smoking status")
ax.legend();