In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Any results you write to the current directory are saved as output.

## Multiple Linear Regression to predict the health insurance cost based on a number of input features

### Importing the libraries 

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

### Loading the dataset 


In [None]:
# Location of the insurance CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/health-insurance-cost-prediction/insurance.csv'

# Read the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Gather some information regarding the different columns in the dataset 


### EDA

In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

Frequency distribution of the different categories in the categorical variables 

In [None]:
# Number of rows per distinct value of 'sex', most frequent first.
df['sex'].value_counts()

In [None]:
# Number of rows per distinct value of 'smoker', most frequent first.
df['smoker'].value_counts()

In [None]:
# Number of rows per distinct value of 'region', most frequent first.
df['region'].value_counts()

Check the number of distinct categories within a categorical feature 

In [None]:
# Number of distinct (non-null) categories in 'region'. nunique() gives the
# same number as counting the rows of value_counts(), without building the
# intermediate frequency table.
df['region'].nunique()

Visualizing categorical features using barplots 

In [None]:
# Count how many rows fall into each region, then draw one bar per region.
region_count = df['region'].value_counts()
# x and y must be passed by keyword: seaborn >= 0.12 made them keyword-only,
# so the positional form raises a TypeError there.
sns.barplot(x=region_count.index, y=region_count.values, alpha=0.9)
plt.title("Frequency distribution of region")
plt.ylabel("Frequency")
plt.xlabel("Region")
plt.show()

Similarly, we can plot a pie chart using matplotlib to visualize the share of each category in the region column.

In [None]:
# Region names in category order; they are used as the wedge labels.
region_as_category = df['region'].astype('category')
labels = region_as_category.cat.categories.tolist()

# Row count per region, re-ordered so that sizes[i] belongs to labels[i].
counts = df['region'].value_counts()
sizes = counts.loc[labels].tolist()

fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%')
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.show()

#### Encoding the categorical data 

There are different ways to encode categorical data. Some of them are:

- Replacing values
- Encoding labels
- One-hot encoding
- Binary encoding
- Backward difference encoding
- Miscellaneous features
    

#### Replacing values

The most basic method is to replace the categories with the desired numbers. This can be achieved with the pandas replace() function. We are free to choose whichever number is assigned to each category. We can encode all the categories of a column directly as shown below:


In [None]:
# Give every region label a 1-based integer code, following the order of `labels`.
region_codes = {name: code for code, name in enumerate(labels, start=1)}
replace_map = {'region': region_codes}
replace_map

Now we replace the categories with the numbers defined in `replace_map` and store the result in a new dataframe.

In [None]:
# DataFrame.replace returns a new frame when called without inplace=True, so
# no explicit copy is needed and `df` keeps its original string labels.
df_replace = df.replace(replace_map)
df_replace.head()

#### Label encoding 

In [None]:
# Work on a copy so the original `df` keeps its string labels.
df_le = df.copy()
# `.cat.codes` numbers the categories in sorted (alphabetical) order starting
# at 0, so the codes run from 0 to n_categories - 1; missing values get -1.
df_le['region'] = df_le['region'].astype('category').cat.codes
df_le.head()


We can also achieve the same thing using sklearn's LabelEncoder 


In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the region labels and store the resulting integer codes
# in a new `region_code` column; the original `region` column is kept as is.
le = LabelEncoder()
df_labelenc = df.assign(region_code=le.fit_transform(df['region']))
df_labelenc.head()

#### One hot encoding 
The strategy here is to convert each category into a new column and assign a 1 or 0 value to that column. We can use pandas' get_dummies() function to perform one-hot encoding. Here we pass it three arguments: the dataframe to encode, the columns to encode, and the prefix argument, which specifies the prefix added to the names of the new columns generated by the encoding.

In [None]:
# One-hot encode `region`: the column is replaced by one `region_<category>`
# indicator column per category. get_dummies returns a new frame, so `df`
# itself is not modified.
df_onehot = pd.get_dummies(df, columns=['region'], prefix=['region'])
df_onehot.head()