In [1]:
#importing all necessary library
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from pandas.core.dtypes.common import is_numeric_dtype
import warnings #for ignore waring
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('insurance.csv') #load dataset 
data.head() #check top(5) rows of dataframe

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
#statistic summary

data.describe() 

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [4]:
#check how many rows and columns there in dataset 
data.shape

(1338, 7)

In [5]:
#check all feature name (colmuns name)

data.columns 

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [6]:
'''
check all data type of dateFrame. This will helpful for encoding.
Encoding only categorical variables into numerical values.
'''
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [7]:
#check unique labels 
print('',data['sex'].unique())
print(data['smoker'].unique())
print(data['region'].unique())

 ['female' 'male']
['yes' 'no']
['southwest' 'southeast' 'northwest' 'northeast']


In [8]:
#check null values
data.isnull().sum() 

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
# make copy of main dataset beacuse we use multiple time this same data

data1 = data.copy()
data2 = data.copy()
data3 = data.copy()
data4 = data.copy()
data5 = data.copy()
data6 = data.copy()
data7 = data.copy()
data8 = data.copy()
data9 = data.copy()
data10 = data.copy()

# Label Encoder 

In [10]:
#import LabelEncoder from sklearn preprocessing

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder() # make LabelEncoder object 

In [11]:
#fit_transform use Fit label encoder and return encoded labels

data1.sex=le.fit_transform(data1[["sex"]])

In [12]:
print('Main Data before encoding')
print('-------------------------')
print(data.head())
print('\n\n\nAfter LabelEncoding')
print('---------------------')
print(data1.head())

Main Data before encoding
-------------------------
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520



After LabelEncoding
---------------------
   age  sex     bmi  children smoker     region      charges
0   19    0  27.900         0    yes  southwest  16884.92400
1   18    1  33.770         1     no  southeast   1725.55230
2   28    1  33.000         3     no  southeast   4449.46200
3   33    1  22.705         0     no  northwest  21984.47061
4   32    1  28.880         0     no  northwest   3866.85520


In [13]:
# check total labels in variable
data1['sex'].unique()

array([0, 1])

In [14]:
#this one also check the unique labels
print(le.classes_)

['female' 'male']


In [15]:
# Print the encoded values and the corresponding original categories

for value, category in zip(data1.sex, data['sex']):
    print(f'{value}: {category}')   

0: female
1: male
1: male
1: male
1: male
0: female
0: female
0: female
1: male
0: female
1: male
0: female
1: male
0: female
1: male
1: male
0: female
1: male
1: male
1: male
0: female
0: female
1: male
0: female
1: male
0: female
0: female
0: female
1: male
1: male
1: male
0: female
0: female
1: male
1: male
1: male
0: female
1: male
1: male
1: male
0: female
0: female
1: male
0: female
1: male
1: male
0: female
0: female
0: female
1: male
0: female
0: female
1: male
1: male
0: female
1: male
0: female
1: male
0: female
0: female
1: male
1: male
1: male
0: female
0: female
0: female
0: female
1: male
0: female
1: male
0: female
1: male
0: female
1: male
1: male
1: male
0: female
1: male
0: female
0: female
1: male
0: female
1: male
0: female
0: female
1: male
0: female
0: female
0: female
0: female
0: female
0: female
1: male
1: male
0: female
0: female
0: female
1: male
1: male
1: male
0: female
1: male
0: female
0: female
0: female
1: male
0: female
1: male
1: male
1: male
1: male


# Loop concept for LabelEncoder
# Type 1

In [16]:
data2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
for col in data2.columns:
    if data2[col].dtype== np.number:
        continue
    
    else:
        data2[col] = le.fit_transform(data2[col])


In [18]:
data2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


In [19]:
#print labels

print('Sex ',data2['sex'].unique())
print('Smoker ',data2['smoker'].unique())
print('Region',data2['region'].unique())

Sex  [0 1]
Smoker  [1 0]
Region [3 2 1 0]


# Type 2

In [20]:
for col in data3.columns:
    if data3[col].dtype != np.number:
        data3[col] = le.fit_transform(data3[col])
        

In [21]:
data3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


# Type 3

In [22]:

for col in data4.columns:
    if data4[col].dtype == object:
        data4[col] = le.fit_transform(data4[col])

In [23]:
data4.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


#  One Hot Encoding

In [24]:
dummy = pd.get_dummies(data5['sex'], drop_first=True) #drop_first use for drop first dummy variable 

In [25]:
data5.drop('sex',inplace=True,axis=1) # drop main variable , we don't need main variable

In [26]:
dummy.head()

Unnamed: 0,male
0,0
1,1
2,1
3,1
4,1


In [27]:
data5 = pd.concat([data5,dummy],axis=1) # add dummy variable and after drop dataset 

In [28]:
data5.head()

Unnamed: 0,age,bmi,children,smoker,region,charges,male
0,19,27.9,0,yes,southwest,16884.924,0
1,18,33.77,1,no,southeast,1725.5523,1
2,28,33.0,3,no,southeast,4449.462,1
3,33,22.705,0,no,northwest,21984.47061,1
4,32,28.88,0,no,northwest,3866.8552,1


# Loop 

In [29]:
for col in data6.columns:
    if is_numeric_dtype(data6[col]): #is_numeric_dtype is pandas keyword which check is this is number or not 
        continue
    else:
        one = pd.get_dummies(data6[col], drop_first=True,prefix='Area')
        data6.drop(data6[[col]], axis=1, inplace=True)
        data6 = pd.concat([data6, one], axis=1)

In [30]:
data6.head()

Unnamed: 0,age,bmi,children,charges,Area_male,Area_yes,Area_northwest,Area_southeast,Area_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


# Ordinal Encoder

In [31]:
from sklearn.preprocessing import OrdinalEncoder

In [32]:
data7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [33]:
order=data7.sex.unique()

In [34]:
order

array(['female', 'male'], dtype=object)

In [35]:
ordinal = OrdinalEncoder(categories=[order])

In [36]:
encoded = ordinal.fit_transform(data7[['sex']])

In [37]:
encoded

array([[0.],
       [1.],
       [1.],
       ...,
       [0.],
       [0.],
       [0.]])

In [38]:
data7.sex = encoded

In [39]:
data7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,yes,southwest,16884.924
1,18,1.0,33.77,1,no,southeast,1725.5523
2,28,1.0,33.0,3,no,southeast,4449.462
3,33,1.0,22.705,0,no,northwest,21984.47061
4,32,1.0,28.88,0,no,northwest,3866.8552


#  Loop


In [40]:
for col in data8.columns:
    if is_numeric_dtype(data8[col]):
        continue
    else:
        order=data8[col].unique()
        ordinal = OrdinalEncoder(categories=[order])
        encoded = ordinal.fit_transform(data8[[col]])
        data8[col] = encoded
        

In [41]:
data8.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552


#  Replace Function

In [42]:
data8.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552


In [43]:
data8.sex = data8.sex.replace(['female','male'],[0,1])

In [44]:
data8.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552


In [45]:
data8.nunique()

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64