In [16]:
# Import required modules.
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

In [17]:
# Load the insurance dataset from CSV and preview its first five rows.
df = pd.read_csv("insurance.csv")
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
# 'region' is categorical data stored as text, and it is nominal (no natural order),
# hence one-hot encoding is the appropriate encoding for it.
# Count the number of rows per region.
df["region"].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [19]:
# Ordinal encoding, tried here only to show how the API is used:
# 'region' is nominal, so the integer codes carry no real ordering.
ordinal_encoder = OrdinalEncoder()
# fit_transform expects a 2-D input, hence the double brackets; the result is
# a 2-D float array with one column of category codes.
region_cat_ordinal = ordinal_encoder.fit_transform(df[['region']])
print(ordinal_encoder.categories_)
print(region_cat_ordinal[:10])
# Assign the flattened codes as a new column. On a fresh run this appends the
# column at the end (same result as inserting at position 7), and unlike
# df.insert it does not raise "already exists" when the cell is re-run.
df["region_cat_ordinal"] = region_cat_ordinal.ravel()

[array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]
[[3.]
 [2.]
 [2.]
 [1.]
 [1.]
 [2.]
 [2.]
 [1.]
 [0.]
 [1.]]


In [20]:
# Preview the first five rows of the dataframe, now including region_cat_ordinal.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region_cat_ordinal
0,19,female,27.9,0,yes,southwest,16884.924,3.0
1,18,male,33.77,1,no,southeast,1725.5523,2.0
2,28,male,33.0,3,no,southeast,4449.462,2.0
3,33,male,22.705,0,no,northwest,21984.47061,1.0
4,32,male,28.88,0,no,northwest,3866.8552,1.0


In [21]:
# Nominal encoding, i.e. one-hot encoding of the 'region' column.
hot_encoder = OneHotEncoder()
# The encoder needs 2-D input, so select the column with double brackets.
region_cat_one_hot = hot_encoder.fit_transform(df[["region"]])
print(hot_encoder.categories_)
# The result is a SciPy sparse matrix, so printing lists (row, column) value entries.
print(region_cat_one_hot)

[array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]
  (0, 3)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (5, 2)	1.0
  (6, 2)	1.0
  (7, 1)	1.0
  (8, 0)	1.0
  (9, 1)	1.0
  (10, 0)	1.0
  (11, 2)	1.0
  (12, 3)	1.0
  (13, 2)	1.0
  (14, 2)	1.0
  (15, 3)	1.0
  (16, 0)	1.0
  (17, 0)	1.0
  (18, 3)	1.0
  (19, 3)	1.0
  (20, 0)	1.0
  (21, 3)	1.0
  (22, 2)	1.0
  (23, 0)	1.0
  (24, 1)	1.0
  :	:
  (1313, 3)	1.0
  (1314, 1)	1.0
  (1315, 0)	1.0
  (1316, 3)	1.0
  (1317, 2)	1.0
  (1318, 0)	1.0
  (1319, 1)	1.0
  (1320, 1)	1.0
  (1321, 0)	1.0
  (1322, 2)	1.0
  (1323, 2)	1.0
  (1324, 1)	1.0
  (1325, 0)	1.0
  (1326, 0)	1.0
  (1327, 2)	1.0
  (1328, 0)	1.0
  (1329, 3)	1.0
  (1330, 2)	1.0
  (1331, 3)	1.0
  (1332, 3)	1.0
  (1333, 1)	1.0
  (1334, 0)	1.0
  (1335, 2)	1.0
  (1336, 3)	1.0
  (1337, 1)	1.0


In [22]:
# Convert the SciPy sparse matrix into a dense NumPy array.
region_cat_one_hot_encoding = region_cat_one_hot.toarray()

In [23]:
# Display the dense one-hot array: one row per record, one column per region category.
region_cat_one_hot_encoding

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]])

In [24]:
# Build a dataframe from the one-hot array. The column names are read from the
# fitted encoder rather than typed by hand, so they always match the column
# order of the encoded array.
labels = list(hot_encoder.categories_[0])
df_onehot = pd.DataFrame(region_cat_one_hot_encoding, columns=labels)

In [25]:
# Preview the first five rows of the one-hot dataframe.
df_onehot.head(n=5)

Unnamed: 0,northeast,northwest,southeast,southwest
0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0


In [26]:
# Used to append the one-hot encoded columns to the main dataframe.
# Each category of the fitted encoder becomes one column, named after the category.
# Column assignment appends new columns at the end, so no hard-coded insert
# position is needed, and re-running the cell overwrites the columns instead of
# raising "already exists" as df.insert would.
for i, category in enumerate(hot_encoder.categories_[0]):
    df[category] = region_cat_one_hot_encoding[:, i]


In [27]:
# Preview the first five rows of the dataframe with the encoded region columns added.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region_cat_ordinal,northeast,northwest,southeast,southwest
0,19,female,27.9,0,yes,southwest,16884.924,3.0,0.0,0.0,0.0,1.0
1,18,male,33.77,1,no,southeast,1725.5523,2.0,0.0,0.0,1.0,0.0
2,28,male,33.0,3,no,southeast,4449.462,2.0,0.0,0.0,1.0,0.0
3,33,male,22.705,0,no,northwest,21984.47061,1.0,0.0,1.0,0.0,0.0
4,32,male,28.88,0,no,northwest,3866.8552,1.0,0.0,1.0,0.0,0.0


4