In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from pandas.core.dtypes.common import is_numeric_dtype

In [2]:
# Load the insurance dataset from the Excel workbook into the raw frame `df`.
df = pd.read_excel(io="insurance.xlsx")

In [3]:
# Element-wise missing-value mask (`isna` is the canonical name; `isnull` is its alias).
df.isna()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
1333,False,False,False,False,False,False,False
1334,False,False,False,False,False,False,False
1335,False,False,False,False,False,False,False
1336,False,False,False,False,False,False,False


In [4]:
# Number of missing entries in each column.
df.isna().sum()

age         0
gender      0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

# Handle Null Values: there are no null values

In [5]:
# Four independent copies of the raw frame, one per encoding technique shown below.
df1, df2, df3, df4 = (df.copy() for _ in range(4))


# Encode using the replace method

In [6]:
# Fresh deep copy of the raw frame for the replace-based encoding.
df1 = df.copy(deep=True)

In [7]:
# Map every category label to an integer code with an explicit dict.
# Series.map yields an integer column directly, whereas replace() from strings to
# ints on an object column relies on implicit downcasting that pandas 2.2+
# deprecates (FutureWarning).
# The source values are read from the untouched raw frame `df`, so re-running this
# cell always gives the same result. Any label missing from a mapping becomes NaN.
df1["gender"] = df["gender"].map({"female": 0, "male": 1})
df1["smoker"] = df["smoker"].map({"no": 0, "yes": 1})
df1["region"] = df["region"].map(
    {"southwest": 0, "southeast": 1, "northwest": 2, "northeast": 3}
)

In [8]:
# Preview the first five encoded rows.
df1.head(n=5)

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


# Categorical Data Columns

In [10]:
# Show the column labels of the raw frame.
df.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [11]:
# Collect the names of all non-numeric columns of the raw frame.
cols = [name for name in df.columns if not is_numeric_dtype(df[name])]

In [12]:
# Display the collected non-numeric column names.
cols

['gender', 'smoker', 'region']

# Label Encoding Using Loop

In [16]:
df2 = df.copy()
# Fit a separate LabelEncoder for every column and keep each fitted instance,
# so the learned label -> code mapping (`classes_`) of every column stays
# available, e.g. for `inverse_transform`. Re-fitting one shared encoder would
# retain only the classes of the last column processed.
label_encoders = {}
for column in cols:
    le = LabelEncoder()
    df2[column] = le.fit_transform(df2[column])
    label_encoders[column] = le

df2.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# One Hot Encoding Using Loop

In [18]:
df3 = df.copy()
for column in cols:
    # One indicator column per category, dropping the first to avoid a redundant
    # column. `columns=` is not passed: get_dummies only uses it for DataFrame
    # input, and the previous value was an uncalled bound method.
    # The indicator columns are named after the bare category values (no prefix).
    dummy = pd.get_dummies(df3[column], drop_first=True)
    # Replace the original categorical column with its indicator columns.
    df3 = pd.concat([df3.drop(columns=column), dummy], axis=1)

df3.head()

Unnamed: 0,age,bmi,children,charges,male,yes,northwest,southeast,southwest
0,19,27.9,0,16884.924,False,True,False,False,True
1,18,33.77,1,1725.5523,True,False,False,True,False
2,28,33.0,3,4449.462,True,False,False,True,False
3,33,22.705,0,21984.47061,True,False,True,False,False
4,32,28.88,0,3866.8552,True,False,True,False,False


# Ordinal Encoding Using Loop

In [19]:
df4 = df.copy()
for column in cols:
    # Categories are passed in order of first appearance in the column, so the
    # resulting integer codes follow that order rather than any natural ranking.
    uc = df4[column].unique()
    od = OrdinalEncoder(categories=[uc])
    encoded = od.fit_transform(df4[[column]])
    # Reuse df4's index so the axis=1 concat aligns rows correctly even when
    # the frame's index is not the default 0..n-1 range.
    encoded = pd.DataFrame(encoded, columns=[column], index=df4.index)
    # Swap the original column for its encoded version (appended as the last column).
    df4 = pd.concat([df4.drop(columns=column), encoded], axis=1)

df4.head()

Unnamed: 0,age,bmi,children,charges,gender,smoker,region
0,19,27.9,0,16884.924,0.0,0.0,0.0
1,18,33.77,1,1725.5523,1.0,1.0,1.0
2,28,33.0,3,4449.462,1.0,1.0,1.0
3,33,22.705,0,21984.47061,1.0,1.0,2.0
4,32,28.88,0,3866.8552,1.0,1.0,2.0
