In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [2]:
# encoding means where we convert categorical data to numerical data
dataset = pd.read_csv('../loan.csv')
dataset.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [5]:
dataset.isnull().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [4]:
dataset['Gender'].fillna(dataset['Gender'].mode()[0], inplace=True)
dataset['Married'].fillna(dataset['Married'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Married'].fillna(dataset['Married'].mode()[0], inplace=True)


In [None]:
# one hot encoder
en_data = dataset[["Gender", "Married"]]

In [15]:
pd.get_dummies(en_data).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Gender_Female  614 non-null    bool 
 1   Gender_Male    614 non-null    bool 
 2   Married_No     614 non-null    bool 
 3   Married_Yes    614 non-null    bool 
dtypes: bool(4)
memory usage: 2.5 KB


In [17]:
ohe = OneHotEncoder()
ar = ohe.fit_transform(en_data).toarray()

In [18]:
pd.DataFrame(ar, columns=['Gender_Female', 'Gender_Male', 'Married_No', 'Married_Yes'])

Unnamed: 0,Gender_Female,Gender_Male,Married_No,Married_Yes
0,0.0,1.0,1.0,0.0
1,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,1.0
4,0.0,1.0,1.0,0.0
...,...,...,...,...
609,1.0,0.0,1.0,0.0
610,0.0,1.0,0.0,1.0
611,0.0,1.0,0.0,1.0
612,0.0,1.0,0.0,1.0


In [26]:
# Label Encoding
df = pd.DataFrame({"name": ["red", "black", "white", "green", "black", "blue", "yelow", "red"]})
df

Unnamed: 0,name
0,red
1,black
2,white
3,green
4,black
5,blue
6,yelow
7,red


In [27]:
le = LabelEncoder()
df["en_name"] = le.fit_transform(df['name'])

In [28]:
df

Unnamed: 0,name,en_name
0,red,3
1,black,0
2,white,4
3,green,2
4,black,0
5,blue,1
6,yelow,5
7,red,3


In [30]:
# ordinal encoding
df = pd.DataFrame({"Size": ['s', 'm', 'xl', 'l', 's', 'm', 's', 'm', 'xl', 'l', 'm']})
df

Unnamed: 0,Size
0,s
1,m
2,xl
3,l
4,s
5,m
6,s
7,m
8,xl
9,l


In [31]:
ord_data = [['s', 'm', 'l', 'xl']]

In [32]:
from sklearn.preprocessing import OrdinalEncoder

In [35]:
oe = OrdinalEncoder(categories=ord_data)
oe.fit(df[["Size"]])

0,1,2
,categories,"[['s', 'm', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [37]:
df['size_enc'] = oe.transform(df[["Size"]])

In [38]:
df

Unnamed: 0,Size,size_enc
0,s,0.0
1,m,1.0
2,xl,3.0
3,l,2.0
4,s,0.0
5,m,1.0
6,s,0.0
7,m,1.0
8,xl,3.0
9,l,2.0


In [39]:
ord_data1 = {"s": 0, "m": 1, "l": 2, "xl": 3}

In [40]:
df["Size_map_enc"] = df['Size'].map(ord_data1)
df

Unnamed: 0,Size,size_enc,Size_map_enc
0,s,0.0,0
1,m,1.0,1
2,xl,3.0,3
3,l,2.0,2
4,s,0.0,0
5,m,1.0,1
6,s,0.0,0
7,m,1.0,1
8,xl,3.0,3
9,l,2.0,2
