# Label Encoding
This process converts categorical data into numeric data. It is a necessary step when preparing data to feed to a machine (a computer program), since machines do not understand any language other than the language of numbers. Label encoding helps us train our model in a better manner, since the categorical features would otherwise have to be dropped.

In [1]:
#Importing libraries
import pandas as pd
import numpy as np

In [2]:
#Loading the data (the file is semicolon-delimited)
#`sep` is passed by keyword because pandas 2.0+ only accepts the path positionally
df = pd.read_csv('bank.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
#Listing the distinct categories of `marital` before encoding them
pd.unique(df['marital'])

array(['married', 'single', 'divorced'], dtype=object)

In [4]:
#Label Encoding: translate each marital status into its integer code
marital_codes = {'single': 0, 'married': 1, 'divorced': 2}
df['marital'] = df['marital'].map(marital_codes)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,1,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,1,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,0,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,1,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,1,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


It is easy to encode this way when the column has only a few unique values, but what if there are a lot of them?

In [5]:
#Listing the distinct categories of `job` before encoding them
pd.unique(df['job'])

array(['unemployed', 'services', 'management', 'blue-collar',
       'self-employed', 'technician', 'entrepreneur', 'admin.', 'student',
       'housemaid', 'retired', 'unknown'], dtype=object)

In [6]:
#Label Encoding `job` by hand: one integer code per category,
#with `unknown` turned into a missing value (np.nan)
job_codes = {
    'unknown': np.nan,
    'services': 0,
    'management': 1,
    'blue-collar': 2,
    'self-employed': 3,
    'technician': 4,
    'entrepreneur': 5,
    'admin.': 6,
    'student': 7,
    'housemaid': 8,
    'retired': 9,
    'unemployed': 10,
}
#Assign the mapped column back instead of calling replace(..., inplace=True)
#on the selection df['job']: with pandas Copy-on-Write the in-place call
#works on a temporary object and leaves `df` unchanged.
df['job'] = df['job'].map(job_codes)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10.0,1,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,0.0,1,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,1.0,0,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,1.0,1,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,2.0,1,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


When a column has many unique values you can still encode them using the same method, but it becomes a time-consuming process. In such cases we can use a class offered by `sklearn` known as `LabelEncoder` to encode our data.

#### Encoding with Label Encoder

In [7]:
#Importing the LabelEncoder class from scikit-learn
from sklearn.preprocessing import LabelEncoder
#Creating one encoder instance with the default arguments
le=LabelEncoder()

Let's replace all the months with numbers.

In [8]:
#Month doesn't have `unknown` values
pd.unique(df['month'])

array(['oct', 'may', 'apr', 'jun', 'feb', 'aug', 'jan', 'jul', 'nov',
       'sep', 'mar', 'dec'], dtype=object)

In [9]:
#Encoding the month labels and storing the integer codes back in the column
month_codes = le.fit_transform(df["month"])
df["month"] = month_codes
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10.0,1,primary,no,1787,no,no,cellular,19,10,79,1,-1,0,unknown,no
1,33,0.0,1,secondary,no,4789,yes,yes,cellular,11,8,220,1,339,4,failure,no
2,35,1.0,0,tertiary,no,1350,yes,no,cellular,16,0,185,1,330,1,failure,no
3,30,1.0,1,tertiary,no,1476,yes,yes,unknown,3,6,199,4,-1,0,unknown,no
4,59,2.0,1,secondary,no,0,yes,no,unknown,5,8,226,1,-1,0,unknown,no


In [10]:
#df["contact"]=le.fit_transform(df["contact"])
#df.head()
#Doing this would also replace `unknown`, i.e. the missing value, with some number
#We should take care of that, e.g. with an if-else block

In [11]:
#Replacing `unknown` with np.nan, i.e. marking it as a missing value
#One frame-level replace, assigned back to `df`: calling
#replace(..., inplace=True) on each df[column] selection acts on a temporary
#object under pandas Copy-on-Write and would leave `df` unchanged.
df = df.replace('unknown', np.nan)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10.0,1,primary,no,1787,no,no,cellular,19,10,79,1,-1,0,,no
1,33,0.0,1,secondary,no,4789,yes,yes,cellular,11,8,220,1,339,4,failure,no
2,35,1.0,0,tertiary,no,1350,yes,no,cellular,16,0,185,1,330,1,failure,no
3,30,1.0,1,tertiary,no,1476,yes,yes,,3,6,199,4,-1,0,,no
4,59,2.0,1,secondary,no,0,yes,no,,5,8,226,1,-1,0,,no


In [12]:
#Distinct values currently present in `poutcome`
pd.unique(df['poutcome'])

array([nan, 'failure', 'other', 'success'], dtype=object)

In [13]:
#Encoding `poutcome` and storing the integer codes back in the column
poutcome_codes = le.fit_transform(df['poutcome'])
df['poutcome'] = poutcome_codes
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10.0,1,primary,no,1787,no,no,cellular,19,10,79,1,-1,0,3,no
1,33,0.0,1,secondary,no,4789,yes,yes,cellular,11,8,220,1,339,4,0,no
2,35,1.0,0,tertiary,no,1350,yes,no,cellular,16,0,185,1,330,1,0,no
3,30,1.0,1,tertiary,no,1476,yes,yes,,3,6,199,4,-1,0,3,no
4,59,2.0,1,secondary,no,0,yes,no,,5,8,226,1,-1,0,3,no


In [14]:
#Distinct codes in `poutcome` after encoding
pd.unique(df['poutcome'])

array([3, 0, 1, 2])

In [15]:
#Distinct values currently present in `contact`
pd.unique(df['contact'])

array(['cellular', nan, 'telephone'], dtype=object)

In [16]:
#Encoding `contact` and storing the integer codes back in the column
contact_codes = le.fit_transform(df['contact'])
df['contact'] = contact_codes
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10.0,1,primary,no,1787,no,no,0,19,10,79,1,-1,0,3,no
1,33,0.0,1,secondary,no,4789,yes,yes,0,11,8,220,1,339,4,0,no
2,35,1.0,0,tertiary,no,1350,yes,no,0,16,0,185,1,330,1,0,no
3,30,1.0,1,tertiary,no,1476,yes,yes,2,3,6,199,4,-1,0,3,no
4,59,2.0,1,secondary,no,0,yes,no,2,5,8,226,1,-1,0,3,no


In [17]:
#Distinct codes in `contact` after encoding
pd.unique(df['contact'])

array([0, 2, 1])

You can see that it encodes the null values with the highest number; we will use this fact to turn the encoded null values back into `np.nan`.

In [18]:
#Loading the data again, this time with `unknown` replaced by np.nan (missing values)
#`sep` is passed by keyword because pandas 2.0+ only accepts the path positionally
df = pd.read_csv('bank.csv', sep=';')
#One frame-level replace, assigned back to `df`: calling
#replace(..., inplace=True) on each df[column] selection acts on a temporary
#object under pandas Copy-on-Write and would leave `df` unchanged.
df = df.replace('unknown', np.nan)
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,,3,jun,199,4,-1,0,,no
4,59,blue-collar,married,secondary,no,0,yes,no,,5,may,226,1,-1,0,,no


In [19]:
#Names of the columns that hold categorical (non-numeric) data
categorical_columns = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'poutcome',
    'y',
]

In [20]:
#categorical columns with null values
#isna() detects every missing value; the previous `np.nan in list(...)` test
#only matched when the stored NaN was the very same object as np.nan,
#because NaN never compares equal to itself.
null_cat_cols = [col for col in categorical_columns if df[col].isna().any()]
null_cat_cols

['job', 'education', 'contact', 'poutcome']

In [21]:
#Label-encoding every categorical column, re-fitting the encoder for each one
for column in categorical_columns:
    encoded_values = le.fit_transform(df[column])
    df[column] = encoded_values
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10,1,0,0,1787,0,0,0,19,10,79,1,-1,0,3,0
1,33,7,1,1,0,4789,1,1,0,11,8,220,1,339,4,0,0
2,35,4,2,2,0,1350,1,0,0,16,0,185,1,330,1,0,0
3,30,4,1,2,0,1476,1,1,2,3,6,199,4,-1,0,3,0
4,59,1,1,1,0,0,1,0,2,5,8,226,1,-1,0,3,0


Now we use the fact that null values are encoded last and therefore have the highest numeric value in their respective column.

In [22]:
#Turning the encoded null values back into np.nan.
#In each column that had nulls, the null was given the highest code,
#so that highest code is the one replaced.
for col in null_cat_cols:
    nan_code = df[col].max()
    #Assign the result back: replace(..., inplace=True) on the selection
    #df[col] acts on a temporary object under pandas Copy-on-Write.
    df[col] = df[col].replace(nan_code, np.nan)
#tadaa we got our missing values back
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10.0,1,0.0,0,1787,0,0,0.0,19,10,79,1,-1,0,,0
1,33,7.0,1,1.0,0,4789,1,1,0.0,11,8,220,1,339,4,0.0,0
2,35,4.0,2,2.0,0,1350,1,0,0.0,16,0,185,1,330,1,0.0,0
3,30,4.0,1,2.0,0,1476,1,1,,3,6,199,4,-1,0,,0
4,59,1.0,1,1.0,0,0,1,0,,5,8,226,1,-1,0,,0


**Note**: It is always better to fill the missing values before label encoding, which saves a lot of trouble and effort.

In [23]:
#Writing the encoded frame to disk (without the index) for the next session
output_path = 'bank_encoded.csv'
df.to_csv(output_path, index=False)

The End