In [1]:
import pandas as pd

In [2]:
import numpy as np

In [22]:
# Load the customer table from customers.csv (path is relative to the
# notebook's working directory) and preview the first five rows.
df  = pd.read_csv('customers.csv')
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [4]:
# Missing-value count per column; all zeros means no imputation is needed.
df.isna().sum()

age         0
income      0
gender      0
m_status    0
buys        0
dtype: int64

# Encoding

In [62]:
# List the distinct income levels before deciding how to encode them.
df['income'].unique()

array(['high', 'medium', 'low'], dtype=object)

In [63]:
# Manual ordinal encoding: low < medium < high  ->  1 < 2 < 3.
# An explicit dict with .map keeps each label next to its code and yields an
# integer column directly. Series.replace from strings to ints relies on
# silent object->int downcasting, which newer pandas releases deprecate.
# Note: any label missing from the dict becomes NaN instead of passing
# through unchanged; the three keys below cover every value in the column.
df['income'] = df['income'].map({'high': 3, 'medium': 2, 'low': 1})

In [64]:
# Check the result: income should now be integer codes.
df['income'].head()

0    3
1    3
2    3
3    2
4    1
Name: income, dtype: int64

# Label Encoding

In [65]:
from sklearn.preprocessing import LabelEncoder

In [66]:
# Preview the frame: income holds the manual codes, other columns are still text.
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,3,male,single,no
1,25,3,male,married,no
2,35,3,male,single,yes
3,35,2,male,single,yes
4,30,1,female,single,yes


In [67]:
# Reload the raw CSV so income is text again before label-encoding it.
df  = pd.read_csv('customers.csv')
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [68]:
# Encoder instance used for the income column in the next cell.
label = LabelEncoder()

In [69]:
# Fit on the income labels and overwrite the column with the integer codes.
# The codes follow the sorted label order, not the business order:
# high -> 0, low -> 1, medium -> 2.
df['income'] = label.fit_transform(df['income'])

In [70]:
# income is now integer-coded (high -> 0, low -> 1, medium -> 2); the other
# text columns are untouched so far.
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,0,male,single,no
1,25,0,male,married,no
2,35,0,male,single,yes
3,35,2,male,single,yes
4,30,1,female,single,yes


In [71]:
# Label-encode every non-numeric column in place; numeric columns are skipped.
#
# The previous guard `df[column].dtype == np.number` did not test "is numeric":
# np.number is an abstract scalar class, so the comparison triggers a NumPy
# warning and never matches integer columns, which meant `age` was
# label-encoded as well. select_dtypes(exclude=np.number) picks exactly the
# non-numeric columns. The column list is computed once, before any column is
# overwritten.
for column in df.select_dtypes(exclude=np.number).columns:
    # A fresh encoder per column: each column has its own label vocabulary.
    df[column] = LabelEncoder().fit_transform(df[column])

  if df[column].dtype == np.number:


In [73]:
# Spot-check one encoded column after the loop.
df['income'].head()

0    0
1    0
2    0
3    2
4    1
Name: income, dtype: int64

# One Hot Encoding

In [74]:
# Reload the raw CSV so the one-hot section starts from un-encoded data.
df  = pd.read_csv('customers.csv')
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [75]:
# One-hot encode income. drop_first=True removes the first level ('high'),
# which becomes the implicit baseline (both remaining indicators are 0).
# dtype=int pins the indicators to 0/1 integers; without it, pandas >= 2.0
# returns True/False booleans instead of the 0/1 shown in this notebook.
dummy_variables = pd.get_dummies(df['income'], drop_first=True, dtype=int)

In [76]:
# Two indicator columns remain (low, medium); a 'high' row shows as 0 / 0.
dummy_variables.head()

Unnamed: 0,low,medium
0,0,0
1,0,0
2,0,0
3,0,1
4,1,0


In [77]:
# get_dummies returned a separate frame; df itself still has the text income column.
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [78]:
# Copy of df without the text income column; the indicators will replace it.
new_df = df.drop(columns='income')

In [79]:
# Confirm income is gone and the remaining columns are unchanged.
new_df.head()

Unnamed: 0,age,gender,m_status,buys
0,25,male,single,no
1,25,male,married,no
2,35,male,single,yes
3,35,male,single,yes
4,30,female,single,yes


In [80]:
# Place the indicator columns next to the remaining features (rows are
# matched on index labels).
df = pd.concat([new_df, dummy_variables], axis='columns')

In [81]:
# Final one-hot result: original features plus the low / medium indicators.
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25,male,single,no,0,0
1,25,male,married,no,0,0
2,35,male,single,yes,0,0
3,35,male,single,yes,0,1
4,30,female,single,yes,1,0


# Ordinal Encoder

In [84]:
# Reload the raw CSV so the ordinal-encoder section starts from un-encoded data.
df  = pd.read_csv('customers.csv')
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [85]:
from sklearn.preprocessing import OrdinalEncoder

In [86]:
# Distinct income labels; these must all appear in the encoder's category list.
df['income'].unique()

array(['high', 'medium', 'low'], dtype=object)

In [88]:
# Position in this list becomes the encoded value: high -> 0, medium -> 1, low -> 2.
# NOTE(review): this order is descending, whereas the manual mapping earlier in
# the notebook used low=1 < medium=2 < high=3 -- confirm which direction is intended.
income_list = ['high', 'medium', 'low']

In [89]:
# `categories` takes one list per encoded column; only income is encoded here,
# hence the single inner list.
ordinal = OrdinalEncoder(categories=[income_list])

In [90]:
# Double brackets select a one-column DataFrame rather than a Series, so the
# encoder receives 2-D input.
encoded_values = ordinal.fit_transform(df[['income']]) # 2-D input: (n_samples, n_features)

In [106]:
# Wrap the encoder's 2-D output in a one-column frame named income.
# Reusing df's index matters because the later pd.concat(axis=1) aligns rows
# on index labels: with a fresh 0..n-1 index the codes would be paired with
# the wrong rows (or NaN-padded) if df were ever filtered or re-indexed.
new_income = pd.DataFrame(encoded_values, columns=['income'], index=df.index)

In [107]:
# Copy of df without the text income column; the encoded column will replace it.
new_df = df.drop(columns='income')

In [108]:
# Show the whole frame (10 rows) without the income column.
new_df

Unnamed: 0,age,gender,m_status,buys
0,25,male,single,no
1,25,male,married,no
2,35,male,single,yes
3,35,male,single,yes
4,30,female,single,yes
5,32,female,single,no
6,22,female,married,yes
7,22,male,married,no
8,25,female,single,yes
9,35,female,married,yes


In [109]:
# Show every encoded income value; the codes are floats (0.0 / 1.0 / 2.0).
new_income

Unnamed: 0,income
0,0.0
1,0.0
2,0.0
3,1.0
4,2.0
5,2.0
6,2.0
7,1.0
8,2.0
9,1.0


In [110]:
# Attach the encoded income column to the remaining features, side by side.
df = pd.concat([new_df, new_income], axis='columns')

In [111]:
# Final ordinal result: income is now the last column, holding the float codes.
df.head()

Unnamed: 0,age,gender,m_status,buys,income
0,25,male,single,no,0.0
1,25,male,married,no,0.0
2,35,male,single,yes,0.0
3,35,male,single,yes,1.0
4,30,female,single,yes,2.0


# Hashing Encoder

In [96]:
# Reload the raw CSV so the hashing section starts from un-encoded data.
df  = pd.read_csv('customers.csv')
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25,high,male,single,no
1,25,high,male,married,no
2,35,high,male,single,yes
3,35,medium,male,single,yes
4,30,low,female,single,yes


In [97]:
# Three distinct income labels will be hashed below.
df['income'].unique()

array(['high', 'medium', 'low'], dtype=object)

In [98]:
import category_encoders as ce

In [101]:
# Hash the income column into 3 output columns (col_0 .. col_2 in the result).
# NOTE(review): in the output below, 'medium' and 'low' both land in col_0
# (a hash collision), so the two levels become indistinguishable -- consider a
# larger n_components if they must stay separate.
encoders = ce.HashingEncoder(cols='income',n_components=3)

In [102]:
# Fit and transform in one step. The result is only displayed, not assigned:
# the hashed columns come first and the original income column is gone.
encoders.fit_transform(df)

Unnamed: 0,col_0,col_1,col_2,age,gender,m_status,buys
0,0,1,0,25,male,single,no
1,0,1,0,25,male,married,no
2,0,1,0,35,male,single,yes
3,1,0,0,35,male,single,yes
4,1,0,0,30,female,single,yes
5,1,0,0,32,female,single,no
6,1,0,0,22,female,married,yes
7,1,0,0,22,male,married,no
8,1,0,0,25,female,single,yes
9,1,0,0,35,female,married,yes
