# Preprocessing and Encoding for "Customers" dataset.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the customer records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="Customers.csv")

In [3]:
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25.0,high,male,single,no
1,25.0,high,male,married,no
2,35.0,high,male,single,yes
3,35.0,medium,male,single,yes
4,30.0,low,female,single,yes


In [4]:
# Number of missing entries in each column.
df.isna().sum()

age         2
income      0
gender      0
m_status    1
buys        0
dtype: int64

# Dealing with null values

In [5]:
# Average age over the rows that have one (missing values are skipped by default).
mean = df["age"].mean()

In [6]:
# Impute missing ages with the column mean computed above.
df["age"] = df["age"].fillna(value=mean)

In [7]:
# Re-count missing entries per column after filling age.
df.isna().sum()

age         0
income      0
gender      0
m_status    1
buys        0
dtype: int64

In [8]:
# Most frequent marital status; .mode() returns a Series because ties are possible.
mode = df["m_status"].mode()

In [9]:
mode

0    single
dtype: object

In [10]:
# Impute missing marital status with the most frequent value computed above.
# Use the computed mode rather than a hand-copied literal so the fill value
# stays correct if the data changes; .iloc[0] picks the first mode on ties.
df.m_status = df.m_status.fillna(mode.iloc[0])

In [11]:
# Re-count missing entries per column after filling m_status.
df.isna().sum()

age         0
income      0
gender      0
m_status    0
buys        0
dtype: int64

In [12]:
df.head()

Unnamed: 0,age,income,gender,m_status,buys
0,25.0,high,male,single,no
1,25.0,high,male,married,no
2,35.0,high,male,single,yes
3,35.0,medium,male,single,yes
4,30.0,low,female,single,yes


# One Hot Encoding for "income" column

In [13]:
# One-hot encode income; drop_first=True omits the first category's column,
# which is implied when all remaining indicators are 0.
# The dtype is pinned because the get_dummies default changed from uint8 to
# bool in pandas 2.0; pinning keeps the 0/1 indicators on any pandas version.
dummy_vari = pd.get_dummies(df.income, drop_first=True, dtype=np.uint8)

In [14]:
dummy_vari.head()

Unnamed: 0,low,medium
0,0,0
1,0,0
2,0,0
3,0,1
4,1,0


In [15]:
new_df = df.drop(columns=["income"])  # dropping the "income" column

In [16]:
new_df.head()

Unnamed: 0,age,gender,m_status,buys
0,25.0,male,single,no
1,25.0,male,married,no
2,35.0,male,single,yes
3,35.0,male,single,yes
4,30.0,female,single,yes


In [17]:
# Place the income indicator columns alongside the remaining features.
df = pd.concat(objs=[new_df, dummy_vari], axis="columns")
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25.0,male,single,no,0,0
1,25.0,male,married,no,0,0
2,35.0,male,single,yes,0,0
3,35.0,male,single,yes,0,1
4,30.0,female,single,yes,1,0


# Label Encoding for gender, m_status & buys columns

In [18]:
from sklearn.preprocessing import LabelEncoder

In [19]:
# Label-encode every non-numeric column; numeric columns are left untouched.
# select_dtypes(exclude="number") replaces the former `dtype == np.number`
# test, which relied on a deprecated NumPy conversion of the abstract
# np.number type and never matched integer dtypes, so integer columns were
# re-encoded as well.
for column in df.select_dtypes(exclude="number").columns:
    # A fresh encoder per column: each column gets its own class-to-integer mapping.
    df[column] = LabelEncoder().fit_transform(df[column])

  if df[column].dtype == np.number:


In [20]:
df.head()

Unnamed: 0,age,gender,m_status,buys,low,medium
0,25.0,1,1,0,0,0
1,25.0,1,0,0,0,0
2,35.0,1,1,1,0,0
3,35.0,1,1,1,0,1
4,30.0,0,1,1,1,0
