In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd

## Handling Null Values

In [2]:
# Build a small example frame. The last record deliberately omits its weight,
# so pandas pads that cell with a missing value (NaN).
data = [
    ['Steve', 25, 55],
    ['Linus', 28, 60],
    ['Elon', 26],
]
column_names = ['Name', 'Age', 'Weight']
df = pd.DataFrame(data, columns=column_names)

In [4]:
# Display the whole dataframe; the third row has no Weight, so that cell shows up as missing (NaN)
df

Unnamed: 0,Name,Age,Weight
0,Steve,25,55.0
1,Linus,28,60.0
2,Elon,26,


In [32]:
# Element-wise missing-value mask: True wherever a cell holds a missing value
df.isna()

Unnamed: 0,Name,Age,Weight
0,False,False,False
1,False,False,False
2,False,False,True


In [5]:
# Number of missing entries in each column (True counts as 1 when the mask is summed)
df.isna().sum()

Name      0
Age       0
Weight    1
dtype: int64

In [6]:
# Remove every row that contains at least one missing value.
# NOTE : dropna() returns a new dataframe; df itself stays unchanged unless the result
# is assigned back (or inplace=True is passed).
df.dropna(axis=0, how='any')

Unnamed: 0,Name,Age,Weight
0,Steve,25,55.0
1,Linus,28,60.0


In [7]:
# The dropna() call above returned a new frame that was not assigned back (and inplace=True
# was not set), so df still contains all three rows, including the one with the missing Weight
df

Unnamed: 0,Name,Age,Weight
0,Steve,25,55.0
1,Linus,28,60.0
2,Elon,26,


### Imputation

#### Using the SimpleImputer class for Imputation

In [8]:
from sklearn.impute import SimpleImputer

In [9]:
# Mean imputation: learn the mean of the observed weights, then substitute it for the NaN entries.
# The double brackets keep the selection two-dimensional, as scikit-learn estimators expect.
weight_column = df[['Weight']]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(weight_column)
df['Weight'] = imputer.transform(weight_column)

In [10]:
# Show df after imputation: the missing Weight is now the mean of the observed weights, (55 + 60) / 2 = 57.5
df

Unnamed: 0,Name,Age,Weight
0,Steve,25,55.0
1,Linus,28,60.0
2,Elon,26,57.5


#### Using the fillna() method

In [11]:
# Fill missing values in each numeric column with that column's mean.
# numeric_only=True keeps the string column 'Name' out of the mean computation: without it
# pandas emits a FutureWarning on older versions and raises a TypeError on pandas >= 2.0.
# fillna() returns a new dataframe; df itself is not modified by this cell.
df.fillna(df.mean(numeric_only=True))

  df.fillna(df.mean())


Unnamed: 0,Name,Age,Weight
0,Steve,25,55.0
1,Linus,28,60.0
2,Elon,26,57.5


## Standardization

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
# Age and Weight are the two numerical columns; standardizing them puts both on the same scale
numeric_features = df[['Age', 'Weight']]
std = StandardScaler()
X = std.fit_transform(numeric_features)

In [14]:
# Standardized values as a NumPy array: one row per person, columns in the order [Age, Weight]
X

array([[-1.06904497, -1.22474487],
       [ 1.33630621,  1.22474487],
       [-0.26726124,  0.        ]])

## Handling Categorical Variables

### Ordinal Categorical Variables

In [15]:
# Small example frame with categorical features (color, size), a numeric feature (price)
# and a class label; column names are supplied directly to the constructor
df_cat = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class1'],
        ['blue', 'L', 20.1, 'class2'],
        ['white', 'M', 30.1, 'class1'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)

In [16]:
# Display the raw example frame before any encoding is applied
df_cat

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,blue,L,20.1,class2
2,white,M,30.1,class1


#### Using the map function

In [17]:
# Ordinal encoding: map each size label to an integer (M -> 1, L -> 2).
# Labels that are absent from the mapping would become NaN.
size_mapping = dict(M=1, L=2)
encoded_sizes = df_cat['size'].map(size_mapping)
df_cat['size'] = encoded_sizes

In [19]:
# Display the frame after the size mapping.
# NOTE(review): the saved output below already shows classlabel as 0/1, which is only produced by
# the LabelEncoder cell further down - the cells appear to have been executed out of order;
# re-run the notebook top to bottom to refresh this output.
df_cat

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,blue,2,20.1,1
2,white,1,30.1,0


#### Using Label Encoder

In [21]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integers; the fitted encoder is kept in class_le
class_le = LabelEncoder()
raw_labels = df_cat['classlabel'].values
df_cat['classlabel'] = class_le.fit_transform(raw_labels)

### Nominal Categorical Variables

In [22]:
# Using One-Hot Encoding
df_cat = pd.get_dummies(df_cat[['color','size','price']])

In [23]:
# Display the one-hot encoded frame: 'color' is replaced by one indicator column per colour value
df_cat

Unnamed: 0,size,price,color_blue,color_green,color_white
0,1,10.1,0,1,0
1,2,20.1,1,0,0
2,1,30.1,0,0,1


## Multicollinearity

In [93]:
# Avoiding Multicollinearity
df_cat = pd.get_dummies(df_cat[['color','size','price']], drop_first=True)

In [94]:
# Display the encoded frame with the first colour indicator (color_blue) left out
df_cat

Unnamed: 0,size,price,color_green,color_white
0,1,10.1,1,0
1,2,20.1,0,0
2,1,30.1,0,1
