In [14]:
import pandas as pd
from io import StringIO

import numpy as np

In [2]:
# Build a small in-memory CSV; its empty fields are parsed as missing values
csv_data = (
    'A,B,C,D,E\n'
    '1.0,2.0,3.0,,4.0\n'
    '5.0,6.0,,,8.0\n'
    '10.0,11.0,,,12.0'
)
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,,4.0
1,5.0,6.0,,,8.0
2,10.0,11.0,,,12.0


1. Hence, we see that the missing values show up as `NaN` entries.
2. For large datasets, spotting these entries by eye is impractical, so we can chain `isnull()` with `sum()` to get the number of missing entries in each column.

In [4]:
# Count the missing entries in each column (axis=0 sums down the rows)
df.isnull().sum(axis=0)

A    0
B    0
C    2
D    3
E    0
dtype: int64

In [3]:
# The data behind the DataFrame can be pulled out as a NumPy array;
# to_numpy() yields the same array contents as the `values` attribute here.
df.to_numpy()

array([[ 1.,  2.,  3., nan,  4.],
       [ 5.,  6., nan, nan,  8.],
       [10., 11., nan, nan, 12.]])

## 1. Eliminating samples with missing values

In [5]:
# axis='index' (same as axis=0) removes every row containing a missing value;
# axis='columns' (same as axis=1) removes every feature column containing one
print(df.dropna(axis='index'))
print(df.dropna(axis='columns'))

Empty DataFrame
Columns: [A, B, C, D, E]
Index: []
      A     B     E
0   1.0   2.0   4.0
1   5.0   6.0   8.0
2  10.0  11.0  12.0


In [6]:
# Other dropna() parameters that come in handy:
# how='all' only drops a row (or, with axis='columns', a column) whose values are all NaN
print(df.dropna(axis='index', how='all'))
print(df.dropna(axis='columns', how='all'))

      A     B    C   D     E
0   1.0   2.0  3.0 NaN   4.0
1   5.0   6.0  NaN NaN   8.0
2  10.0  11.0  NaN NaN  12.0
      A     B    C     E
0   1.0   2.0  3.0   4.0
1   5.0   6.0  NaN   8.0
2  10.0  11.0  NaN  12.0


In [7]:
# keep only the rows that have at least 4 non-NaN values
df.dropna(axis='index', thresh=4)

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,,4.0


In [8]:
# only drop the rows where NaN appears in specific columns (here: 'C')
df.dropna(axis='index', subset=['C'])

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,,4.0


Note :<br>
By removing too many rows, we may end up with too few samples, which will make a reliable analysis impossible. Or, if we remove too many feature columns, we run the risk of losing valuable information that our classifier needs to discriminate between classes.

## 2. Interpolation Techniques

### 2.1 Mean Imputation

In [15]:
# Mean imputation: replace each missing value with the mean of its column.
from sklearn.impute import SimpleImputer

# Column D has no observed values at all, so it has no mean and SimpleImputer
# would discard it from the result (giving 4 output columns for 5 inputs).
# Dropping all-NaN columns up front makes that explicit, and
# `imputed_columns` records which feature each output column belongs to.
observed = df.dropna(axis=1, how='all')
imputed_columns = list(observed.columns)

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(observed.values)                 # learn the per-column means
imputed_data = imr.transform(observed.values)  # fill the gaps with those means
imputed_data

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6.,  3.,  8.],
       [10., 11.,  3., 12.]])

Other strategies include `median` and `most_frequent` (the latter is useful for categorical feature values).

The `SimpleImputer` class belongs to the so-called **transformer** classes in scikit-learn, which are used for data transformation. The two essential methods of those estimators are fit and transform. The `fit` method is used to learn the parameters from the training data, and the `transform` method uses those parameters to transform the data.

## 3. Handling Categorical data
Categorical data can be further divided into 2 categories:
1. Ordinal features: categorical values that can be sorted or ordered.
2. Nominal features: categorical values that do not imply any order.

In [35]:
# Small example dataset with a nominal feature (color), an ordinal
# feature (size), a numeric feature (Price) and a class label
df = pd.DataFrame({
    'color': ['green', 'red', 'blue'],
    'size': ['M', 'L', 'XL'],
    'Price': [10.1, 13.5, 15.3],
    'class_label': ['class1', 'class2', 'class1'],
})
df.head()

Unnamed: 0,color,size,Price,class_label
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In the above data, `size` can be considered an **ordinal** feature and `color` a **nominal** feature.

In [36]:
# There is no convenient function that can automatically derive the correct
# order of the labels of our `size` feature, so the mapping is spelled out
# by hand to make sure the learning algorithm interprets the ordinal
# feature correctly.
size_mapping = dict(XL=3, L=2, M=1)
df = df.assign(size=df['size'].map(size_mapping))
df.head()

Unnamed: 0,color,size,Price,class_label
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [30]:
# Many machine learning libraries require class labels to be encoded as
# integer values. Most scikit-learn classifiers convert class labels to
# integers internally, but it is still considered good practice to provide
# them as integers.
class_mapping = {
    label: idx
    for idx, label in enumerate(pd.unique(df['class_label']))
}
class_mapping

{'class1': 0, 'class2': 1}

In [31]:
# Swap the string class labels for their integer codes from `class_mapping`
df = df.assign(class_label=df['class_label'].map(class_mapping))

In [32]:
# Show the first rows of the frame after the label mapping
df.head(n=5)

Unnamed: 0,color,size,Price,class_label
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [37]:
# Alternatively, one can use the `LabelEncoder` class directly
from sklearn.preprocessing import LabelEncoder

# An earlier cell already replaced `class_label` with the integers from
# `class_mapping`. Restore the original string labels first so that the
# encoder is demonstrated on them when the notebook is run top to bottom.
# Values that are not integer codes (i.e. already strings) are left as they
# are, so re-running this cell is safe.
inv_class_mapping = {idx: label for label, idx in class_mapping.items()}
df['class_label'] = df['class_label'].map(
    lambda value: inv_class_mapping.get(value, value)
)

class_le = LabelEncoder()
y = class_le.fit_transform(df['class_label'].values)
y

array([0, 1, 0])

In [38]:
# Map the integer codes in `y` back to the labels the encoder was fitted on
class_le.inverse_transform(y)

array(['class1', 'class2', 'class1'], dtype=object)

### One-hot encoding on nominal features

In [40]:
# Feature matrix as a NumPy array (the class label column is left out)
X = df.loc[:, ['color', 'size', 'Price']].values
X

array([['green', 1, 10.1],
       ['red', 2, 13.5],
       ['blue', 3, 15.3]], dtype=object)

In [44]:
# One-hot encode the nominal `color` column. get_dummies() only converts
# string/categorical columns, so the numeric `Price` and `size` columns
# pass through unchanged.
# dtype=int keeps the indicator columns as 0/1; pandas >= 2.0 would
# otherwise return booleans (True/False).
pd.get_dummies(df[['Price', 'color', 'size']], dtype=int)

Unnamed: 0,Price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0
