In [1]:
import numpy as np

# Nominal feature encoding

In [2]:
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [3]:
# Single nominal feature: one city name per sample, stored as a column vector
# (one row per sample, one column).
feature = np.array([
    [city]
    for city in ('Los Angeles', 'San Jose', 'San Francisco', 'Palo Alto', 'Mountain View')
])

In [4]:
# One-hot encode the single city column: one indicator column per distinct city.
# In the output below the columns follow the alphabetically sorted city names.
LabelBinarizer().fit_transform(feature)

array([[1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0]])

In [5]:
# Multi-label feature: every sample carries two labels at once (city, company).
a = np.array(list(zip(
    ('Los Angeles', 'San Jose', 'San Francisco', 'Palo Alto', 'Mountain View'),
    ('Snap', 'Uber', 'Google', 'OpenAI', 'Google'),
)))

In [6]:
# Each row holds several labels (a city and a company), so binarize per label:
# one indicator column per distinct label, with a 1 for every label in the row.
MultiLabelBinarizer().fit_transform(a)

array([[0, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0]])

In [7]:
import pandas as pd

In [8]:
# Show the single-column city array again before encoding it with pandas.
feature

array([['Los Angeles'],
       ['San Jose'],
       ['San Francisco'],
       ['Palo Alto'],
       ['Mountain View']], dtype='<U13')

In [9]:
# One-hot encode the city names with pandas. drop_first=True drops the first
# category ("Los Angeles"), so that city becomes the all-zero baseline row.
# dtype=int pins 0/1 integer indicators: pandas >= 2.0 would otherwise return
# booleans (True/False) instead of the 0/1 values shown in the output.
pd.get_dummies(feature[:, 0], drop_first=True, dtype=int)

Unnamed: 0,Mountain View,Palo Alto,San Francisco,San Jose
0,0,0,0,0
1,0,0,0,1
2,0,0,1,0
3,0,1,0,0
4,1,0,0,0


# Ordinal feature encoding

In [10]:
# Toy ordinal feature: a single "Score" column whose labels have a natural order.
dataframe = pd.Series(
    ["Low", "Low", "Medium", "Medium", "High"],
    name="Score",
).to_frame()

In [11]:
# Display the frame to check the raw ordinal labels.
dataframe

Unnamed: 0,Score
0,Low
1,Low
2,Medium
3,Medium
4,High


In [12]:
# Explicit label -> rank mapping; the integers encode the order Low < Medium < High.
scale_mapper = dict(Low=1, Medium=2, High=3)

In [13]:
# Encode the ordinal labels as integers via the explicit mapping.
# Series.map is used instead of Series.replace: replacing strings with numbers
# relies on replace's implicit object -> int64 downcast, which pandas 2.2
# deprecates (FutureWarning). map yields the integer Series directly.
# Note: a label missing from scale_mapper would come back as NaN here.
dataframe['Score'].map(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

# Imputing missing data

In [14]:
from sklearn.impute import SimpleImputer

In [15]:
# Small numeric matrix (4 samples x 2 features) with one missing value
# in the first column, used to demonstrate imputation.
a = np.column_stack((
    [2, 8, np.nan, 6],  # first feature, contains the NaN
    [3, 6, 9, 3],       # second feature, complete
))

In [16]:
# Imputer that fills each missing entry with the mean of its column.
mean_imputer = SimpleImputer(strategy='mean')

In [17]:
# Learn the column means from `a` and fill its NaN in one step; the missing
# entry becomes the mean of the observed values in that column.
mean_imputer.fit_transform(a)

array([[2.        , 3.        ],
       [8.        , 6.        ],
       [5.33333333, 9.        ],
       [6.        , 3.        ]])

In [18]:
# Imputer that fills each missing entry with the most frequent value of its column.
imputer = SimpleImputer(strategy='most_frequent')

In [19]:
# Fill the NaN with the column's most frequent value. Every observed value in
# column 0 occurs exactly once here, and the output shows 2.0 (the smallest) used.
imputer.fit_transform(a)

array([[2., 3.],
       [8., 6.],
       [2., 9.],
       [6., 3.]])

# Handling imbalanced classes

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [21]:
# Load the bundled iris dataset; its .data and .target attributes are used below.
iris = load_iris()

In [22]:
# Pull the feature matrix and the class labels out of the dataset object.
features, target = iris.data, iris.target

In [23]:
# Drop the first 40 samples to make the data imbalanced: only 10 rows of
# class 0 remain, against 100 rows of the other classes.
# Slice from `iris` directly (not from the already-sliced variables) so that
# re-running this cell does not discard another 40 rows each time.
features = iris.data[40:, :]
target = iris.target[40:]

In [24]:
# Inspect the labels that remain after dropping the first 40 samples.
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [25]:
# Collapse to a binary problem: label 0 stays 0, every other label becomes 1.
target = (target != 0).astype(int)

In [26]:
# Confirm the labels are now binary: a few 0s against many 1s.
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### To tackle imbalance, generally assign a weight to each class label<br>
### rare (minority) labels get higher weights and abundant (majority) labels get lower weights

In [27]:
# Per-class weights keyed by class label: the rare class 0 is weighted
# more heavily than the abundant class 1.
weights = {
    0: 0.9,
    1: 0.1,
}

In [28]:
# Random forest that applies the explicit per-class weights defined above.
# random_state is fixed so the forest's randomness is reproducible across runs.
forest = RandomForestClassifier(class_weight=weights, random_state=0)

In [29]:
# OR: let scikit-learn derive the weights itself; 'balanced' weights each class
# inversely proportional to its frequency in the training labels.
# random_state is fixed so the forest's randomness is reproducible across runs.
forest = RandomForestClassifier(class_weight='balanced', random_state=0)

In [30]:
# Other options: rebalance the data itself, e.g. upsample the minority class with sklearn (sklearn.utils.resample) or imblearn