In [1]:
#Encoding Nominal Categorical Features
#You have a feature with nominal classes that have no intrinsic ordering (e.g. apple, pear, banana).
#Solution
#One-hot encoding the feature using scikit-learn's LabelBinarizer:
#Import libraries
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [2]:
# Build the feature: a single column of state names, one observation per row
feature = np.array([
    'Texas', 'California', 'Texas', 'Delaware',
    'Florida', 'Georgia', 'Alabama', 'Texas',
]).reshape(-1, 1)

In [3]:
# Instantiate the one-hot encoder and let it learn the classes present in the feature
one_hot = LabelBinarizer().fit(feature)
# Encode every observation as a one-hot row
one_hot.transform(feature)

array([[0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1]])

In [4]:
#We can use the classes_ attribute of the fitted encoder to output the classes:
#View feature classes
one_hot.classes_

array(['Alabama', 'California', 'Delaware', 'Florida', 'Georgia', 'Texas'],
      dtype='<U10')

In [5]:
#If we want to reverse the one-hot encoding we can use inverse_transform:
#Reverse one-hot encoding (encode the feature, then map the rows back to their labels)
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Florida', 'Georgia',
       'Alabama', 'Texas'], dtype='<U10')

In [6]:
#We can even use pandas to one-hot encode the feature:
#Import library
import pandas as pd
#Create dummy variables from feature.
#dtype=int keeps the indicator columns as 0/1 integers on every pandas version
#(pandas >= 2.0 would otherwise return True/False booleans by default).
pd.get_dummies(feature[:,0], dtype=int)

Unnamed: 0,Alabama,California,Delaware,Florida,Georgia,Texas
0,0,0,0,0,0,1
1,0,1,0,0,0,0
2,0,0,0,0,0,1
3,0,0,1,0,0,0
4,0,0,0,1,0,0
5,0,0,0,0,1,0
6,1,0,0,0,0,0
7,0,0,0,0,0,1


In [10]:
# scikit-learn can also encode observations that each list several classes at once.
# Multiclass feature: one tuple of class labels per observation
multiclass_feature = [
    ('Texas', 'Florida'),
    ('California', 'Texas'),
    ('Delaware', 'Arizona'),
    ('Georgia', 'Alabama'),
    ('Texas', 'South Dacota'),
    ('Ohio', 'Louisiana'),
]
# Build the multi-label encoder and let it learn the set of classes
one_hot_multiclass = MultiLabelBinarizer().fit(multiclass_feature)
# One-hot encode the multiclass feature (one indicator column per class)
one_hot_multiclass.transform(multiclass_feature)

array([[0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0]])

In [11]:
#Once again, we can see the classes with the classes_ attribute of the fitted encoder:
one_hot_multiclass.classes_

array(['Alabama', 'Arizona', 'California', 'Delaware', 'Florida',
       'Georgia', 'Louisiana', 'Ohio', 'South Dacota', 'Texas'],
      dtype=object)

In [13]:
#Encoding Ordinal Categorical Features
#Problem
#You have an ordinal categorical feature (e.g. high, medium, low).
#Solution
#Use the pandas Series map method to transform string labels to numerical equivalents:
#Load library
import pandas as pd
#Create features
dataframe = pd.DataFrame({'Score': ['Low', 'Low', 'Medium', 'Medium', 'High']})
#Create mapper (label -> rank on the ordinal scale)
scale_mapper = {"Low": 1, "Medium": 2, "High": 3}
#Encode feature values with the scale.
#Series.map is used rather than Series.replace: replace only produced an integer
#column through silent downcasting, which pandas 2.2 deprecates.
#Note that map yields NaN for any label that is missing from the mapper.
score_encoded = dataframe["Score"].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [15]:
###Discussion
#The mapper fixes the order of the classes: here an extra class,
#"Barely More Than Medium", is placed between "Medium" and "High".
dataframe = pd.DataFrame({"Score": ["Low",
                                    "Low",
                                    "Medium",
                                    "Medium",
                                    "High",
                                    "Barely More Than Medium"]})
scale_mapper = {"Low": 1,
                "Medium": 2,
                "Barely More Than Medium": 3,
                "High": 4}
#Series.map is used rather than Series.replace: replace only produced an integer
#column through silent downcasting, which pandas 2.2 deprecates.
#Note that map yields NaN for any label that is missing from the mapper.
score_encoded = dataframe["Score"].map(scale_mapper)
score_encoded

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [16]:
##The numeric values also set the distance between classes: 2.1 keeps
##"Barely More Than Medium" close to "Medium" instead of a full step away.
scale_mapper = {"Low": 1, "Medium": 2, "Barely More Than Medium": 2.1,
                "High": 3}
#Series.map is used rather than Series.replace so the numeric (float) dtype of the
#result does not depend on replace's deprecated downcasting behavior.
#Note that map yields NaN for any label that is missing from the mapper.
score_encoded = dataframe["Score"].map(scale_mapper)
score_encoded

0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

In [17]:
# Encoding Dictionaries of Features
# Problem
# You have a dictionary and want to convert it into a feature matrix.
# Solution
# Use scikit-learn's DictVectorizer.
# Import library
from sklearn.feature_extraction import DictVectorizer
# One dictionary per observation, mapping feature name -> value
data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=2, Yellow=2),
]
# Ask for a dense array (sparse=False) so the matrix can be displayed directly
dictvectorizer = DictVectorizer(sparse=False)
# Fit the vectorizer and convert the dictionaries into a feature matrix
features = dictvectorizer.fit_transform(data_dict)
# View feature matrix
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [18]:
###
###
# We can get the names of each generated feature from the fitted vectorizer's
# feature_names_ attribute. (The old get_feature_names() method was removed in
# scikit-learn 1.2; get_feature_names_out() is its replacement and returns an array.)
#get feature names (copied into a new list so a later re-fit cannot change it)
feature_names = list(dictvectorizer.feature_names_)
#view feature names
feature_names

['Blue', 'Red', 'Yellow']

In [19]:
# Optional, for illustration only: wrap the feature matrix in a pandas DataFrame
# so that every column is labelled with its feature name.
# Import library
import pandas as pd
# Build the DataFrame from the feature matrix and the generated feature names
pd.DataFrame(data=features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [20]:
# Discussion: DictVectorizer applied to word counts
#
# Word-count dictionaries for five documents
doc_1_word_count = dict(Red=2, Blue=4)
doc_2_word_count = dict(Red=4, Blue=3)
doc_3_word_count = dict(Red=1, Blue=5)
doc_4_word_count = dict(Red=2, Yellow=4)
doc_5_word_count = dict(Red=3, Green=4)

# Gather the per-document dictionaries into a single list
doc_word_counts = [
    doc_1_word_count,
    doc_2_word_count,
    doc_3_word_count,
    doc_4_word_count,
    doc_5_word_count,
]

# Convert the list of word-count dictionaries into a feature matrix.
# Note: this re-fits the dictvectorizer created earlier on the new documents.
dictvectorizer.fit_transform(doc_word_counts)

array([[4., 0., 2., 0.],
       [3., 0., 4., 0.],
       [5., 0., 1., 0.],
       [0., 0., 2., 4.],
       [0., 4., 3., 0.]])

In [28]:
# Imputing Missing Class Values
# Problem
# You have a categorical feature containing missing values that you want to
# replace with predicted values.
# Solution
# Train a machine learning classifier to predict the missing values,
# commonly a k-nearest neighbors (KNN) classifier.
# Load libraries
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Fully observed rows: column 0 is the categorical feature, columns 1-2 are numeric
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Rows whose categorical feature (column 0) is missing
x_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Train the KNN learner: numeric columns are the inputs, column 0 is the label
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the class of the rows where it is missing
imputed_values = trained_model.predict(x_with_nan[:, 1:])

# Put the predicted class back in front of the rows' other features
X_with_imputed = np.column_stack((imputed_values, x_with_nan[:, 1:]))

# Stack the imputed rows on top of the fully observed rows
np.concatenate((X_with_imputed, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [36]:
# Alternative: fill in missing values with the feature's most frequent value.
from sklearn.impute import SimpleImputer
# Stack the rows with missing values on top of the fully observed rows
X_complete = np.concatenate((x_with_nan, X), axis=0)
# Learn the most frequent value of every column ...
imputer = SimpleImputer(strategy='most_frequent').fit(X_complete)
# ... and use it to fill the gaps
imputer.transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [40]:
#Handling Imbalanced Classes
#Problem
#You have a target vector with highly imbalanced classes.
#Solution
#Collect more data. If that isn't possible, change the metrics used to evaluate your model. If that doesn't work, consider
#using a model's built-in class weight parameters (if available), downsampling, or upsampling. 

#To demonstrate our solutions, we need to create some data with imbalanced classes. 
#Fisher's Iris dataset contains three balanced classes of 50 observations, each indicating the species of flower 
#(Iris setosa, Iris virginica, Iris versicolor). To unbalance the dataset, we remove 40 of the 50 Iris setosa observations
#and then merge the Iris virginica and Iris versicolor classes. The end result is a binary target vector indicating if an 
#observation is an Iris setosa flower or not. 
#The result is 10 observations of Iris setosa (class 0) and 100 observations of not Iris setosa (class 1):

#Load libraries
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [38]:
# Load the iris data
iris = load_iris()

# Full feature matrix
feature = iris.data

# Drop the first 40 observations from the feature matrix and the target vector
features = feature[40:]
target = iris.target[40:]

# Binary target vector: 0 where the original class is 0, 1 for every other class
target = np.where(target != 0, 1, 0)

# Look at the imbalanced target vector
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [44]:
#Many algorithms in scikit-learn offer a parameter to weight classes during training to counteract the effect of
#their imbalance. While we have not covered it yet, 
#RandomForestClassifier is a popular classification algorithm and includes a class_weight parameter.
#You can pass an argument specifying the desired class weights explicitly:

#Create weights (class label -> weight)
weights = {0: 0.9, 1: 0.1}
#Create random forest classifier with weights.
#The cell previously built the classifier twice: once from `weights` (result discarded) and
#once from a pasted repr that spelled out every parameter, including max_features='auto',
#a value scikit-learn 1.3 no longer accepts when fitting. A single call is kept;
#n_estimators=10 and n_jobs=1 are the non-default settings of the estimator the cell displayed.
RandomForestClassifier(class_weight=weights, n_estimators=10, n_jobs=1)

RandomForestClassifier(class_weight={0: 0.9, 1: 0.1}, n_estimators=10, n_jobs=1)

In [45]:
#Or you can pass "balanced", which automatically creates weights inversely proportional to class frequencies:
#Create a random forest classifier with balanced class weights.
#The cell previously built the classifier twice: once with only class_weight (result discarded)
#and once from a pasted repr that spelled out every parameter, including max_features='auto',
#a value scikit-learn 1.3 no longer accepts when fitting. A single call is kept;
#n_estimators=10 and n_jobs=1 are the non-default settings of the estimator the cell displayed.
RandomForestClassifier(class_weight="balanced", n_estimators=10, n_jobs=1)

RandomForestClassifier(class_weight='balanced', n_estimators=10, n_jobs=1)

In [49]:
#Alternatively, we can downsample the majority class or upsample the minority class.
#In downsampling, we randomly sample without replacement from the majority class to create a new subset of observations
#equal in size to the minority class.
#For example, if the minority class has 10 observations, we will randomly select 10 observations from the majority class
#and use those 20 observations as our data. 
#Indices of each class' observations
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

#Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

#For every observation of class 0, randomly sample
#from class 1 without replacement.
#A seeded, local generator makes the selection identical on every run
#without touching NumPy's global random state.
i_class1_downsampled = np.random.RandomState(0).choice(i_class1, size=n_class0, replace=False)

#Join together class 0's target vector with the
#downsampled class 1's target vector
np.hstack((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [50]:
# Stack class 0's rows of the feature matrix on top of the
# downsampled class 1 rows and show the first five rows
np.concatenate((features[i_class0], features[i_class1_downsampled]), axis=0)[:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

In [52]:
#Our other option is to upsample the minority class. In upsampling, 
#for every observation in the majority class, we randomly select an observation from the minority class with replacement.
#The end result is the same number of observations from the majority and minority classes. 
#Upsampling is implemented very similarly to downsampling, just in reverse:

#For every observation in class 1, randomly sample from class 0 with replacement.
#A seeded, local generator makes the selection identical on every run
#without touching NumPy's global random state.
i_class0_upsampled = np.random.RandomState(0).choice(i_class0, size=n_class1, replace=True)

#Join together class 0's upsampled target vector with class 1's target vector 
np.concatenate((target[i_class0_upsampled], target[i_class1]))


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [54]:
# Stack class 0's upsampled rows of the feature matrix on top of
# class 1's rows and show the first five rows
np.concatenate((features[i_class0_upsampled], features[i_class1]), axis=0)[:5]

array([[4.8, 3. , 1.4, 0.3],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5.1, 3.8, 1.9, 0.4]])

In [None]:
###Discussion