# Preprocessing Structured Data

# Convert Pandas Categorical Data For Scikit-Learn


In [None]:
# Import required packages
from sklearn import preprocessing
import pandas as pd

In [None]:
# Toy dataset: one list per column, five observations each.
raw_data = dict(
    patient=[1, 2, 3, 4, 5],
    obs=[1, 2, 3, 1, 2],
    treatment=[0, 1, 0, 1, 0],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)

In [None]:

# Build the DataFrame; `columns` pins the column order explicitly.
df = pd.DataFrame(raw_data, columns = ['patient', 'obs', 'treatment', 'score'])

# Display the resulting table.
df

In [None]:
# Create the label encoder (not fitted to any data yet)

le = preprocessing.LabelEncoder()

In [None]:
# Fit the encoder to the pandas column

le.fit(df['score'])

In [None]:
# View the labels 

list(le.classes_)

In [None]:
# Transform Categories Into Integers
le.transform(df['score'])

In [None]:
# Transform integer codes back into their category names.
# Each code is a position in `le.classes_`, so [2, 2, 1] selects the third,
# the third and the second fitted class.

list(le.inverse_transform([2,2,1]))

# Delete Observations With Missing Values


In [None]:
# Load libraries
import numpy as np
import pandas as pd

In [None]:
# Create feature matrix
X = np.array([[1.1, 11.1], 
              [2.2, 22.2], 
              [3.3, 33.3], 
              [4.4, 44.4],
              [8.8, 88.8],
              [np.nan, 55]])

#### Delete Observations With Missing Values


In [None]:
# Keep only the rows that contain no NaN in any column
X[np.logical_not(np.any(np.isnan(X), axis=1))]

# Deleting Missing Values


In [None]:
# Load library
import numpy as np
import pandas as pd

In [None]:
# Create a NumPy feature matrix whose last row has a missing value
X = np.array([
    [1.11111, 2],
    [6, 3.3],
    [8.8, 4],
    [9, 5.5],
    [np.nan, 4],
])

In [None]:
X

In [None]:
# Drop missing values using NumPy:
# isnan flags each NaN cell, any(axis=1) flags rows with at least one NaN,
# and ~ inverts the mask so only complete rows are selected.
X[~np.isnan(X).any(axis=1)]

In [None]:
# Drop missing values using pandas: first wrap the array in a DataFrame
# with one named column per feature

df = pd.DataFrame(X, columns=['cosa_buena', 'cosa_mala'])


In [None]:
# Remove observations with missing values 
df.dropna()

# Detecting Outliers


In [None]:
# Load libraries
import numpy as np 
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [None]:
# Create data: 10 observations with 2 features, generated around a single centre.
# `_` receives the second value returned by make_blobs (the cluster labels).

X, _ = make_blobs(n_samples = 10,
                 n_features = 2 ,
                 centers = 1 ,
                 random_state = 1)


In [None]:
X

In [None]:
_

In [None]:
# Overwrite both feature values of the first observation with an extreme value
X[0, :] = 10000

#### Detect Outliers


EllipticEnvelope assumes the data is normally distributed and based on that assumption “draws” an ellipse around the data, classifying any observation inside the ellipse as an inlier (labeled as 1) and any observation outside the ellipse as an outlier (labeled as -1). A major limitation of this approach is the need to specify a contamination parameter which is the proportion of observations that are outliers, a value that we don’t know.



In [None]:
# Create detector; `contamination` is the assumed proportion of outliers (here 10%)

outlier_detector = EllipticEnvelope(contamination=.1)
outlier_detector

In [None]:
# Fit Detector 
outlier_detector.fit(X)


In [None]:
# Predict outliers: 1 marks an inlier, -1 marks an outlier

outlier_detector.predict(X)

# Discretize Features


In [None]:
# Load libraries
from sklearn.preprocessing import Binarizer
import numpy as np

In [None]:
# Create a single-feature column vector of ages
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)
age

#### Option 1: Binarize Feature


In [None]:
# Create binarizer: values strictly greater than the threshold become 1,
# all other values become 0.
# `threshold` is passed by keyword because Binarizer's constructor
# parameters are keyword-only in current scikit-learn releases.
binarizer = Binarizer(threshold=18)

binarizer

In [None]:
# Transform feature
binarizer.fit_transform(age)

#### Option 2: Break Up Feature Into Bins


In [None]:
# Bin feature: each age is replaced by the index of the bin it falls into.
# With the default right=False a value x gets index i when bins[i-1] <= x < bins[i],
# so values below 20 get 0 and values of 64 or more get 3.
np.digitize(age, bins=[20,30,64])

# Encoding Ordinal Categorical Features


#### Convert a categorical variable into an integer

In [None]:
# Loading libraries 
import pandas as pd 

In [None]:
# Create feature matrix

df = pd.DataFrame({'Score': ['Low', 'Low', 'Medium', 'Medium', 'High']})
df

In [None]:
# Create the scale mapper: ordinal category name -> integer rank
scale_mapper = dict(Low=1, Medium=2, High=3)
scale_mapper

In [None]:
# Map each ordinal category in 'Score' to its integer rank.
# Series.map performs the dictionary lookup directly and yields a numeric
# column, instead of relying on the dtype downcasting that Series.replace
# performs (deprecated in recent pandas releases).
# NOTE: a category missing from scale_mapper would become NaN.

df['Scale'] = df['Score'].map(scale_mapper)
df

# Handling Imbalanced Classes With Downsampling


# DOWNSAMPLING : 

#### A strategy to handle imbalanced classes by creating a random subset of the majority class equal in size to the minority class.
In downsampling, we randomly sample without replacement from the majority class (i.e. the class with more observations) to create a new subset of observations equal in size to the minority class.

In [None]:
# Load Libraries
import numpy as np
from sklearn.datasets import load_iris

In [None]:
# Load df

iris = load_iris()
iris.keys()

In [None]:
# Create feature matrix

X = iris.data
X

In [None]:
#Create target vector

y = iris.target
y

### Make Iris Data Set Imbalanced 


In [None]:
# Remove the first 40 observations (rows 0-39) from the feature matrix.
# The same slice must be applied to the target vector to keep them aligned.

X = X[40:,:]
X

In [None]:
# Remove first 40 observations 

y = y[40:]
y

In [None]:
# Collapse the target to binary: class 0 stays 0, every other class becomes 1
y = (y != 0).astype(int)
y

## Downsample majority class to match minority class

In [None]:
# Positions in y of the class 0 observations
i_class0 = np.flatnonzero(y == 0)
i_class0

In [None]:
# Positions in y of the class 1 observations
i_class1 = np.flatnonzero(y == 1)
i_class1

In [None]:
# Number of observations in each class

n_class0 = len(i_class0)
n_class0

In [None]:
# Number of observations in each class

n_class1 = len(i_class1)
n_class1

In [None]:
# For every observation of class 0, randomly sample from class 1 without replacement,
# so the downsampled class 1 has exactly n_class0 observations.
# NOTE: no random seed is set here, so the selected indices differ between runs.

i_class1_downsampled = np.random.choice(i_class1, size = n_class0, replace = False)
i_class1_downsampled

In [None]:
# Join class 0's target vector with the downsampled class 1's target vector
np.concatenate((y[i_class0], y[i_class1_downsampled]))

# Handling Imbalanced Classes With Upsampling

## UPSAMPLING

#### A strategy to handle imbalanced classes by repeatedly sampling with replacement from the minority class until it is the same size as the majority class.

In upsampling, for every observation in the majority class, we randomly select an observation from the minority class with replacement. The end result is the same number of observations from the minority and majority classes.



In [None]:
# Load libraries
import numpy as np
from sklearn.datasets import load_iris

In [None]:
# Load iris dataset

iris = load_iris()

In [None]:
# Create feature matrix

X = iris.data
y = iris.target

In [None]:
# Make Iris DataSet imbalanced

X = X[40:,:]
y = y[40:]

In [None]:
# Create binary target vector indicating if class 0

y = np.where((y==0), 0,1)

# Look at the imbalanced target vector
y

### Upsampling Minority Class To Match Majority

In [None]:
# Indicies of each class'observations 

i_class0 = np.where(y==0)[0]
i_class0

In [None]:
# Indicies of each class'observations 

i_class1 = np.where(y==1)[0]
i_class1

In [None]:
# Number of observations in each class

n_class0 = len(i_class0)
n_class0

In [None]:
# Number of observations in each class

n_class1 = len(i_class1)
n_class1

In [None]:
# For every observation in class 1, randomly sample from class 0 with replacement,
# so the upsampled class 0 has exactly n_class1 observations.
# NOTE: no random seed is set here, so the selected indices differ between runs.

i_class0_UPsampled = np.random.choice(i_class0, size = n_class1, replace = True)   # replace=True is the difference from downsampling
i_class0_UPsampled

In [None]:
# Join class 0's upsampled target vector with class 1's target vector
np.hstack((y[i_class0_UPsampled], y[i_class1]))

# Handling Outliers

### Drop:
Not a great option. We lose lots of information. Find out if genuine extreme value or broken sensor.

### Mark:
Safest option. We can see if the outliers had an effect.

### Rescale: 
Log-transform the values so outliers don't have as great an effect.

In [None]:
# Loading libraries

import pandas as pd

In [None]:
# Create the DataFrame in one step; the last row holds the extreme values
houses = pd.DataFrame({
    'Price': [1000, 2000, 3000, 4000, 50000],
    'Bathrooms': [2, 3, 4, 4.5, 160],
    'Square_Feet': [1500, 2500, 3500, 2500, 48000],
})
houses

## DROP:

In [None]:
# Drop observations at or above a cutoff: keep only houses with fewer than 8 bathrooms.
# The result is a filtered copy; `houses` itself is not modified.

houses[houses['Bathrooms']<8]

## MARK: 

In [None]:
# Load library

import numpy as np

In [None]:
# Create feature based on boolean condition

houses['Outlier'] = np.where(houses['Square_Feet'] < 3000,0,1)   # 0 when below 3000 square feet, otherwise 1
houses

## RESCALE:   ( TO LOGARITHMS)

In [None]:
houses

In [None]:
# Log feature: take the natural logarithm of the whole column at once.
# np.log is vectorized, so no Python-level loop over the values is needed;
# the result is the same float column, aligned on the DataFrame's index.

houses['Log_Of_Square_Feet'] = np.log(houses['Square_Feet'])
houses


# Impute Missing Values With Means

Mean imputation replaces missing values with the mean value of that feature/variable. Mean imputation is one of the most ‘naive’ imputation methods because unlike more complex methods like k-nearest neighbors imputation, it does not use the information we have about an observation to estimate a value for it.




In [None]:
# Load Libraries

import pandas as pd
import numpy as np 
from sklearn.preprocessing import Imputer

In [None]:
# Create data

# Create an empty dataset

df = pd.DataFrame()

# Create two variables called x0 & x1. Make the first value of x1 a missing value

df['x0'] = [0.3051,0.4949,0.6974,0.3769,0.2231,0.341,0.4436,0.5897,0.6308,0.5]
df['x1'] = [np.nan,0.2654,0.2615,0.5846,0.4615,0.8308,0.4962,0.3269,0.5346,0.6731]

df


### Fit Imputer 

In [None]:
# Create an imputer object that looks for NaN values and replaces them with
# the mean of their feature. SimpleImputer always works column by column,
# which is what the old `axis=0` argument asked for.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22, so its
# replacement is imported here; the missing-value marker is `np.nan` rather
# than the string 'NaN'.
from sklearn.impute import SimpleImputer

mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mean_imputer

In [None]:
# Train the imputer on the df dataset; the value returned by fit is
# assigned back to the same name

mean_imputer = mean_imputer.fit(df)
mean_imputer

### Apply Imputer

In [None]:
# Apply the imputer to the values of the df dataset.
# NOTE(review): despite its name, `imputed_df` is presumably a NumPy array
# rather than a DataFrame - confirm against the displayed output.

imputed_df = mean_imputer.transform(df.values)

imputed_df

## 0.49273333 is the substituted mean (the mean of the non-missing x1 values)

# Imputing Missing Class Labels
##  Fill Missing Values’ Class With Most Frequent Class



In [None]:
# Load libraries 

import numpy as np
from sklearn.preprocessing import Imputer

### Create feature matrix with missing values

In [None]:
# Create feature matriz with categorical feature

X = np.array([[0, 1.1, 2.2],
             [1, 3.3, 4.4],
             [0, 5.5, 6.6],
             [0, -7.7, -8.8],
             [np.nan, 0.87, 9.9],
             [np.nan, -0.67, -0.22]])
X

### Fill Missing Values’ Class With Most Frequent Class


In [None]:
# Create an imputer object that fills missing values (NaN by default) with
# the most frequent value of their column.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22, so its
# replacement is imported here; SimpleImputer always works column by column,
# which is what the old `axis=0` argument asked for.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='most_frequent')

In [None]:
# Fill missing values class with most frequest class

imputer.fit_transform(X)

# Imputing Missing Class Labels Using k-Nearest Neighbors


In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Create feature matrix with categorical feature

X = np.array([[0, 2.10, 1.45], 
              [1, 1.18, 1.33], 
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])
X

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

### Create Feature Matrix With Missing Values

In [3]:
## Create feature matrix with missing values in the categorical feature
 
X_with_nan = np.array([[np.nan, 0.87, 1.31],
                      [np.nan, -0.67, -0.22]])
X_with_nan

array([[  nan,  0.87,  1.31],
       [  nan, -0.67, -0.22]])

### Train k-Nearest Neighbor Classifier


In [4]:
# Create the KNN classifier (not trained yet): 3 neighbours,
# with votes weighted by distance

clf = KNeighborsClassifier(3, weights = 'distance')

clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='distance')

In [5]:
# Train the classifier: columns 1 onwards are the features,
# column 0 is the class to predict
trained_model = clf.fit(X[:,1:], X[:,0])
trained_model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='distance')

### Predict missing values'class


In [6]:
# Predict missing values class'

imputed_values = trained_model.predict(X_with_nan[:,1:])
imputed_values

array([ 0.,  1.])

In [7]:
# Join the column of predicted classes with the observations' other features.
# reshape(-1,1) turns the 1-D predictions into a single column so they can be
# stacked horizontally in front of the feature columns.

X_with_imputed = np.hstack((imputed_values.reshape(-1,1), X_with_nan[:,1:]))


In [8]:
# Join two feature matrices

np.vstack((X_with_imputed, X))

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

# Normalizing Observations

#### Rescaling the feature values of each observation so that they have unit norm. Two common norms are L1 and L2 (L2 is the default).


In [9]:
# Load libraries

from sklearn.preprocessing import Normalizer
import numpy as np

In [12]:
# Create feature matrix

X = np.array([[0.5, 0.5],
              [1.1, 3.4],
              [1.5, 20.2],
              [1.63, 34.4],
              [10.9, 3.3]])
X

array([[  0.5 ,   0.5 ],
       [  1.1 ,   3.4 ],
       [  1.5 ,  20.2 ],
       [  1.63,  34.4 ],
       [ 10.9 ,   3.3 ]])

### Normalize observations 

Normalizer rescales the values of each individual observation to have unit norm (with the L2 norm, the squared values of each observation sum to one).

In [13]:
# Create Normalizer

normalizer = Normalizer(norm='l2')  # L2: rescale each observation (row) to unit Euclidean length

# Transform feature matrix
# NOTE(review): transform is called without a prior fit; this presumably relies
# on Normalizer being stateless - confirm against the scikit-learn docs.

normalizer.transform(X)

array([[ 0.70710678,  0.70710678],
       [ 0.30782029,  0.95144452],
       [ 0.07405353,  0.99725427],
       [ 0.04733062,  0.99887928],
       [ 0.95709822,  0.28976368]])

# One-Hot Encode Features With Multiple Labels



In [20]:
# Load libraries

from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

In [21]:
# Create the multi-label target: a list of tuples, one tuple of labels per observation

y = [('Granada', 'Málaga'),
    ('Zanzibar', 'Madrid'),
    ('Zanzibar', 'Bali'),
    ('Zanzibar', 'Granada'),
    ('Bali', 'Granada')]




### One-hot Encode Data

In [22]:
# Create MultiLabelBinarizer object

one_hot = MultiLabelBinarizer()

# One-hot encode data: one output column per distinct label,
# with a 1 for every label present in the observation's tuple

one_hot.fit_transform(y)

array([[0, 1, 0, 1, 0],
       [0, 0, 1, 0, 1],
       [1, 0, 0, 0, 1],
       [0, 1, 0, 0, 1],
       [1, 1, 0, 0, 0]])

In [23]:
# View column Headers

one_hot.classes_

array(['Bali', 'Granada', 'Madrid', 'Málaga', 'Zanzibar'], dtype=object)

# One-Hot Encode Nominal Categorical Features

### One-hot encoding allows us to turn nominal categorical data into features with numerical values, without mathematically implying any ordinal relationship between the classes.

#### Example : feature( 'Apple','Pear', 'Apple','Pear', 'Apple')
          Apple(1,0,1,0,1)
          Pear (0,1,0,1,0)

In [25]:
# Load Libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer

###  Create Data With One Class Label

In [26]:
# Create NumPy array
x = np.array([['Granada'],
              ['Bali'],
              ['Granada'],
              ['Zanzibar'],
              ['Granada']])
x

array([['Granada'],
       ['Bali'],
       ['Granada'],
       ['Zanzibar'],
       ['Granada']],
      dtype='<U8')

## One-hot Encode Data (Method 1)


In [27]:
# Create LabelBinarizer object

one_hot = LabelBinarizer()

# One hot encode data

one_hot.fit_transform(x)

array([[0, 1, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [0, 1, 0]])

In [28]:
# View Columns Headers
# View classes

one_hot.classes_

array(['Bali', 'Granada', 'Zanzibar'],
      dtype='<U8')

## One-hot Encode Data (Method 2)

In [29]:
# Dummy Feature

pd.get_dummies(x[:,0])

Unnamed: 0,Bali,Granada,Zanzibar
0,0,1,0
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0


# Preprocessing Categorical Features

### Often, machine learning methods (e.g. logistic regression, SVM with a linear kernel, etc) will require that categorical variables be converted into dummy variables (also called OneHot encoding). For example, a single feature Fruit would be converted into three features, Apples, Oranges, and Bananas, one for each category in the categorical feature.

There are two common ways to preprocess categorical features: using pandas or scikit-learn.


In [30]:
# Load libraries

from sklearn import preprocessing
from sklearn.pipeline import Pipeline
import pandas as pd

### Create data

In [31]:
raw_data = {'first_name': ['Kike', 'Nuno', 'Carlos', 'Messi', 'Coutinho'],
           'last_name': ['Pedrinaci','Carvalho', 'Rosado', 'D10S', 'Correia'],
           'age': ['29', '25', '28', '31', '27'],
           'city' : ['Granada', 'Lisboa', 'Ávila', 'Rosario','Rio de Janeiro']
           }
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'city'])
df

Unnamed: 0,first_name,last_name,age,city
0,Kike,Pedrinaci,29,Granada
1,Nuno,Carvalho,25,Lisboa
2,Carlos,Rosado,28,Ávila
3,Messi,D10S,31,Rosario
4,Coutinho,Correia,27,Rio de Janeiro


### Convert Nominal Categorical Feature Into Dummy Variables Using Pandas


In [32]:
# Create dummy variables for every unique category in df.city

pd.get_dummies(df['city'])

Unnamed: 0,Granada,Lisboa,Rio de Janeiro,Rosario,Ávila
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,0,0,1
3,0,0,0,1,0
4,0,0,1,0,0


### Convert Nominal Categorical Data Into Dummy (OneHot) Features Using Scikit

#### Note that pd.get_dummies() and the scikit-learn methods produce the same output matrix.



In [33]:
# Convert string category names to integers (fit and transform in one step)

integerized_data = preprocessing.LabelEncoder().fit_transform(df['city'])
integerized_data

array([0, 1, 4, 3, 2])

In [35]:
# Convert integer categorical representations to OneHot encodings.
# reshape(-1,1) turns the 1-D codes into a single column before encoding;
# toarray() converts the encoder's result into a regular dense array.

preprocessing.OneHotEncoder().fit_transform(integerized_data.reshape(-1,1)).toarray()

array([[ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  0.]])

# Preprocessing Iris Data


In [38]:
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split



In [41]:
# Load Data

iris = datasets.load_iris()
X = iris.data
y = iris.target

### Split Data For Cross Validation

In [48]:
# Randomly split the data into four new datasets: training features, test features,
# training outcome and test outcome. The test data is 30% of the full dataset,
# and random_state fixes the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)


### Standardize Feature Data


In [50]:
# Create the standard scaler
sc = StandardScaler()

# Learn the mean and standard deviation from the training data only and,
# in the same step, scale the training data to mean 0 and unit variance
X_train_std = sc.fit_transform(X_train)

# Scale the test data with the statistics learned from the training data
X_test_std = sc.transform(X_test)

In [55]:
# Feature Test Data, non standarized

X_test[0:5]

array([[ 6.1,  2.8,  4.7,  1.2],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 7.7,  2.6,  6.9,  2.3],
       [ 6. ,  2.9,  4.5,  1.5],
       [ 6.8,  2.8,  4.8,  1.4]])

In [56]:
# Feature Test Data, standarized

X_test_std[0:5]

array([[ 0.3100623 , -0.49582097,  0.48403749, -0.05143998],
       [-0.17225683,  1.92563026, -1.26851205, -1.26670948],
       [ 2.23933883, -0.98011121,  1.76924049,  1.43388941],
       [ 0.18948252, -0.25367584,  0.36720086,  0.35364985],
       [ 1.15412078, -0.49582097,  0.54245581,  0.21861991]])

# Rescale A Feature
### Min Max Scaling

Rescales feature values to the range [0, 1]. Rescaled value: $x' = \frac{x - \min(x)}{\max(x) - \min(x)}$

In [57]:
import numpy as np 
from sklearn import preprocessing 

### Create Feature

In [58]:
# Create a single-feature column vector
x = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)
x

array([[-500.5],
       [-100.1],
       [   0. ],
       [ 100.1],
       [ 900.9]])

### Rescale Feature Using Min-Max


In [59]:
# Create Scaler

minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

In [61]:
# Learn the feature's minimum and maximum, then rescale it with them
x_scale = minmax_scale.fit(x).transform(x)
x_scale

array([[ 0.        ],
       [ 0.28571429],
       [ 0.35714286],
       [ 0.42857143],
       [ 1.        ]])

# Standardize A Feature

### Standardization is a common scaling method. Each standardized value represents the number of standard deviations the original value is from the mean. It rescales a feature to have a mean of 0 and unit variance.

$x_i' = \frac{x_i - \bar{x}}{\sigma}$    [standardized feature value = (value of the ith observation - mean of the feature vector) / (standard deviation of the feature vector)]


In [73]:
# Load libraries

from sklearn import preprocessing
import numpy as np

In [74]:
# Create feature
x = np.array([[-500.5], 
              [-100.1], 
              [0], 
              [100.1], 
              [900.9]])
x

array([[-500.5],
       [-100.1],
       [   0. ],
       [ 100.1],
       [ 900.9]])

### Standardize Feature


In [75]:
#Create scaler

scaler = preprocessing.StandardScaler()

In [77]:
# Fit the scaler on the feature and transform it in one step

standardized = scaler.fit_transform(x)
standardized

array([[-1.26687088],
       [-0.39316683],
       [-0.17474081],
       [ 0.0436852 ],
       [ 1.79109332]])