In [None]:

#Normalizing Observations

In [11]:
#Problem
#You want to rescale the feature values of observations to have unit norm (a totoal length of 1)

#Solution
#Use Normalizer with a norm argument:

#Load libraries
import numpy as np
from sklearn.preprocessing import Normalizer

#Create feature matrix
features =np.array([[0.5, 0.5],
                    [1.1, 3.4],
                    [1.5, 20.2],
                    [1.63,34.4],
                    [10.9, 3.3]])

#Create normalizer
normalizer= Normalizer(norm = "l2")

#Transform feature matrix
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [13]:
##Many rescaling methods (e.g. min-max scaling and standardization)operate on features;
##We can also resacle across individual observations
##Normalizer rescales the values on individual observations to have unit norm (the sum of their lengths is 1)

##This type of rescaling is often used when we have many equivalent features
#(e.g., text classification when every word or n-word group is a feature)

##Normalizer provides three norm options with Euclidean norm (often called L2)

##Transform feature matrix
features_l2_norm =Normalizer(norm ='l2').transform(features)

##Show feature matrix
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [15]:
#Alternatively, we can specifify Manhattan norm(L1):

#Transform feature matrix
features_l1_norm =Normalizer(norm ='l1').transform(features)

#Show feature matrix
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [21]:
#Practically notice that norm= 'l1' rescales an observation's values so they sum to 1,
#which is sometimes be a desirable quality:

#Print sum
print("Sum of the first observation\'s values:",
  features_l1_norm[0,0] + features_l1_norm[0,1])

Sum of the first observation's values: 1.0


In [23]:
##Generating Polynomial and Interaction Features
##Problem: You want to create polynomial and interaction features
#Solution
##Even though some choose to create polynomial and interaction features manually, 
#scikit learn offers a build-in method:

#Load libraries
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

#Create feature matrix
features =np.array([[2,3],
                   [2,3],
                   [2,3]])

#Create PolynomialFeatures object
polynomial_interaction = PolynomialFeatures(degree =2, include_bias =False)

#Create polynomial features
polynomial_interaction.fit_transform(features)




array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [25]:
#The degree parameter determines determines the maximum degree of the polynomial.
#For example, degree =2 will create new features raised to the second power

#We can restrict the features created to only interaction features by setting interaction_only to True:
interaction = PolynomialFeatures(degree =2,
                                interaction_only =True, 
                                include_bias =False)
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

In [27]:
#Discussion
#Polynomial features are often created when we want to include the notion that there exists a nonlinear relationship
#between the features and target

##Transforming Features
#Problem
#You want to make custom transformation to one or more features:

#Load libraries
import numpy as np
from sklearn.preprocessing import FunctionTransformer

#Create feature matrix
features =np.array([[2,3],
                   [2,3],
                   [2,3]])

#Define a simple function
def add_ten(x):
    return x+10

#Create transformer
ten_transformer =FunctionTransformer(add_ten)

#Transform feature matrix
ten_transformer.transform(features)



array([[12, 13],
       [12, 13],
       [12, 13]])

In [28]:
#We can create the same transformation in pandas using apply:

#Load libraries
import pandas as pd

#Create DataFrame
df=pd.DataFrame(features, columns = ["feature_1", "feature_2"])

#Apply function
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


In [38]:
#It is common to want to make some custom transformations to one or more features.

##Detecting Outliers
#You want to identify extreme observations.

#Solution
##Detecting outliers is more of an art than a science.

#However, a common rule of thumb is to assume the data is normally distributed and based on that assumption
#'draw' an ellipse around the data, classify any observation inside the ellips as an inlier(labeled as 1)
#and any observation outside the ellipse as an outlier (labeled as -1):


#Load libraries
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

#Create simulated data
feature,_ = make_blobs(n_samples =110,
                       n_features =2,
                       centers =1,
                       random_state =1)

#Replace the first observation's values with extreme values
features[0,0]= 10000
features[0,1]=10000

#Create detector
outlier_detector = EllipticEnvelope (contamination =.1)

#Fit detector
outlier_detector.fit(features)

#Predict outliers
outlier_detector.predict(features)

array([-1,  1,  1])

In [None]:
#A major limitation of this approach is the need to specify a contamination parameter,
#which is the proportion of observations that are outlier- a value which we don't know.

In [39]:
#Instead of looking at observations as a whole, we can instead look at individual features
#and identify extreme values in those features using interquartile range(IQR):
#Create one feature
feature = features[:,0]

#Create a function to return index of outliers
def indices_of_outliers(x):
    q1,q3 = np.percentile(x,[25,75])
    iqr =q3-q1
    lower_bound =q1-(iqr * 1.5)
    upper_bound =q3+(iqr* 1.5)
    return np.where((x>upper_bound)| (x< lower_bound))

#Run function
indices_of_outliers(features)

(array([], dtype=int64), array([], dtype=int64))

In [40]:
#IQR is the difference between the first and third quartile of a set of data.
#You can think of IQR as the spread of the bulk of the data, 
#with outliers being observations far from the main concentaration of data.

#Outliers are commonly defined as any value 1.5 IQRs less than the first quartile or 1.5IQRs greater than the third 
#quartile.

#Handling Outliers

#Problem
#You have outliers
#Solution
#Typically we have three strategies we can use to handle outliers. First we can drop them:
#Load libraries
import pandas as pd

#Create DataFrames
houses =pd.DataFrame()
houses['Price']= [534433, 392333, 293222, 4322032]
houses['Bathrooms']=[2,3.5,2,116]
houses['Square_Feet']=[1500, 2500, 1500, 48000]

#Filter observations
houses[houses['Bathrooms']<20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [41]:
#Second, we can mark them as outliers and include it as a feature:
houses["Outliers"] =np.where(houses["Bathrooms"]< 20,0, 1)

#Show data
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [42]:
#Finally, we can transform the feature to dampen the effect of the outlier:

#Log features
houses["log_Of_Square_Feet"]=[np.log(x)for x in houses ["Square_Feet"]]

#Show data
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers,log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956
