<a href="https://colab.research.google.com/github/somilasthana/MachineLearningSkills/blob/master/SkLearn_Data_Transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pipelines and composite estimators

In [0]:
"""
Transformers are usually combined with classifiers, regressors or other 
estimators to build a composite estimator. The most common tool is a Pipeline.
"""

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [6]:
estimator = [('PCA', PCA()), ('clf', SVC)]
pipe = Pipeline(estimator)
pipe

Pipeline(memory=None,
         steps=[('PCA',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf', <class 'sklearn.svm.classes.SVC'>)],
         verbose=False)

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Binarizer
make_pipeline(Binarizer(), MultinomialNB)

Pipeline(memory=None,
         steps=[('binarizer', Binarizer(copy=True, threshold=0.0)),
                ('abcmeta', <class 'sklearn.naive_bayes.MultinomialNB'>)],
         verbose=False)

In [9]:
pipe.steps

[('PCA',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)),
 ('clf', sklearn.svm.classes.SVC)]

In [11]:
from sklearn.datasets import load_digits
digits = load_digits()
pca1 = PCA()
svm1 = SVC(gamma='scale')
pipe1 = Pipeline([('reduce_dim', pca1), ('clf', svm1)])
pipe1.fit(digits.data, digits.target)

Pipeline(memory=None,
         steps=[('reduce_dim',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='scale',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [12]:
print(pca1.components_) 

[[-1.77484909e-19 -1.73094651e-02 -2.23428835e-01 ... -8.94184677e-02
  -3.65977111e-02 -1.14684954e-02]
 [ 3.27805401e-18 -1.01064569e-02 -4.90849204e-02 ...  1.76697117e-01
   1.94547053e-02 -6.69693895e-03]
 [-1.68358559e-18  1.83420720e-02  1.26475543e-01 ...  2.32084163e-01
   1.67026563e-01  3.48043832e-02]
 ...
 [ 0.00000000e+00  4.27173158e-16 -3.24255744e-16 ...  1.11022302e-16
  -5.55111512e-17  1.38777878e-16]
 [ 0.00000000e+00  5.01431234e-17 -9.75867742e-17 ... -6.93889390e-17
   5.55111512e-17 -3.46944695e-18]
 [ 1.00000000e+00 -1.68983002e-17  5.73338351e-18 ...  8.66631300e-18
  -1.57615962e-17  4.07058917e-18]]


In [16]:
from tempfile import mkdtemp
from shutil import rmtree

cachedir = mkdtemp()
pca2=PCA()
svm2 = SVC(gamma='scale')
cached_pipe = Pipeline([('reduce_dim', pca2), ('clf', svm2)], memory=cachedir)
cached_pipe.fit(digits.data, digits.target)

Pipeline(memory='/tmp/tmpn_tial1r',
         steps=[('reduce_dim',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('clf',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='scale',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [18]:
# Cannot use print(pca2.components_) 
print(cached_pipe.named_steps['reduce_dim'].components_)

[[-1.77484909e-19 -1.73094651e-02 -2.23428835e-01 ... -8.94184677e-02
  -3.65977111e-02 -1.14684954e-02]
 [ 3.27805401e-18 -1.01064569e-02 -4.90849204e-02 ...  1.76697117e-01
   1.94547053e-02 -6.69693895e-03]
 [-1.68358559e-18  1.83420720e-02  1.26475543e-01 ...  2.32084163e-01
   1.67026563e-01  3.48043832e-02]
 ...
 [ 0.00000000e+00  4.27173158e-16 -3.24255744e-16 ...  1.11022302e-16
  -5.55111512e-17  1.38777878e-16]
 [ 0.00000000e+00  5.01431234e-17 -9.75867742e-17 ... -6.93889390e-17
   5.55111512e-17 -3.46944695e-18]
 [ 1.00000000e+00 -1.68983002e-17  5.73338351e-18 ...  8.66631300e-18
  -1.57615962e-17  4.07058917e-18]]


In [0]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [22]:
boston = load_boston()
print(boston.feature_names)
boston.data[1]

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


array([2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
       6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
       1.7800e+01, 3.9690e+02, 9.1400e+00])

In [0]:
X = boston.data
y = boston.target

In [24]:
y[:5]

array([24. , 21.6, 34.7, 33.4, 36.2])

In [26]:
boston = load_boston()
X = boston.data
y = boston.target
transformer = QuantileTransformer(output_distribution='normal') #This method transforms the features to follow a uniform or a normal distribution.
regressor = LinearRegression()
regr = TransformedTargetRegressor(regressor=regressor,
                                  transformer=transformer)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
regr.fit(X_train, y_train) 

print('R2 score: {0:.2f}'.format(regr.score(X_test, y_test)))

R2 score: 0.67


  % (self.n_quantiles, n_samples))


In [28]:
simple_linear = LinearRegression().fit(X_train, y_train)
print("R2 score: {0:.2f}".format(simple_linear.score(X_test, y_test)))

R2 score: 0.64


In [0]:
"FeatureUnion : combines several transformer objects into a new transformer that combines their output"

In [31]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA
estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]
combined = FeatureUnion(estimators)
combined 


FeatureUnion(n_jobs=None,
             transformer_list=[('linear_pca',
                                PCA(copy=True, iterated_power='auto',
                                    n_components=None, random_state=None,
                                    svd_solver='auto', tol=0.0, whiten=False)),
                               ('kernel_pca',
                                KernelPCA(alpha=1.0, coef0=1, copy_X=True,
                                          degree=3, eigen_solver='auto',
                                          fit_inverse_transform=False,
                                          gamma=None, kernel='linear',
                                          kernel_params=None, max_iter=None,
                                          n_components=None, n_jobs=None,
                                          random_state=None,
                                          remove_zero_eig=False, tol=0))],
             transformer_weights=None, verbose=False)

In [0]:
import pandas as pd
X = pd.DataFrame(
    {'city': ['London', 'London', 'Paris', 'Sallisaw'],
     'title': ["His Last Bow", "How Watson Learned the Trick",
               "A Moveable Feast", "The Grapes of Wrath"],
     'expert_rating': [5, 3, 4, 5],
     'user_rating': [4, 5, 4, 3]})

In [0]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

column_trans= ColumnTransformer(
    [
        ('city_category', OneHotEncoder(dtype='int'), ['city']),
        ('title_bow', CountVectorizer(), 'title')
    ],
    remainder='drop'
)

In [34]:
column_trans.fit(X)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('city_category',
                                 OneHotEncoder(categorical_features=None,
                                               categories=None, drop=None,
                                               dtype='int',
                                               handle_unknown='error',
                                               n_values=None, sparse=True),
                                 ['city']),
                                ('title_bow',
                                 CountVectorizer(analyzer='word', binary=False,
                                                 decode_error='strict',
                                                 dtype=<class 'numpy.int64'>,
                                                 encoding='utf-8',
                                                 input='content',
                          

In [35]:
column_trans.get_feature_names()

['city_category__x0_London',
 'city_category__x0_Paris',
 'city_category__x0_Sallisaw',
 'title_bow__bow',
 'title_bow__feast',
 'title_bow__grapes',
 'title_bow__his',
 'title_bow__how',
 'title_bow__last',
 'title_bow__learned',
 'title_bow__moveable',
 'title_bow__of',
 'title_bow__the',
 'title_bow__trick',
 'title_bow__watson',
 'title_bow__wrath']

In [37]:
column_trans.transform(X).toarray()

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1]], dtype=int64)

In [40]:
column_trans= ColumnTransformer(
    [
        ('city_category', OneHotEncoder(dtype='int'), ['city']),
        ('title_bow', CountVectorizer(), 'title')
    ],
    remainder='passthrough'
)

column_trans.fit(X)

column_trans.transform(X)

array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 5, 4],
       [1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 3, 5],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 4],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 5, 3]])

In [0]:
"""
Feature Extraction: transforming arbitrary data text, images into numerical features

DictVectorizer implements one-of-k or one hot encoding for categorical features

"""

In [41]:
measurements = [
    {'city': 'Dubai', 'temperature': 33.},
    {'city': 'London', 'temperature': 12.},
    {'city': 'San Francisco', 'temperature': 18.},
]

from sklearn.feature_extraction import DictVectorizer
vec = DictVectorizer()

vec.fit_transform(measurements).toarray()

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [42]:
vec.get_feature_names()

['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']

In [0]:
pos_window = [
    {
        'word-2': 'the',
        'pos-2': 'DT',
        'word-1': 'cat',
        'pos-1': 'NN',
        'word+1': 'on',
        'pos+1': 'PP',
    },
    # in a real application one would extract many such dictionaries
]

In [44]:
vec = DictVectorizer()
pos_vectorized= vec.fit_transform(pos_window)

pos_vectorized.toarray()

array([[1., 1., 1., 1., 1., 1.]])

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
vectorizer = CountVectorizer()
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [47]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

X = vectorizer.fit_transform(corpus)
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [48]:
analyze = vectorizer.build_analyzer()

analyze("This is a text document to analyze.") == (
    ['this', 'is', 'text', 'document', 'to', 'analyze'])

True

In [49]:
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [51]:
vectorizer.transform(['document is first third second']).toarray()

array([[0, 1, 1, 1, 0, 1, 0, 1, 0]])

In [52]:
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b', 
    min_df=1)

analyzer = bigram_vectorizer.build_analyzer()

analyzer('Bi-gram are cool!')

['bi', 'gram', 'are', 'cool', 'bi gram', 'gram are', 'are cool']

In [53]:
bigram_vectorizer.fit_transform(corpus).toarray()

array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])

In [1]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)

In [2]:
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]

tfidf = transformer.fit_transform(counts)
tfidf


<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [3]:
tfidf.toarray()

array([[0.81940995, 0.        , 0.57320793],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [1.        , 0.        , 0.        ],
       [0.47330339, 0.88089948, 0.        ],
       [0.58149261, 0.        , 0.81355169]])

In [6]:
ngram_vectorize = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2))
counts = ngram_vectorize.fit_transform(['words', 'wprds'])
ngram_vectorize.get_feature_names()

[' w', 'ds', 'or', 'pr', 'rd', 's ', 'wo', 'wp']

In [8]:
counts.toarray()

array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])

In [9]:
ngram_vectorize = CountVectorizer(analyzer='char', ngram_range=(2, 2))
count = ngram_vectorize.fit_transform(['words', 'wprds'])
ngram_vectorize.get_feature_names()

['ds', 'or', 'pr', 'rd', 'wo', 'wp']

In [10]:
counts.toarray()

array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])

In [0]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer(object):
  def __init__(self):
    self.wnl = WordNetLemmatizer()
  def __call__(self, doc):
    return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [0]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())
vcount = vect.fit_transform(["sklearn is amazing library"])
vect.get_feature_names()

In [0]:
"Feature Selection: Done to either to improve estimators’ accuracy scores or to boost their performance on very high-dimensional datasets"

In [16]:
from sklearn.feature_selection import VarianceThreshold
X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]

sel = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
sel.fit_transform(X)

array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

In [18]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()

X, y = iris.data, iris.target
X.shape

(150, 4)

In [20]:
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
X_new.shape

(150, 2)

In [23]:
"""
SelectFromModel:
The features are considered unimportant and removed, if the corresponding 
coef_ or feature_importances_ values are below the provided threshold parameter. 
Apart from specifying the threshold numerically, there are built-in heuristics 
for finding a threshold using a string argument.
"""
from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
iris = load_iris()
X, y = iris.data, iris.target
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

(150, 3)

In [25]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel

iris = load_iris()
X, y = iris.data, iris.target
clf = ExtraTreesClassifier(n_estimators=50)
clf = clf.fit(X,y)

clf.feature_importances_

array([0.07244104, 0.04880842, 0.42827602, 0.45047452])

In [26]:
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)

X_new.shape

(150, 2)

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)

clf = Pipeline([
    ('feature_selection', SelectFromModel(lsvc)),
    ('classification', RandomForestClassifier())
]
)

clf.fit(X,y)



Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(C=0.01, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     loss='squared_hinge',
                                                     max_iter=1000,
                                                     multi_class='ovr',
                                                     penalty='l1',
                                                     random_state=None,
                                                     tol=0.0001, verbose=0),
                                 max_features=None, norm_order=1, prefit=False,
                                 threshold=None)),
                ('classification',
                 RandomForestClassifier(bootstrap=T

In [0]:
"""
The sklearn.preprocessing package provides several common utility functions and 
transformer classes to change raw feature vectors into a representation that is 
more suitable for the downstream estimators.
"""

In [30]:
from sklearn import preprocessing
import numpy as np
X_train = np.array([[ 1., -1.,  2.],
                    [ 2.,  0.,  0.],
                    [ 0.,  1., -1.]])

X_scaled = preprocessing.scale(X_train)

X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [31]:
scaler = preprocessing.StandardScaler().fit(X_train)

scaler.transform(X_train)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [38]:
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit_transform(X_train)

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [39]:
X_test = np.array([[-3, -1, 4]])
min_max_scaler.transform(X_test)

array([[-1.5       ,  0.        ,  1.66666667]])

In [40]:
"""
MaxAbsScaler scales in a way that the training data lies within the range [-1, 1] 
by dividing through the largest maximum value in each feature. It is meant for 
data that is already centered at zero or sparse data.
"""

max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_scaler.fit_transform(X_train)

array([[ 0.5, -1. ,  1. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. , -0.5]])

In [41]:
max_abs_scaler.transform(X_test)

array([[-1.5, -1. ,  2. ]])

In [0]:
"""
Data contains many outliers, scaling using the mean and variance of the 
data is likely to not work very well."""

Non Linear Transformation


Quantile transforms put all features into the same desired distribution based on the formula  where  is the cumulative distribution function of the feature and  the quantile function of the desired output distribution . This formula is using the two following facts: (i) if  is a random variable with a continuous cumulative distribution function  then  is uniformly distributed on ; (ii) if  is a random variable with uniform distribution on  then  has distribution . By performing a rank transformation, a quantile transform smooths out unusual distributions and is less influenced by outliers than scaling methods. It does, however, distort correlations and distances within and across features.

In [48]:
from sklearn.datasets import load_iris
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
iris = load_iris()

X, y= iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)

quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_quantile = quantile_transformer.fit_transform(X_train)
X_test_quantile = quantile_transformer.transform(X_test)

  % (self.n_quantiles, n_samples))


In [49]:
np.percentile(X_train[:, 0], [0, 25, 50, 75, 100]) 

array([4.3, 5.1, 5.8, 6.5, 7.9])

In [50]:
np.percentile(X_train_quantile[:, 0], [0, 25, 50, 75, 100])

array([0.        , 0.23873874, 0.50900901, 0.74324324, 1.        ])

In [51]:
quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)
X_train_quantile = quantile_transformer.fit(X_train)
X_test_quantile = quantile_transformer.transform(X_test)

  % (self.n_quantiles, n_samples))


In [0]:
quantile_transformer.quantiles_

In [55]:
"Normalize"


X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]
X_normalized = preprocessing.normalize(X, norm='l1')

X_normalized                                      


array([[ 0.25, -0.25,  0.5 ],
       [ 1.  ,  0.  ,  0.  ],
       [ 0.  ,  0.5 , -0.5 ]])

In [56]:
"Encoding categorical features"

enc = preprocessing.OneHotEncoder()

X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]

enc.fit(X)

OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [58]:
enc.transform([['female', 'from US', 'uses Safari']]).toarray()

array([[1., 0., 0., 1., 0., 1.]])

In [59]:
enc.transform([['female', 'from US', 'uses Firefox']]).toarray()

array([[1., 0., 0., 1., 1., 0.]])

In [60]:
enc.categories_

[array(['female', 'male'], dtype=object),
 array(['from Europe', 'from US'], dtype=object),
 array(['uses Firefox', 'uses Safari'], dtype=object)]

In [65]:
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)

OneHotEncoder(categorical_features=None,
              categories=[['female', 'male'],
                          ['from Africa', 'from Asia', 'from Europe',
                           'from US'],
                          ['uses Chrome', 'uses Firefox', 'uses IE',
                           'uses Safari']],
              drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [66]:
enc.categories_

[array(['female', 'male'], dtype=object),
 array(['from Africa', 'from Asia', 'from Europe', 'from US'], dtype=object),
 array(['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari'],
       dtype=object)]

In [68]:
enc.transform([['male', 'from Asia', 'uses Firefox']]).toarray()

array([[0., 1., 0., 1., 0., 0., 0., 1., 0., 0.]])

In [0]:
"""
Discretization (otherwise known as quantization or binning) provides a way to 
partition continuous features into discrete values. Certain datasets with 
continuous features may benefit from discretization, because discretization 
can transform the dataset of continuous attributes to one with only nominal 
attributes.
"""

X = np.array([[ -3., 5., 15 ],
              [  0., 6., 14 ],
              [  6., 3., 11 ]])


est = preprocessing.KBinsDiscretizer(n_bins=[3,2,2], encode='ordinal').fit(X)

In [70]:
est.transform(X)

array([[0., 1., 1.],
       [1., 1., 1.],
       [2., 0., 0.]])

In [0]:
"""
Feature binarization is the process of thresholding numerical features to get boolean values
"""

X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]

binarizer = preprocessing.Binarizer().fit(X)

In [72]:
binarizer.transform(X)

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [74]:
"""
Polynomial Features
"""

from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(2)

poly.fit_transform(X) # ( X1,X2 ) => ( 1, X1, X2, X1X2, X1X1, X2X2)

array([[ 1.,  1., -1.,  2.,  1., -1.,  2.,  1., -2.,  4.],
       [ 1.,  2.,  0.,  0.,  4.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  1., -1.,  0.,  0., -0.,  1., -1.,  1.]])

In [76]:
from sklearn.preprocessing import FunctionTransformer

transformer = FunctionTransformer(np.log1p, validate=True)
transformer.transform(X)

  return func(X, **(kw_args if kw_args else {}))


array([[0.69314718,       -inf, 1.09861229],
       [1.09861229, 0.        , 0.        ],
       [0.        , 0.69314718,       -inf]])

In [0]:
#Imputation of missing values

"""
SimpleImputer
Missing values can be imputed with a provided constant value, or using the 
statistics (mean, median or most frequent) of each column in which the missing 
values are located. 

"""

In [78]:
from sklearn.impute import SimpleImputer

imp = SimpleImputer(missing_values=np.nan, strategy='mean')

imp.fit([[1, 2], [np.nan, 3], [7, 6]])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [79]:
imp.transform([[1, 2], [np.nan, 3], [7, 6]])

array([[1., 2.],
       [4., 3.],
       [7., 6.]])

In [81]:
"""
IterativeImputer class, which models each feature with missing values as a 
function of other features, and uses that estimate for imputation. 
"""


from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import IterativeImputer

imp = IterativeImputer(max_iter=10, random_state=0)

imp.fit([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) 

IterativeImputer(add_indicator=False, estimator=None,
                 imputation_order='ascending', initial_strategy='mean',
                 max_iter=10, max_value=None, min_value=None,
                 missing_values=nan, n_nearest_features=None, random_state=0,
                 sample_posterior=False, tol=0.001, verbose=0)

In [82]:
imp.transform([[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]) 

array([[ 1.        ,  2.        ],
       [ 3.        ,  6.        ],
       [ 4.        ,  8.        ],
       [ 1.50000296,  3.        ],
       [ 7.        , 14.00004118]])

In [0]:
"""
In the statistics community, it is common practice to perform multiple imputations, 
generating, for example, m separate imputations for a single feature matrix. 
Each of these m imputations is then put through the subsequent analysis pipeline 
(e.g. feature engineering, clustering, regression, classification). 
The m final analysis results (e.g. held-out validation errors) allow the 
data scientist to obtain understanding of how analytic results may differ 
as a consequence of the inherent uncertainty caused by the missing values. 
The above practice is called multiple imputation.

"""

Random Projection

The sklearn.random_projection module implements a simple and computationally efficient way to reduce the dimensionality of the data by trading a controlled amount of accuracy (as additional variance) for faster processing times and smaller model sizes. This module implements two types of unstructured random matrix: Gaussian random matrix and sparse random matrix.



In [83]:
from sklearn import random_projection

X = np.random.rand(100, 10000)
X.shape

(100, 10000)

In [85]:
transformer = random_projection.GaussianRandomProjection()
X_new = transformer.fit_transform(X)
X_new.shape

(100, 3947)

In [86]:
tranformer = random_projection.SparseRandomProjection()

X_new = transformer.fit_transform(X)
X_new.shape

(100, 3947)

Kernel Approximation

In [0]:
"""
This submodule contains functions that approximate the feature mappings that 
correspond to certain kernels, as they are used for example in 
support vector machines (see Support Vector Machines). The following feature 
functions perform non-linear transformations of the input, which can serve as 
a basis for linear classification or other algorithms.


"""

In [102]:
from sklearn.kernel_approximation import RBFSampler

from sklearn.linear_model import SGDClassifier

X, y= iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y)
rbf_feature = RBFSampler(gamma=1, random_state=1)

X_features = rbf_feature.fit_transform(X_train)
X_test_feature = rbf_feature.transform(X_test)
clf = SGDClassifier(max_iter=5)

clf.fit(X_features, y_train)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [104]:
clf.score(X_test_feature, y_test)

1.0

In [134]:
"""
Pairwise metrics, Affinities and Kernels
"""

from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel, polynomial_kernel, sigmoid_kernel, laplacian_kernel, chi2_kernel

X = np.array([[2, 3], [3, 5], [5, 8]])


Y = np.array([[1, 0], [2, 1]])


X.shape, Y.shape

((3, 2), (2, 2))

In [118]:
pairwise_distances(X, Y, metric='manhattan')

array([[ 4.,  2.],
       [ 7.,  5.],
       [12., 10.]])

In [110]:
pairwise_distances(X, metric='manhattan')

array([[0., 3., 8.],
       [3., 0., 5.],
       [8., 5., 0.]])

In [122]:
pairwise_kernels(X, Y, metric='linear')

array([[ 2.,  7.],
       [ 3., 11.],
       [ 5., 18.]])

In [126]:
cosine_similarity(X, Y)

array([[0.5547002 , 0.86824314],
       [0.51449576, 0.84366149],
       [0.52999894, 0.85328183]])

In [128]:
linear_kernel(X,Y)

array([[ 2.,  7.],
       [ 3., 11.],
       [ 5., 18.]])

In [129]:
polynomial_kernel(X,Y)

array([[   8.   ,   91.125],
       [  15.625,  274.625],
       [  42.875, 1000.   ]])

In [130]:
sigmoid_kernel(X,Y)

array([[0.96402758, 0.99975321],
       [0.9866143 , 0.99999548],
       [0.9981779 , 1.        ]])

In [133]:
laplacian_kernel(X, Y)

array([[0.13533528, 0.36787944],
       [0.03019738, 0.082085  ],
       [0.00247875, 0.00673795]])

In [0]:
from sklearn.svm import SVC
from sklearn.metrics.pairwise import chi2_kernel

X, y= iris.data, iris.target


X_chi2 = chi2_kernel(X, gamma=.5)

X_train, X_test, y_train, y_test = train_test_split(X_chi2, y)


svm = SVC(kernel='precomputed').fit(X_chi2, y)


In [144]:
X_test_chi2 = chi2_kernel(X_test, gamma=.5)
svm.score(X_chi2, y)

0.9733333333333334

In [145]:
#Label Encoding

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit([1, 2, 2, 6])

LabelEncoder()

In [146]:
le.transform([1, 1, 2, 6])

array([0, 0, 1, 2])

In [0]:
le.transform([1, 1, 2, 5]) # 5 is missing gives error

In [150]:
le.inverse_transform([0, 0, 1, 2])

array([1, 1, 2, 6])