### Scikit-learn helpers!

In [1]:
# load libraries

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split


In [2]:
# sample data
# Draw the features from a locally seeded generator so X (and every result
# derived from it) is identical on each Restart & Run All, without touching
# NumPy's global random state.
rng = np.random.RandomState(0)
X = rng.random_sample((10, 5))
# One class label per row of X.
y = np.array(['M','M','F','F','F','M','M','F','F','F'])

In [3]:
# training & test data
# Partition the rows of X and the matching labels of y; the fixed
# random_state keeps the partition the same on every run.
split_parts = train_test_split(X, y, random_state=43)
X_train, X_test, y_train, y_test = split_parts

In [4]:
# Zero out, in place, every entry of X that is below 0.7. Only X changes:
# the X_train values displayed in later cells still include entries < 0.7.
np.putmask(X, X < 0.7, 0)

In [5]:
# Standardization
# Build the scaler, then learn its statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [6]:
# Apply the fitted scaler to the training rows and display the result.
standardized_X = scaler.transform(X_train)
standardized_X

array([[ 1.27713542, -1.28258223,  1.60610139, -0.3992335 ,  1.06484074],
       [-1.04584866, -1.22885428, -1.00447648,  0.05127827, -0.59169831],
       [ 0.99392064,  1.10301796,  0.0982004 , -1.96394262,  1.27628   ],
       [-0.77414352, -0.51061   ,  0.15957133,  0.0424027 ,  0.82369127],
       [ 0.41639209,  1.10695622,  1.13454313,  1.67367334, -0.05890516],
       [ 0.60027562,  1.04339241, -1.22199537,  0.31257874, -1.40636452],
       [-1.4677316 , -0.23132008, -0.77194441,  0.28324307, -1.10784402]])

In [7]:
# Reuse the scaler fitted on X_train for the held-out rows (transform only,
# no refit on the test data).
standardized_X_test = scaler.transform(X_test)
standardized_X_test

array([[-1.66951649,  1.13929561, -0.01584555, -1.172049  ,  0.70937363],
       [-0.53914127,  0.29640801, -0.15753619, -0.62501304,  1.59529352],
       [ 0.56486291,  0.27292718, -0.80673409, -1.39578351,  0.69276815]])

In [8]:
# Normalization
# NOTE: this rebinds `scaler`, which referred to the StandardScaler until
# now; re-running the standardization cells after this one would use the
# Normalizer instead.
scaler = Normalizer()
scaler.fit(X_train)
scaler

Normalizer(copy=True, norm='l2')

In [9]:
# Apply the Normalizer to the training rows; each row of the displayed
# result has unit L2 norm (the estimator's repr shows norm='l2').
normalized_X = scaler.transform(X_train)
normalized_X

array([[0.60648217, 0.13422586, 0.61306723, 0.2854417 , 0.39602323],
       [0.26976913, 0.30379538, 0.41712636, 0.74645313, 0.32209674],
       [0.55489728, 0.60028419, 0.37817087, 0.04370555, 0.43223202],
       [0.25652416, 0.39349767, 0.5336034 , 0.49250384, 0.50205809],
       [0.38434269, 0.53157351, 0.48179272, 0.54285645, 0.20710614],
       [0.54269846, 0.67558257, 0.19044738, 0.45961623, 0.03936097],
       [0.07238122, 0.58622407, 0.41023461, 0.6815496 , 0.13527751]])

In [10]:
# Apply the same Normalizer to the held-out rows.
normalized_X_test = scaler.transform(X_test)
normalized_X_test

array([[6.35523608e-06, 7.54647884e-01, 4.47110341e-01, 2.08515063e-01,
        4.32574135e-01],
       [2.89927553e-01, 5.45691780e-01, 4.14953897e-01, 3.12141977e-01,
        5.90375427e-01],
       [6.00764606e-01, 5.66148916e-01, 3.00314770e-01, 1.71321062e-01,
        4.46113691e-01]])

In [11]:
# Binarization
# Threshold at 0.0 on the full (already thresholded) X.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binarizer

Binarizer(copy=True, threshold=0.0)

In [12]:
# Turn X into a 0/1 matrix using the binarizer's threshold (0.0), so the
# entries that were zeroed out earlier come out as 0 and the rest as 1.
binary_X = binarizer.transform(X)
binary_X

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0.],
       [1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]])

In [13]:
# Encoding Categorical Features
# Create the encoder; it is not fitted in this cell.
enc = LabelEncoder()
enc

LabelEncoder()

In [14]:
# Fit the encoder on the string labels and rebind y to the integer codes
# (in the output below 'F' -> 0 and 'M' -> 1). y_train / y_test were split
# off earlier and are not re-encoded by this cell.
y = enc.fit_transform(y)
y

array([1, 1, 0, 0, 0, 1, 1, 0, 0, 0], dtype=int64)

In [15]:
# Imputing Missing Values
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22. Its replacement, `sklearn.impute.SimpleImputer`, always
# imputes column-wise, which is what the old `axis=0` argument selected.
# Fall back to the legacy class only on releases that predate SimpleImputer.
# NOTE: on scikit-learn >= 0.22 the `from sklearn.preprocessing import Imputer`
# line in the first cell also has to go for the notebook to import cleanly.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    imp = Imputer(missing_values=0, strategy='mean', axis=0)
else:
    imp = SimpleImputer(missing_values=0, strategy='mean')
imp

Imputer(axis=0, copy=True, missing_values=0, strategy='mean', verbose=0)

In [16]:
# Fit the imputer on the training rows and replace entries equal to its
# `missing_values` (0) with the column mean.
# NOTE(review): the output below still shows X_train's values under 0.7, i.e.
# X_train contains no zeros (only X was thresholded), so nothing is actually
# imputed here -- confirm whether X was the intended input.
imp.fit_transform(X_train)

array([[0.90214356, 0.19966126, 0.91193886, 0.42459516, 0.58908543],
       [0.19094735, 0.21503173, 0.2952494 , 0.5283527 , 0.22798576],
       [0.81543557, 0.88213279, 0.55573165, 0.06422642, 0.63517588],
       [0.27413159, 0.42050676, 0.57022913, 0.52630856, 0.5365186 ],
       [0.63862157, 0.88325944, 0.80054398, 0.90200712, 0.34412635],
       [0.69491867, 0.86507513, 0.24386552, 0.58853291, 0.05040123],
       [0.06178523, 0.50040591, 0.35017979, 0.5817766 , 0.11547405]])

In [17]:
# Generating Polynomial Features
# Degree-5 expansion; the degree is spelled out by keyword for readability.
poly = PolynomialFeatures(degree=5)
poly

PolynomialFeatures(degree=5, include_bias=True, interaction_only=False)

In [18]:
# Fit on X and expand it into polynomial terms up to degree 5; the leading
# column of ones in the output is the bias term (include_bias=True).
poly.fit_transform(X)

array([[1.        , 0.        , 0.86507513, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.88325944, ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.90214356, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.17380818]])

### Model creation

In [19]:
## Supervised Learning Estimators
## Linear Regression
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. It is dropped
# here: ordinary least squares predictions do not depend on feature scaling
# (for a full-rank design), so the fitted model is unaffected.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

## Support Vector Machine (linear kernel)
from sklearn.svm import SVC
svc = SVC(kernel='linear')

## Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

## k-Nearest Neighbours
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

## Unsupervised Learning Estimators
## Principal Component Analysis (PCA)
# A float n_components asks PCA to keep enough components to explain that
# fraction (95%) of the variance.
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)

## K-Means clustering (fixed random_state for repeatable centroids)
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)


### Model fitting

In [20]:
# Fit every estimator on the training split only, so the predictions made
# for X_test in the next section are genuinely out-of-sample.
# y_train still holds the original string labels; the regressor needs
# numeric targets, so reuse the LabelEncoder that was fitted on y.
lr.fit(X_train, enc.transform(y_train))
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)
k_means.fit(X_train)
# fit_transform returns the projected training data (not the estimator);
# the fitted PCA object itself remains available as `pca`.
pca_model = pca.fit_transform(X_train)

### Prediction

In [21]:
# Keep each model's output under its own name instead of overwriting a
# single variable four times.
y_pred_lr = lr.predict(X_test)                            # regression estimates
y_pred_svc_random = svc.predict(np.random.random((2,5)))  # labels for 2 random rows
y_proba_knn = knn.predict_proba(X_test)                   # per-class probabilities
y_pred_kmeans = k_means.predict(X_test)                   # cluster ids
# `y_pred` is what the evaluation cells read; as before, it ends up holding
# the KMeans cluster ids for X_test.
y_pred = y_pred_kmeans

### Evaluation

In [22]:
# Evaluate Your Model's Performance

from sklearn.model_selection import cross_val_score

# Cross-Validation
# First the k-NN classifier with 4 folds on the training split, then the
# linear regression with 2 folds on the full X / y; one line of fold scores
# is printed per estimator.
for estimator, features, targets, n_folds in (
    (knn, X_train, y_train, 4),
    (lr, X, y, 2),
):
    print(cross_val_score(estimator, features, targets, cv=n_folds))

[0.5 0.5 0.5 0. ]
[-1.77229233  0.02745988]




In [None]:
# Regression Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Illustrative numeric targets, one per row of X_test.
y_true = [3, -0.5, 2]

# All three metrics compare y_true with y_pred. y_test holds string labels
# at this point, so it cannot serve as a regression target.
# NOTE(review): y_pred is whatever the Prediction cell assigned last (the
# KMeans cluster ids), so these numbers are illustrative only.
# Each result is printed because a cell displays only its last expression.

# Mean Absolute Error
print(mean_absolute_error(y_true, y_pred))

# Mean Squared Error
print(mean_squared_error(y_true, y_pred))

# R2 Score
print(r2_score(y_true, y_pred))

In [None]:
# Accuracy Score
from sklearn.metrics import accuracy_score
# Accuracy needs predicted class labels for the same rows as y_test. y_pred
# holds KMeans cluster ids at this point, which are not comparable with the
# 'M'/'F' labels, so score the fitted classifier's predictions instead.
svc_test_pred = svc.predict(X_test)
accuracy_score(y_test, svc_test_pred)


In [None]:
# Both reports need predicted class labels for the rows of X_test. y_pred
# holds KMeans cluster ids at this point, which do not share a label space
# with the 'M'/'F' values in y_test, so use the fitted classifier instead.
svc_test_pred = svc.predict(X_test)

# Classification Report
from sklearn.metrics import classification_report
print(classification_report(y_test, svc_test_pred))

# Confusion Matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, svc_test_pred))

In [None]:
# Clustering Metrics
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import v_measure_score

# Each score compares the reference labels in y_true with the cluster ids in
# y_pred. Results are printed because a cell displays only its last
# expression.

# Adjusted Rand Index
print(adjusted_rand_score(y_true, y_pred))

# Homogeneity
print(homogeneity_score(y_true, y_pred))

# V-measure
# Call the imported function directly: no `metrics` module is in scope, so
# `metrics.v_measure_score` would raise NameError.
print(v_measure_score(y_true, y_pred))