# UNSUPERVISED LEARNING MODELS

In [42]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

In [43]:
# define number of samples
n_samples = 100

# seed shared by the data generator and KMeans so that a
# Restart & Run All reproduces the same dataset and the same labels
random_state = 20

# generate a synthetic dataset with 5 features
# (the seed is passed explicitly; random_state=None would give different data on every run)
X, y = make_blobs(n_samples=n_samples, n_features=5, random_state=random_state)

# build a KMeans model with 3 clusters, fit it on the features
# and get the cluster index assigned to every sample
predict_y = KMeans(n_clusters=3, random_state=random_state).fit_predict(X)

# display the predicted cluster label of each sample
predict_y


array([0, 1, 1, 2, 1, 2, 1, 1, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2,
       1, 2, 1, 0, 0, 1, 2, 0, 1, 1, 0, 0, 1, 2, 0, 1, 0, 2, 2, 0, 1, 1,
       0, 0, 0, 1, 0, 2, 2, 1, 2, 2, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2, 1,
       2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 2, 1, 1, 0, 2, 0, 2, 1, 0, 1, 0, 1,
       1, 2, 1, 1, 1, 0, 0, 0, 2, 0, 2, 0])

# REDUCING DIMENSIONS USING PCA

In [44]:
from sklearn.decomposition import PCA

In [45]:
from sklearn.datasets import make_blobs

In [46]:
# number of samples and random seed for the PCA demo dataset
n_sample, random_state = 20, 20

In [47]:
# generate the data with 10 features; the seed defined above is passed
# explicitly so the dataset (and every PCA result below) is reproducible
X, y = make_blobs(n_samples=n_sample, n_features=10, random_state=random_state)

In [48]:
# view the shape of the dataset as (number of samples, number of features)
X.shape

(20, 10)

In [49]:
# define PCA estimator with the number of reduced components
# (3 components will be kept out of the 10 original features)
pca = PCA(n_components=3)

In [50]:
# fit the PCA estimator on the data: this learns the components from X;
# X itself is only projected later, when transform() is called
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [51]:
# share of the total variance explained by each of the 3 retained components
pca.explained_variance_ratio_

array([0.62779063, 0.34690182, 0.00597743])

In [52]:
# first PCA component: one weight per original feature (10 values)
first_pca = pca.components_[0]

In [53]:
# display the weights of the first principal component
first_pca

array([-0.38697459,  0.13753083,  0.23535353,  0.2586474 , -0.0758717 ,
       -0.33997798,  0.37480649, -0.49793111, -0.00474766, -0.4463931 ])

In [54]:
# project X onto the fitted components to apply the dimensionality reduction
pca_reduced = pca.transform(X)

In [55]:
# view the reduced shape: same number of samples, 3 features instead of 10
pca_reduced.shape

(20, 3)

In [56]:
# output: the number of features has been reduced from 10 to 3

# BUILDING A PIPELINE

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

In [58]:
# chain the estimators: every step is a (name, estimator) pair,
# listed in the order in which the pipeline will apply them
estimator = [
    ('dim_reduction', PCA()),
    ('logres_model', LogisticRegression()),
]

In [59]:
# display the list of (name, estimator) steps defined above
estimator

[('dim_reduction',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('logres_model',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [60]:
# put them in a pipeline object; the steps keep the order of the `estimator` list
# (PCA first, logistic regression second)
pipeline_estimator = Pipeline(steps=estimator)

In [61]:
# check the chain of estimators held by the pipeline
pipeline_estimator

Pipeline(memory=None,
     steps=[('dim_reduction', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logres_model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [62]:
# view the first step: the ('dim_reduction', PCA) pair
pipeline_estimator.steps[0]

('dim_reduction',
 PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
   svd_solver='auto', tol=0.0, whiten=False))

In [63]:
# view the second step: the ('logres_model', LogisticRegression) pair
pipeline_estimator.steps[1]

('logres_model',
 LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False))

In [64]:
# view all the steps in the pipeline, as a list of (name, estimator) pairs

pipeline_estimator.steps

[('dim_reduction',
  PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)),
 ('logres_model',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

# MODEL PERSISTENCE

In [65]:
from sklearn.datasets import load_iris

In [66]:
# load the iris dataset bundled with scikit-learn
iris_ds = load_iris()

In [67]:
# names of the four measured features (the columns of iris_ds.data)
iris_ds.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [68]:
# class label (0, 1 or 2) of every sample
iris_ds.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [69]:
# split the dataset into the feature matrix and the class labels
X_feature, Y_target = iris_ds.data, iris_ds.target

In [70]:
# two hand-made samples to classify: one row per sample, four feature values each
X_new = [
    [3, 3, 3, 3],
    [4, 4, 4, 4],
]

In [71]:
# logistic regression classifier with default hyper-parameters
logreg = LogisticRegression()

In [72]:
# train the classifier on the whole iris dataset (no train/test split is made here)
logreg.fit(X_feature, Y_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
# predict the class of the two hand-made samples
logreg.predict(X_new)

array([2, 2])

In [74]:
# for model persistence
import pickle as pkl

In [75]:
# use pickle's dumps() to serialize the fitted model into an in-memory bytes object
persist_model = pkl.dumps(logreg)

In [76]:
# display the raw pickled bytes (long and not human-readable)
persist_model

b'\x80\x03csklearn.linear_model.logistic\nLogisticRegression\nq\x00)\x81q\x01}q\x02(X\x07\x00\x00\x00penaltyq\x03X\x02\x00\x00\x00l2q\x04X\x04\x00\x00\x00dualq\x05\x89X\x03\x00\x00\x00tolq\x06G?\x1a6\xe2\xeb\x1cC-X\x01\x00\x00\x00Cq\x07G?\xf0\x00\x00\x00\x00\x00\x00X\r\x00\x00\x00fit_interceptq\x08\x88X\x11\x00\x00\x00intercept_scalingq\tK\x01X\x0c\x00\x00\x00class_weightq\nNX\x0c\x00\x00\x00random_stateq\x0bNX\x06\x00\x00\x00solverq\x0cX\t\x00\x00\x00liblinearq\rX\x08\x00\x00\x00max_iterq\x0eKdX\x0b\x00\x00\x00multi_classq\x0fX\x03\x00\x00\x00ovrq\x10X\x07\x00\x00\x00verboseq\x11K\x00X\n\x00\x00\x00warm_startq\x12\x89X\x06\x00\x00\x00n_jobsq\x13K\x01X\x08\x00\x00\x00classes_q\x14cnumpy.core.multiarray\n_reconstruct\nq\x15cnumpy\nndarray\nq\x16K\x00\x85q\x17C\x01bq\x18\x87q\x19Rq\x1a(K\x01K\x03\x85q\x1bcnumpy\ndtype\nq\x1cX\x02\x00\x00\x00i4q\x1dK\x00K\x01\x87q\x1eRq\x1f(K\x03X\x01\x00\x00\x00<q NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00tq!b\x89C\x0c\x00\x00\x00\x00\x01\x00\x00\x00\x02

In [77]:
# persist the model to file
# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn,
# so prefer the standalone `joblib` package and only fall back to the
# bundled copy on old scikit-learn installations
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [78]:
# write the fitted model to disk; the call returns the list of file names written
joblib.dump(logreg, 'ml_unsupervised_models_1.pkl')

['ml_unsupervised_models_1.pkl']

In [79]:
# restore the model from the file written by joblib.dump
# NOTE: loading unpickles the file, so only load files from a trusted source
new_logreg_estimator = joblib.load('ml_unsupervised_models_1.pkl')

In [80]:
# display the restored estimator and its parameters
new_logreg_estimator

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [81]:
# predict with the restored model on the same two samples used with the original model
new_logreg_estimator.predict(X_new)

array([2, 2])

In [82]:
# output: the reloaded model makes the same predictions as the original one