In [2]:
import numpy as np
import pandas as pd

# The Data

Here I use iris data. We can load the data from sklearn library.

In [4]:
from sklearn.datasets import load_iris
iris=load_iris()
iris_data = pd.DataFrame(iris.data)
iris_data.columns = iris.feature_names
iris_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


# Data Preprocessing

I use standard scaling in the preprocessing process. In this step, it is important to save the parameter of transformation since we have to preprocess the new data with the same treatment.

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(iris_data.iloc[:, 0:4])
print(scaler.mean_)
print(scaler.var_)

[5.84333333 3.054      3.75866667 1.19866667]
[0.68112222 0.18675067 3.09242489 0.57853156]


# Modeling

I use simple KMeans method with 3 clusters. I the real world problem it is necessary to optimize your model by choosing the best clustering method and also tuning its hyperparameters. 
I also skip the validation step. 

In [11]:
from sklearn.cluster import KMeans
iris_clustering = KMeans(3, random_state=10).fit(scaler.transform(iris_data.iloc[:, 0:4]))

In [17]:
iris_data['cluster'] = iris_clustering.labels_
iris_data.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),cluster
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [18]:
iris_clustering.predict(iris_data.iloc[:, 0:4])

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Result

In [21]:
iris_clustering.cluster_centers_

array([[-1.01457897,  0.84230679, -1.30487835, -1.25512862],
       [-0.01139555, -0.87288504,  0.37688422,  0.31165355],
       [ 1.16743407,  0.15377779,  1.00314548,  1.02963256]])

The result from the clustering model is the center of each cluster. We use these points to 'classify' new data.

## Saving the scaler and the model into .pkl files

In [15]:
from sklearn.externals import joblib
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(iris_clustering, 'iris_clustering.pkl')

['iris_clustering.pkl']