In [None]:
# libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# load the iris measurements (X) and species labels (Y),
# then hold out 33% of the rows as a test set
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
X, Y = iris.data, iris.target
# random_state=1 makes the split identical on every run
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.33, random_state=1
)

In [None]:
####################################################
# Kmeans Clustering
####################################################

In [None]:
# kmeans
from sklearn.cluster import KMeans
# random_state fixes the random centroid initialisation so a re-run gives the
# same clusters; n_init is pinned because its default differs between
# scikit-learn releases.
f = KMeans(n_clusters=3, n_init=10, random_state=1)
f.fit(X)
# A notebook only renders the last expression of a cell, so the three
# attribute look-ups below are shown for reference and print nothing.
f.labels_            # cluster index assigned to each row of X
f.cluster_centers_   # coordinates of the three centroids
f.inertia_           # sum of squared distances to the closest centroid
# rows: cluster index, columns: iris target class
pd.crosstab(f.labels_,Y)

In [None]:
# determine the number of clusters:
# record the inertia of a KMeans fit for every k from 2 to 29
inertia_list = []
for k in range(2,30):
    # fixed seed (and pinned n_init) so the curve is the same on every run
    f = KMeans(n_clusters=k, n_init=10, random_state=1)
    f.fit(X)
    inertia_list.append(f.inertia_)

In [None]:
# Elbow plot: inertia_list holds one inertia value per k in 2..29;
# look for the k after which the curve stops dropping quickly.
plt.plot(np.arange(2,30),inertia_list)
plt.xlabel('number of clusters k')
plt.ylabel('inertia')
# the trailing semicolon keeps the text repr of the last call out of the output
plt.title('KMeans inertia vs. number of clusters');

In [None]:
# refit with 5 clusters and tabulate the clusters against the iris classes
# (seed and n_init fixed so the table is reproducible)
f = KMeans(n_clusters=5, n_init=10, random_state=1)
f.fit(X)
pd.crosstab(f.labels_,Y)

In [None]:
# after scaling & PCA
# step 1: standardise every feature column of X
from sklearn.preprocessing import StandardScaler
f = StandardScaler()
X_s = f.fit_transform(X)

In [None]:
from sklearn.decomposition import PCA
# step 2: PCA on the standardised data; no n_components given, so all
# components are kept
f = PCA().fit(X_s)
X_st = f.transform(X_s)


In [None]:
# KMeans on the PCA scores of the standardised data
# (seed and n_init fixed so the table is reproducible)
f = KMeans(n_clusters=3, n_init=10, random_state=1)
f.fit(X_st)
pd.crosstab(f.labels_,Y)

In [None]:
# using the first component
# project the standardised data onto a single principal component
from sklearn.decomposition import PCA
f = PCA(n_components=1).fit(X_s)
X_st = f.transform(X_s)

In [None]:
# KMeans on the one-dimensional PCA projection
# (seed and n_init fixed so the table is reproducible)
f = KMeans(n_clusters=3, n_init=10, random_state=1)
f.fit(X_st)
pd.crosstab(f.labels_,Y)

In [None]:
####################################################
# Kmeans Clustering with Train/Test
####################################################

In [None]:
# kmeans
# fit on the training split only; the test split is scored in the next cell
from sklearn.cluster import KMeans
# seed and n_init fixed so the cluster numbering is the same on every run
f = KMeans(n_clusters=3, n_init=10, random_state=1)
f.fit(xtrain)
f.labels_   # cluster index per training row (not rendered: not the last expression)
pd.crosstab(f.labels_,ytrain)

In [None]:
# assign every held-out row to a cluster of the model fitted on xtrain,
# then tabulate predicted cluster (rows) against the test labels (columns)
yhat_test = f.predict(xtest)
pd.crosstab(index=yhat_test, columns=ytest)

In [None]:
####################################################
# Gaussian Mixture
####################################################

In [None]:
from sklearn.mixture import GaussianMixture
# The mixture parameters are initialised with a random seed; fixing
# random_state keeps the fitted components the same between runs.
f = GaussianMixture(n_components=3, random_state=1)
f.fit(X)
f.means_         # component means (not rendered: not the last expression)
f.covariances_   # component covariances (rendered as the cell output)


In [None]:
# component index predicted for every row, tabulated against the iris classes
yhat = f.predict(X)
pd.crosstab(index=yhat, columns=Y)

In [None]:
# after scaling
# standardise every feature column of X
from sklearn.preprocessing import StandardScaler
f = StandardScaler()
X_s = f.fit_transform(X)

In [None]:
# Gaussian mixture on the standardised features
# (random_state fixed so the table is reproducible)
f = GaussianMixture(n_components=3, random_state=1)
f.fit(X_s)
pd.crosstab(f.predict(X_s),Y)

In [None]:
# KMeans with scaling: compare the clusters found on the raw features with
# those found on the standardised features
# (seed and n_init fixed so both tables are reproducible)
f = KMeans(n_clusters=3, n_init=10, random_state=1)
f.fit(X)
# Only a cell's last expression is rendered, so the raw-feature table has to
# be shown explicitly (display is provided by the IPython/Jupyter kernel).
display(pd.crosstab(f.labels_,Y))
f.fit(X_s)
pd.crosstab(f.labels_,Y)

In [None]:
####################################################
# Hierarchical Clustering
####################################################

In [None]:
from sklearn.cluster import AgglomerativeClustering
# agglomerative clustering into three clusters with the default settings
f = AgglomerativeClustering(n_clusters=3).fit(X)
pd.crosstab(index=f.labels_, columns=Y)

In [None]:
# different options: cosine distance with complete linkage
# The distance keyword was renamed from `affinity` to `metric` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older installations.
try:
    f = AgglomerativeClustering(n_clusters=3,metric='cosine',linkage='complete')
except TypeError:
    f = AgglomerativeClustering(n_clusters=3,affinity='cosine',linkage='complete')
f.fit(X)
pd.crosstab(f.labels_,Y)

In [None]:
####################################################
# DBSCAN
####################################################

In [None]:
from sklearn.cluster import DBSCAN
# density-based clustering on the raw features
f = DBSCAN(eps=0.5, min_samples=5).fit(X)
f.labels_   # cluster index per row, -1 marks noise (not rendered: not the last expression)
pd.crosstab(index=f.labels_, columns=Y)

In [None]:
# using the first two components
# project the standardised data onto two principal components
from sklearn.decomposition import PCA
f = PCA(n_components=2).fit(X_s)
X_st = f.transform(X_s)

In [None]:
# DBSCAN on the two-component PCA scores, tabulated against the iris classes
f = DBSCAN(eps=0.5, min_samples=5).fit(X_st)
pd.crosstab(index=f.labels_, columns=Y)

In [None]:
# One colour per point for the clustering held in f: black for noise
# (label -1), then blue/red/green/yellow for clusters 0, 1, 2, 3.
col_list = ['k','b','r','g','y']
# Clusters beyond the fourth cycle through the non-black colours again
# instead of indexing past the end of col_list (IndexError); black stays
# reserved for noise.
col = [
    col_list[0] if label < 0 else col_list[1 + label % (len(col_list) - 1)]
    for label in f.labels_
]

In [None]:
# scatter of the two PCA scores, one colour per point taken from col
d = pd.DataFrame(X_st)
d.columns = ['pc1', 'pc2']
d.plot.scatter(x='pc1', y='pc2', c=col)

In [None]:
####################################################
# Clustering Practices
####################################################

In [None]:
# Cluster the Boston data set using only X (the features).
# Which method and how many clusters will you use?
# Can you find a relation between the clusters and medv?

In [None]:
# read data
# note: this rebinds X and Y, which held the iris data in the cells above
df = pd.read_csv(
    'https://raw.githubusercontent.com/flowertoman/data/main/data01_boston.csv'
)
# every column except the last one is used as a feature
# NOTE(review): assumes medv is the last column of the CSV, otherwise X
# still contains the target — confirm against the file
X = df.iloc[:, :-1]
Y = df.loc[:, 'medv']