Unsupervised Learning

Algoritma unsupervised machine learning digunakan untuk mengelompokkan data tidak terstruktur berdasarkan kesamaan dan perbedaan pola di dalam kumpulan data.

# Principal Component Analysis

Principal Component Analysis (PCA) adalah teknik linear dimensionality reduction yang dapat digunakan untuk mengekstraksi informasi dari ruang dimensi tinggi dengan memproyeksikannya ke dalam sub-ruang berdimensi lebih rendah. PCA mencoba untuk mempertahankan bagian penting yang memiliki lebih banyak variasi data dan menghapus bagian yang tidak penting dengan variasi yang lebih sedikit.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy 3x3 dataset: rows are samples, columns are features.
# A plain ndarray is used instead of np.matrix, which NumPy no longer
# recommends; pd.DataFrame(data) yields the same table either way.
data = np.array([[1, 2, 4],
                 [4, 1, 2],
                 [5, 4, 8]])

In [3]:
df = pd.DataFrame(data)

In [4]:
df

Unnamed: 0,0,1,2
0,1,2,4
1,4,1,2
2,5,4,8


## Manual Calculation

In [5]:
# Z-score every column: subtract the column mean, then divide by the
# column standard deviation as computed by DataFrame.std().
standard_data = df.sub(df.mean()).div(df.std())
print(standard_data)

          0         1         2
0 -1.120897 -0.218218 -0.218218
1  0.320256 -0.872872 -0.872872
2  0.800641  1.091089  1.091089


In [6]:
# Covariance matrix of the standardized features.
# rowvar=False treats each column as a variable (same as passing the
# transpose); bias=True normalises by N instead of N - 1.
covarance = np.cov(standard_data, rowvar=False, bias=True)
print(covarance)

[[0.66666667 0.27954262 0.27954262]
 [0.27954262 0.66666667 0.66666667]
 [0.27954262 0.66666667 0.66666667]]


In [7]:
# Eigen-decomposition of the covariance matrix.
eigenvalue, eigenvectors = np.linalg.eig(covarance)

# np.linalg.eig returns eigenvalues in no guaranteed order, but the later
# cells keep the first n_components columns as the principal axes.
# Sort by descending eigenvalue so that selection is always correct.
order = np.argsort(eigenvalue)[::-1]
eigenvalue = eigenvalue[order]
eigenvectors = eigenvectors[:, order]

print(eigenvalue, eigenvectors)

[ 1.51710663e+00  4.82893369e-01 -9.20271082e-33] [[-4.21537439e-01 -9.06810999e-01 -5.84783617e-17]
 [-6.41212207e-01  2.98071982e-01 -7.07106781e-01]
 [-6.41212207e-01  2.98071982e-01  7.07106781e-01]]


In [8]:
# Find PCA
n_components = 2

In [9]:
# Project the standardized samples onto all eigenvectors
# (one output column per eigenvector).
pca_manual = np.array(standard_data) @ eigenvectors
print(pca_manual)

[[ 7.52348033e-01  8.86352520e-01  2.77555756e-17]
 [ 9.84391775e-01 -8.10769054e-01 -1.11022302e-16]
 [-1.73673981e+00 -7.55834658e-02  2.22044605e-16]]


In [10]:
pca_manual  = pca_manual[:,:n_components]
print(pca_manual)

[[ 0.75234803  0.88635252]
 [ 0.98439178 -0.81076905]
 [-1.73673981 -0.07558347]]


In [11]:
# Recap of every intermediate result of the manual PCA, rounded for
# readability. Each entry is printed as: title, value, blank line.
summary = [
    ('Standardized data', standard_data.round(2)),
    ('Covariance', covarance.round(2)),
    ('eigen_value', eigenvalue.round(4)),
    ('eigen_vector', eigenvectors.round(4)),
    ('PCA manually calculated', pca_manual.round(2)),
]

for title, values in summary:
    print(title)
    print(values)
    print('')

Standardized data
      0     1     2
0 -1.12 -0.22 -0.22
1  0.32 -0.87 -0.87
2  0.80  1.09  1.09

Covariance
[[0.67 0.28 0.28]
 [0.28 0.67 0.67]
 [0.28 0.67 0.67]]

eigen_value
[ 1.5171  0.4829 -0.    ]

eigen_vector
[[-0.4215 -0.9068 -0.    ]
 [-0.6412  0.2981 -0.7071]
 [-0.6412  0.2981  0.7071]]

PCA manually calculated
[[ 0.75  0.89]
 [ 0.98 -0.81]
 [-1.74 -0.08]]



## Menggunakan SKlearn

In [None]:
pip install sklearn

In [None]:
from sklearn.decomposition import PCA

In [None]:
# PCA
pca_sklearn = (PCA(n_components).fit_transform(standard_data))

print('PCA - sklearn')
print(pca_sklearn.round(2))

In [None]:
print('PCA manually calculated')
print(pca_manual.round(2))

## Menggunakan Python (Iris Dataset)

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt

In [None]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

df = pd.read_csv(url, names=['sepal length', 'sepal width' , 'petal lenght', 'petal width', 'target'])

In [None]:
df

**Standardisasi Data**

In [None]:
from sklearn.preprocessing import StandardScaler

features = ['sepal length','sepal width','petal lenght','petal width']

In [None]:
# feature
x = df.loc[:, features].values

# target
y = df.loc[:, ['target']].values

In [None]:
# scale the features
x = StandardScaler().fit_transform(x)
x

In [None]:
# PCA untuk dua Komponen Utama
pca = PCA(n_components=2)

principalComponents = pca.fit_transform(x)

principalDataframe = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])

In [None]:
principalDataframe

In [None]:
targetDataframe = df[['target']]

newDataframe = pd.concat([principalDataframe, targetDataframe],axis = 1)

In [None]:
newDataframe

In [None]:
percent_variance = np.round(pca.explained_variance_ratio_* 100, decimals =2)
columns = ['PC1', 'PC2']

In [None]:
list(percent_variance)

In [None]:
# Scree plot: percentage of variance explained by each retained component.
# Bar positions follow the length of percent_variance instead of a
# hard-coded range(1, 3).
positions = list(range(1, len(percent_variance) + 1))
plt.bar(x=positions, height=list(percent_variance), tick_label=columns)
plt.ylabel('Percentage of Variance Explained')
plt.xlabel('Principal Component (PC)')
plt.title('PCA Scree Plot')
plt.show()

In [None]:
plt.scatter(principalDataframe.PC1, principalDataframe.PC2)
plt.title('PC1 against PC2')
plt.xlabel('PC1')
plt.ylabel('PC2')

In [None]:
# Scatter of PC1 against PC2 with one colour per iris species.
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_title('Plot of PC1 vs PC2', fontsize=20)
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')

targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
# one distinct colour per target class
colors = ['r', 'g', 'b']

for species, shade in zip(targets, colors):
    # boolean mask selecting the rows that belong to this species
    mask = newDataframe['target'] == species
    ax.scatter(
        newDataframe.loc[mask, 'PC1'],
        newDataframe.loc[mask, 'PC2'],
        c=shade,
        s=50,
    )

ax.legend(targets)
ax.grid()

In [None]:
pca.explained_variance_ratio_

## Menggunakan Python (Breast Cancer Dataset)

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
breast = load_breast_cancer()
breast_data = breast.data
breast_data.shape

In [None]:
breast_labels = breast.target
breast_labels.shape

In [None]:
import numpy as np
import pandas as pd 

In [None]:
# Turn the 1-D label vector into a single column so it can be concatenated
# with the feature matrix. -1 lets NumPy infer the row count instead of
# hard-coding 569 samples.
labels = np.reshape(breast_labels, (-1, 1))

In [None]:
final_breast_data = np.concatenate([breast_data,labels],axis=1)
final_breast_data.shape

In [None]:
breast_dataset = pd.DataFrame(final_breast_data)
features = breast.feature_names
features

In [None]:
features_labels = np.append(features,'label')

In [None]:
breast_dataset.columns = features_labels
breast_dataset.head()

In [None]:
# scikit-learn encodes this dataset's target as 0 = malignant, 1 = benign
# (breast.target_names is ['malignant', 'benign']), so map accordingly.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on a column selection is chained assignment
# and is not guaranteed to modify the DataFrame in recent pandas versions.
breast_dataset['label'] = breast_dataset['label'].replace(
    {0: 'Malignant', 1: 'Benign'}
)
breast_dataset.tail()

**Standardisasi Data**

In [None]:
from sklearn.preprocessing import StandardScaler
x = breast_dataset.loc[:, features].values
x = StandardScaler().fit_transform(x) # normalizing the features
x.shape

In [None]:
feat_cols = ['feature'+str(i) for i in range(x.shape[1])]
normalised_breast = pd.DataFrame(x,columns=feat_cols)
normalised_breast.tail()

In [None]:
from sklearn.decomposition import PCA
pca_breast = PCA(n_components=2)
principalComponents_breast = pca_breast.fit_transform(x)
principalComponents_breast

In [None]:
principal_breast_Df = pd.DataFrame(data=principalComponents_breast,
                                   columns=['principal component 1', 'principal component 2'])
principal_breast_Df.tail()

In [None]:
print('Explained variation per principal component: {}'.format(pca_breast.explained_variance_ratio_))

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline

plt.figure()
plt.figure(figsize=(10,10))
plt.xticks(fontsize=12)
plt.yticks(fontsize=14)
plt.xlabel('Principal Component - 1',fontsize=20)
plt.ylabel('Principal Component - 2',fontsize=20)
plt.title("Principal Component Analysis of Breast Cancer Dataset",fontsize=20)
targets = ['Benign', 'Malignant']
colors = ['r', 'g']
for target, color in zip(targets,colors):
    indicesToKeep = breast_dataset['label'] == target
    plt.scatter(principal_breast_Df.loc[indicesToKeep, 'principal component 1']
               , principal_breast_Df.loc[indicesToKeep, 'principal component 2'], c = color, s = 50)

plt.legend(targets,prop={'size': 15})

plt.show()

# K-Means Clustering

In [None]:
!pip install kneed

In [None]:
import matplotlib.pyplot as plt

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
features, true_labels = make_blobs(
    n_samples=200, centers=3, cluster_std=2.75, random_state=42
)

In [None]:
features[:5]

In [None]:
true_labels[:5]

In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [None]:
scaled_features[:5]

In [None]:
kmeans = KMeans(
    init="random",
    n_clusters=3,
    n_init=10,
    max_iter=300,
    random_state=42
)

In [None]:
kmeans.fit(scaled_features)

In [None]:
# The lowest SSE value
kmeans.inertia_

In [None]:
# Final locations of the centroid
kmeans.cluster_centers_

In [None]:
# The number of iterations required to converge
kmeans.n_iter_

In [None]:
kmeans.labels_[:5]

In [None]:
# Choosing the number of clusters (elbow method).
# KMeans settings shared by every model fitted in the sweeps over k.
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# sse[i] is the inertia (sum of squared errors) of the model with k = i + 1
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    sse.append(kmeans.inertia_)

In [None]:
plt.style.use("fivethirtyeight")
plt.plot(range(1, 11), sse)
plt.xticks(range(1, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Silhouette coefficient of the fitted model for each candidate k.
silhouette_coefficients = []

# The sweep starts at k = 2 rather than 1 for the silhouette coefficient.
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(scaled_features)
    silhouette_coefficients.append(
        silhouette_score(scaled_features, kmeans.labels_)
    )

In [None]:
plt.style.use("fivethirtyeight")
plt.plot(range(2, 11), silhouette_coefficients)
plt.xticks(range(2, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

# Density-Based Spatial Clustering of Applications with Noise (DBSCAN)

Density-Based Spatial Clustering of Applications with Noise (DBSCAN) adalah algoritma pengelompokan yang didasarkan pada kepadatan (density) data.

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA 

In [None]:
bunga = datasets.load_iris()

In [None]:
print(dir(bunga))

In [None]:
print(bunga.feature_names)

In [None]:
print(bunga.target_names)

In [None]:
x_axis = bunga.data[:,0] 
y_axis = bunga.data[:,2]

In [None]:
# x_axis is column 0 and y_axis is column 2 of bunga.data, i.e. sepal length
# and petal length in the iris feature order, so the axes are labelled
# accordingly. Points are coloured by the true species.
plt.scatter(x_axis, y_axis, c=bunga.target)
plt.xlabel("Sepal length")
plt.ylabel("Petal length")
plt.title("Bunga Iris")
plt.show()

In [None]:
# build a DBSCAN model with scikit-learn's default parameters
dbscan = DBSCAN() 
# fit the model on the full iris feature matrix
dbscan.fit(bunga.data) 

# fit a 2-component PCA on the same data and project it to 2-D
pca = PCA(n_components=2).fit(bunga.data) 
pca_2d = pca.transform(bunga.data)

# bare expression: displays the projected coordinates as the cell output
pca_2d

In [None]:
# Visualisation of the DBSCAN result in the 2-D PCA projection.
# The points come from pca_2d and are grouped by dbscan.labels_, so the plot
# shows the clusters DBSCAN found rather than the true species stored in
# the DataFrame of the earlier iris PCA section.

fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')

ax.set_title('Plot of PC1 vs PC2', fontsize = 20)

cluster_labels = dbscan.labels_

for cluster in sorted(set(cluster_labels)):
    # boolean mask selecting the samples assigned to this cluster
    members = cluster_labels == cluster
    # scikit-learn marks noise samples with the label -1
    name = 'Noise' if cluster == -1 else 'Cluster {}'.format(cluster)
    ax.scatter(pca_2d[members, 0]
               , pca_2d[members, 1]
               , s = 50
               , label = name)

ax.legend()
ax.grid()