<a href="https://colab.research.google.com/github/proteus21/DATA-SCIENCE-STUDY/blob/main/Machine%20Learning/06_Clustering/06_clustering_comparision'.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### UCZENIE NIENADZOROWANE /  UNSUPERVISED LEARNING

# Clustering comparison

This notebook uses scikit-learn, a basic library for machine learning in Python.

To install the scikit-learn library, use the command below:
```
!pip install scikit-learn
```
To update to the latest version of the scikit-learn library, use the command below:
```
!pip install --upgrade scikit-learn
```

### Contents:
1. [Import libraries](#0)
2. [Data generation and visualisation](#1)
3. [Algorithm comparison - blobs data](#2)
4. [Algorithm comparison - circle data](#3)
5. [Algorithm comparison - moons data](#4)
6. [Algorithm comparison - random data](#5)
7. [Compare metrics (Euclidean, Manhattan, cosine)](#6)

### <a name='0'></a> Import libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt

# Seed NumPy's global RNG up front so that every later draw from it is
# repeatable on a fresh Restart & Run All.
np.random.seed(42)
# Print NumPy arrays with 6 digits of floating-point precision.
np.set_printoptions(precision=6)
# Apply seaborn's plotting defaults with fonts scaled to 1.3x.
sns.set(font_scale=1.3)


### <a name='1'></a> Data generation and visualization

In [2]:
from sklearn.datasets import make_blobs

# make_blobs returns (samples, labels); only the samples ([0]) are kept
# because the notebook treats the data as unlabelled.
blobs_data = make_blobs(
    n_samples=1000,
    cluster_std=0.7,
    center_box=(-4.0, 4.0),
    random_state=42,
)[0]
blobs = pd.DataFrame(blobs_data, columns=['x1', 'x2'])

# Quick visual check of the generated point cloud.
fig = px.scatter(
    blobs,
    x='x1',
    y='x2',
    title='blobs data',
    template='plotly_dark',
    width=900,
    height=500,
)
fig.update_traces(marker={'size': 5})
fig.show()

In [3]:
from sklearn.datasets import make_circles

# Two noisy concentric circles (inner radius = 0.5 * outer radius).
# random_state is pinned, matching the make_blobs call, so the dataset does not
# depend on how much of NumPy's global RNG stream earlier cells consumed and
# re-running this cell alone reproduces the same points.
# make_circles returns (samples, labels); only the samples ([0]) are kept.
circle_data = make_circles(
    n_samples=1000,
    factor=0.5,
    noise=0.050,
    random_state=42,
)[0]
circle = pd.DataFrame(circle_data, columns=['x1', 'x2'])

# Quick visual check of the generated point cloud.
fig = px.scatter(
    circle,
    x='x1',
    y='x2',
    title='circle data',
    template='plotly_dark',
    width=900,
    height=500,
)
fig.update_traces(marker_size=5)
fig.show()

In [4]:
from sklearn.datasets import make_moons

# Two noisy interleaving half circles.
# random_state is pinned, matching the make_blobs call, so the dataset does not
# depend on how much of NumPy's global RNG stream earlier cells consumed and
# re-running this cell alone reproduces the same points.
# make_moons returns (samples, labels); only the samples ([0]) are kept.
moons_data = make_moons(
    n_samples=1000,
    noise=0.05,
    random_state=42,
)[0]
moons = pd.DataFrame(moons_data, columns=['x1', 'x2'])

# Quick visual check of the generated point cloud.
fig = px.scatter(
    moons,
    x='x1',
    y='x2',
    title='moons data',
    template='plotly_dark',
    width=900,
    height=500,
)
fig.update_traces(marker_size=5)
fig.show()

In [5]:
# Structure-free baseline: 1000 points drawn uniformly from [0, 1) x [0, 1).
# A locally seeded generator is used instead of the global np.random stream so
# the points are the same no matter which cells ran before this one, and
# re-running this cell alone reproduces them.
random_data = np.random.default_rng(42).random((1000, 2))
# NOTE: the name `random` is kept because later cells reference it; be aware it
# would shadow the standard-library module of the same name if that were imported.
random = pd.DataFrame(random_data, columns=['x1', 'x2'])

# Quick visual check of the generated point cloud.
fig = px.scatter(
    random,
    x='x1',
    y='x2',
    title='random data',
    template='plotly_dark',
    width=900,
    height=500,
)
fig.update_traces(marker_size=5)
fig.show()

### <a name='2'></a> Algorithm comparison - blobs data - 3 clusters

In [6]:
from plotly.subplots import make_subplots
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans

# Side-by-side comparison of three clustering algorithms on the blobs data.
# KMeans gets an explicit random_state so that re-running this cell alone
# yields the same partition instead of depending on the global RNG state.
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42)
agglo = AgglomerativeClustering(n_clusters=3, metric='euclidean')
# DBSCAN takes no cluster count; points it considers noise get the label -1.
dbscan = DBSCAN(eps=0.5, min_samples=5)

# Panel order (left to right) follows this mapping; the keys are the panel titles,
# so each subplot says which algorithm produced it.
estimators = {
    'KMeans': kmeans,
    'Hierarchical clustering': agglo,
    'DBSCAN': dbscan,
}

fig = make_subplots(
    rows=1,
    cols=len(estimators),
    shared_yaxes=True,
    horizontal_spacing=0.01,
    subplot_titles=list(estimators),
)

for col, estimator in enumerate(estimators.values(), start=1):
    clusters = estimator.fit_predict(blobs_data)
    blobs['cluster'] = clusters
    # px.scatter serves only as a trace factory here: its single scatter trace
    # is lifted out and the temporary figure (and any layout it had) is dropped.
    trace = px.scatter(blobs, x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# The traces are coloured by the numeric cluster label; the figure-level
# coloraxis sets one viridis scale for them.
fig.update_layout(
    title='KMeans vs Hierarchical clustering vs DBSCAN - blobs data',
    template='plotly_dark',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()



### <a name='3'></a> Algorithm comparison - circle data - 3 clusters

In [13]:
from plotly.subplots import make_subplots
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans

# Side-by-side comparison of three clustering algorithms on the circle data.
# KMeans gets an explicit random_state so that re-running this cell alone
# yields the same partition instead of depending on the global RNG state.
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42)
agglo = AgglomerativeClustering(n_clusters=3, metric='euclidean')
# DBSCAN takes no cluster count; points it considers noise get the label -1.
dbscan = DBSCAN(eps=0.1, min_samples=5)

# Panel order (left to right) follows this mapping; the keys are the panel titles,
# so each subplot says which algorithm produced it.
estimators = {
    'KMeans': kmeans,
    'Hierarchical clustering': agglo,
    'DBSCAN': dbscan,
}

fig = make_subplots(
    rows=1,
    cols=len(estimators),
    shared_yaxes=True,
    horizontal_spacing=0.01,
    subplot_titles=list(estimators),
)

for col, estimator in enumerate(estimators.values(), start=1):
    clusters = estimator.fit_predict(circle_data)
    circle['cluster'] = clusters
    # px.scatter serves only as a trace factory here: its single scatter trace
    # is lifted out and the temporary figure (and any layout it had) is dropped.
    trace = px.scatter(circle, x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# The traces are coloured by the numeric cluster label; the figure-level
# coloraxis sets one viridis scale for them.
fig.update_layout(
    title='KMeans vs Hierarchical clustering vs DBSCAN - circle data',
    template='plotly_dark',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()


### <a name='4'></a> Algorithm comparison - moons data - 3 clusters

In [14]:
from plotly.subplots import make_subplots
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans

# Side-by-side comparison of three clustering algorithms on the moons data.
# KMeans gets an explicit random_state so that re-running this cell alone
# yields the same partition instead of depending on the global RNG state.
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42)
agglo = AgglomerativeClustering(n_clusters=3, metric='euclidean')
# DBSCAN takes no cluster count; points it considers noise get the label -1.
dbscan = DBSCAN(eps=0.1, min_samples=5)

# Panel order (left to right) follows this mapping; the keys are the panel titles,
# so each subplot says which algorithm produced it.
estimators = {
    'KMeans': kmeans,
    'Hierarchical clustering': agglo,
    'DBSCAN': dbscan,
}

fig = make_subplots(
    rows=1,
    cols=len(estimators),
    shared_yaxes=True,
    horizontal_spacing=0.01,
    subplot_titles=list(estimators),
)

for col, estimator in enumerate(estimators.values(), start=1):
    clusters = estimator.fit_predict(moons_data)
    moons['cluster'] = clusters
    # px.scatter serves only as a trace factory here: its single scatter trace
    # is lifted out and the temporary figure (and any layout it had) is dropped.
    trace = px.scatter(moons, x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# The traces are coloured by the numeric cluster label; the figure-level
# coloraxis sets one viridis scale for them.
fig.update_layout(
    title='KMeans vs Hierarchical clustering vs DBSCAN - moons data',
    template='plotly_dark',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()


### <a name='5'></a> Algorithm comparison - random data - 3 clusters

In [15]:
from plotly.subplots import make_subplots
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans

# Side-by-side comparison of three clustering algorithms on the random data.
# KMeans gets an explicit random_state so that re-running this cell alone
# yields the same partition instead of depending on the global RNG state.
kmeans = KMeans(n_clusters=3, n_init='auto', random_state=42)
agglo = AgglomerativeClustering(n_clusters=3, metric='euclidean')
# DBSCAN takes no cluster count; points it considers noise get the label -1.
dbscan = DBSCAN(eps=0.1, min_samples=5)

# Panel order (left to right) follows this mapping; the keys are the panel titles,
# so each subplot says which algorithm produced it.
estimators = {
    'KMeans': kmeans,
    'Hierarchical clustering': agglo,
    'DBSCAN': dbscan,
}

fig = make_subplots(
    rows=1,
    cols=len(estimators),
    shared_yaxes=True,
    horizontal_spacing=0.01,
    subplot_titles=list(estimators),
)

for col, estimator in enumerate(estimators.values(), start=1):
    clusters = estimator.fit_predict(random_data)
    random['cluster'] = clusters
    # px.scatter serves only as a trace factory here: its single scatter trace
    # is lifted out and the temporary figure (and any layout it had) is dropped.
    trace = px.scatter(random, x='x1', y='x2', color='cluster')['data'][0]
    fig.add_trace(trace, row=1, col=col)

# The traces are coloured by the numeric cluster label; the figure-level
# coloraxis sets one viridis scale for them.
fig.update_layout(
    title='KMeans vs Hierarchical clustering vs DBSCAN - random data',
    template='plotly_dark',
    coloraxis={'colorscale': 'viridis'},
)
fig.show()
