-
Notifications
You must be signed in to change notification settings - Fork 0
/
KMeans Clustering on wine quality dataset
82 lines (64 loc) · 1.95 KB
/
KMeans Clustering on wine quality dataset
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  -- IPython/Jupyter "magic", not valid Python syntax;
# kept as a comment so this file also runs as a plain script. Re-enable it
# (uncommented) only inside a notebook cell.
import seaborn as sns
from matplotlib import style
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): silences ALL warnings globally
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
# Load the red-wine quality dataset; the file is semicolon-separated.
wine_df = pd.read_csv('winequality-red.csv', sep=';')
wine_df.head()      # peek at the first rows (displays only in a notebook)
wine_df.shape       # (rows, columns)
wine_df.info()      # dtypes and non-null counts
wine_df.isnull().sum()   # per-column missing-value count
wine_df.describe()       # summary statistics of numeric columns
# Visualize pairwise correlations between all columns.
plt.figure(figsize=(10,7))
sns.heatmap(wine_df.corr(), annot=True)
plt.title('Correlation between the columns')
plt.show()
# Drop the target label so clustering works on the features only.
wine_df = wine_df.drop('quality', axis=1)
wine_df.columns
# Elbow method: fit KMeans for k = 1..9 and record the within-cluster sum of
# squares (inertia). The "elbow" of the curve suggests a reasonable k.
# (Loop-body indentation restored -- it was lost in the original paste.)
wss = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(wine_df)
    wss.append(kmeans.inertia_)
plt.plot(range(1, 10), wss)
plt.title('The Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances')
plt.show()
# Automated elbow detection: yellowbrick fits a KMeans model for every k in
# [1, 10) and marks the detected elbow on the resulting plot.
from yellowbrick.cluster import KElbowVisualizer

estimator = KMeans()
elbow = KElbowVisualizer(estimator, timings=False, k=(1, 10))
elbow.fit(wine_df)
elbow.show()
# Silhouette analysis: for each candidate k, print the mean silhouette score
# (higher = better-separated clusters).
# (Loop-body indentation restored; random_state added so the printed scores
# are reproducible, consistent with the seeded KMeans used in the elbow loop.)
for i in range(2, 10):
    kmeans = KMeans(n_clusters=i, max_iter=100, random_state=42)
    kmeans.fit(wine_df)
    score = silhouette_score(wine_df, kmeans.labels_)
    print("For cluster: {}, the silhouette score is: {}".format(i, score))
# Same silhouette sweep as above, but collect the scores and plot them
# against the number of clusters.
# (Loop-body indentation restored; random_state added for reproducibility.)
silhouette_coefficients = []
for i in range(2, 10):
    kmeans = KMeans(n_clusters=i, max_iter=100, random_state=42)
    kmeans.fit(wine_df)
    silhouette_coefficients.append(silhouette_score(wine_df, kmeans.labels_))
plt.plot(range(2, 10), silhouette_coefficients)
plt.xticks(range(2, 10))
plt.xlabel("number of clusters")
plt.ylabel("Silhouette coefficient")
plt.show()
# Project the features onto principal components (all components are kept,
# since n_components is not given), cluster in PCA space with k=4, and plot
# the clusters on the first two components.
# (Loop-body indentation restored; random_state added for reproducibility.)
pca = PCA()
X = pca.fit_transform(wine_df)
kmeans = KMeans(n_clusters=4, random_state=42)
label = kmeans.fit_predict(X)
for cluster_id in np.unique(label):
    # Boolean mask selects the points assigned to this cluster.
    plt.scatter(X[label == cluster_id, 0], X[label == cluster_id, 1],
                label=cluster_id, s=20)
plt.legend()
plt.title('wine groups')
plt.show()