In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Apply matplotlib's "fivethirtyeight" style sheet to the figures drawn below.
plt.style.use("fivethirtyeight")
import warnings
# NOTE(review): this filter hides *every* warning for the rest of the session,
# including deprecation notices from the libraries used below — consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Information about Wine:

- **Malic_Acid =** Malic acid is one of the main acids contributing to the acidity of grapes. Its concentration decreases as the grape ripens. With too little malic acid a wine tastes ‘flat’; with too much it tastes ‘sour’. It is therefore vital that the levels of malic acid are monitored during the fermentation process.

- **Ash =** Ash determination is an obligatory analysis for certified wines to be placed on the market. It is defined as all the inorganic matter remaining after igniting the residue left from the evaporation of must or wine

- **Ash_Alcanity =** The alkalinity of the ash is defined as the sum of cations, other than the ammonium ion, combined with the organic acids in the wine

- **Magnesium =** Magnesium cations are present in wine as a result of soil nutrient uptake in the vineyard.

- **Total_Phenols =** The phenolic content in wine refers to the phenolic compounds—natural phenol and polyphenols—in wine, which include a large group of several hundred chemical compounds that affect the taste, color and mouthfeel of wine. 

- **Flavonoids =** Flavonoids are the main polyphenols present in red wine by weight, and constitute about 80 to 90% of total polyphenols

- **Nonflavanoid_Phenols =** Non-flavonoid phenolics include several subclasses of importance to wine, in particular the hydroxycinnamates, stilbenes, and benzoic acids.

- **Proanthocyanins =** Proanthocyanidins are the principal polyphenols in red wine that are under research to assess risk of coronary heart disease and lower overall mortality. With tannins, they also influence the aroma, flavor, mouth-feel and astringency of red wines.

- **Color_Intensity =** A simple measure of how dark the wine is using a summation of absorbance measurements in the violet, green and red areas of the visible spectrum

- **Proline =** Proline is typically the most abundant amino acid present in grape juice and wine.

In [None]:
# Read the wine clustering CSV into `df` and preview the first rows.
DATA_PATH = "../input/wine-dataset-for-clustering/wine-clustering.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print pandas' summary report for the frame.
df.info()

In [None]:
# Descriptive statistics, transposed so that each original column becomes a row.
df.describe().T

# EDA

In [None]:
# Histogram of every column, drawn together on one 15x15-inch figure.
df.hist(figsize=(15,15))
plt.show()

In [None]:
# One box plot per column, laid out on a 4x4 grid of subplots.
df.plot.box(subplots=True, layout=(4, 4), figsize=(15, 15))
plt.show()

In [None]:
# Correlation heatmap showing only the diagonal and the lower triangle.
# Compute the correlation matrix once and reuse it for both the mask shape
# and the plotted values.
corr = df.corr()

# Boolean mask that is True strictly above the diagonal (k=1), so those
# redundant cells are hidden. Building it from ones instead of from the
# correlation values guarantees a cell is masked even if its correlation
# happens to be exactly 0.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)

plt.figure(figsize=(15,10))
sns.heatmap(corr, annot=True, mask=mask, cmap="crest")
plt.show()

# Preprocessing

In [None]:
# Number of missing (null) values in each column
df.isnull().sum()

In [None]:
# Number of rows flagged as duplicates by DataFrame.duplicated()
df.duplicated().sum()

In [None]:
#Outlier Detection
# Remove every row that has at least one feature outside the 1.5*IQR fences.
print("Before outlier Detection:",df.shape)

# Compute the fences for all columns up front, on the same unfiltered frame.
# Dropping rows column-by-column would make each later column's quartiles
# depend on what earlier columns already removed, i.e. on column order.
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1

min_val = q1 - 1.5 * iqr
max_val = q3 + 1.5 * iqr

# Each fence Series is indexed by column name; axis="columns" aligns it with
# the frame's columns so every value is compared against its own column's fence.
is_outlier_row = (
    df.lt(min_val, axis="columns") | df.gt(max_val, axis="columns")
).any(axis=1)

# Rebind instead of dropping in place so the frame is filtered exactly once.
df = df.loc[~is_outlier_row]

print("After outlier Detection:", df.shape)

- If the skewness is between -0.5 and 0.5, the data are fairly symmetrical
- If the skewness is between -1 and -0.5 or between 0.5 and 1, the data are moderately skewed
- If the skewness is less than -1 or greater than 1, the data are highly skewed

We don't need to apply any skewness correction.

In [None]:
# Skewness of each column, sorted from the largest value to the smallest
df.skew().sort_values(ascending=False)

In [None]:
# Scaling: standardise every column with StandardScaler.
scaler = StandardScaler()
columns = df.columns
data = scaler.fit_transform(df)

# Wrap the scaled array back into a DataFrame carrying the original column labels.
df = pd.DataFrame(data, columns=columns)
df.head()

# Modelling

In [None]:
# PCA: fit a 2-component PCA on the scaled array `data` and keep both the
# fitted model (`pca`) and the projected 2-D coordinates (`pca_2`).
pca = PCA(n_components=2)

pca_2 = pca.fit_transform(data)

In [None]:
# 13 dimensions reduced to 2: scatter of the two principal-component scores.
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x=pca_2[:, 0], y=pca_2[:, 1], ax=ax)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
plt.show()

## K-Means

**Inertia** measures how well a dataset was clustered by K-Means. It is calculated by measuring the distance between each data point and its centroid, squaring this distance, and summing these squares across all data points.

A good model is one with low inertia AND a low number of clusters (K). However, this is a tradeoff because as K increases, inertia decreases.

To find the optimal K for a dataset, use the Elbow method; find the point where the decrease in inertia begins to slow. K=3 is the “elbow” of this graph.

**Silhouette score** is a metric used to evaluate the quality of a clustering. Its value ranges from -1 to 1.

1: Means clusters are well apart from each other and clearly distinguished.

0: Means clusters are indifferent, or we can say that the distance between clusters is not significant.

-1: Means clusters are assigned in the wrong way.

In [None]:
# K-Means inertia and silhouette score for each candidate number of clusters.
# `inertia` is a list ordered like range(2, 10); `silhouette` maps k -> score.
inertia = list()
silhouette ={}

for i in range(2,10):
    # Fix the seed so the curves are identical on every run, and state n_init
    # explicitly because its default differs between scikit-learn versions.
    k_means = KMeans(n_clusters=i, n_init=10, random_state=42)
    k_means.fit(data)

    inertia.append(k_means.inertia_)

    # Score on `data`, the same array the model was fitted on.
    silhouette[i] = silhouette_score(data, labels=k_means.labels_ , metric="euclidean")

In [None]:
# Side-by-side diagnostics: inertia curve on the left, silhouette bars on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
inertia_ax, silhouette_ax = ax

sns.lineplot(x=range(2, 10), y=inertia, marker="o", ax=inertia_ax)
inertia_ax.set_title("Inertia by Number of Cluster")
inertia_ax.set_xlabel("Number of Cluster")
inertia_ax.set_ylabel("Inertia")

cluster_counts = list(silhouette.keys())
silhouette_scores = list(silhouette.values())
sns.barplot(x=cluster_counts, y=silhouette_scores, ax=silhouette_ax)
silhouette_ax.set_title("Silhouette Score by Number of Cluster")
silhouette_ax.set_xlabel("Number of Cluster")
silhouette_ax.set_ylabel("Silhouette Score")

plt.show()

**For K=3, all the metrics indicate that it is the best number of clusters.**

In [None]:
# Final model: K-Means with 3 clusters on the scaled array `data`.
# A fixed seed and an explicit n_init make the labels and centres reproducible
# across runs and across scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(data)

# Cluster index assigned to each row of `data`, and the fitted cluster centres
# (expressed in the same feature space as `data`).
kmeans_labels = kmeans.predict(data)
centers = kmeans.cluster_centers_

In [None]:
# Project the cluster centres with the PCA that was already fitted on `data`,
# so they land in the same 2-D coordinate system as `pca_2`. Fitting a fresh
# PCA on the three centres alone would give axes unrelated to the data's
# projection and put the centres in the wrong place on the plot.
# Reading from kmeans.cluster_centers_ (not `centers`) keeps this cell safe to re-run.
centers = pca.transform(kmeans.cluster_centers_)

centers

In [None]:
# Scatter of the 2-D PCA scores coloured by K-Means label, with the `centers`
# array overlaid as large red markers. Axis labels match the earlier
# projection plot so the figure can be read on its own.
plt.figure(figsize=(12,8))
plt.scatter(pca_2[:,0],pca_2[:,1], c=kmeans_labels, cmap="Paired")
plt.scatter(centers[:,0],centers[:, 1], c="red", s=200, label="Cluster centers")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("K-Means Clusters in PCA Space")
plt.legend()
plt.show()