In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Load the modified wine dataset from the hosted CSV.
wine = pd.read_csv('https://s3.us-east-1.amazonaws.com/General_V88/boomyeah2015/codingdojo/curriculum/content/chapter/1613254703__modified_wine.csv')
# Keep only the two features used for clustering. Take an explicit copy so that
# adding columns to `df` later modifies an independent frame instead of a slice
# of `wine` (writing into a slice triggers pandas' SettingWithCopyWarning).
df = wine[['malic_acid', 'flavanoids']].copy()
df.head()

In [None]:
# Instantiate Standard Scaler
scaler = StandardScaler()
# Fit & transform only the two feature columns. Selecting them explicitly keeps
# this cell idempotent: a later cell adds a 'cluster' column to df, and
# re-running this cell must not scale those labels as if they were a feature.
scaled_df = scaler.fit_transform(df[['malic_acid', 'flavanoids']])

### KMeans  
One of the drawbacks of the KMeans algorithm is that we must choose the number of clusters ourselves, and there is no single clear way
to determine the right number. Since our data is only 2-dimensional, we can use the graph below to visually check how many clusters we think we should have.

### Note: KMeans can fit on any number of columns!  
In this example we are only fitting on two columns for the purpose of plotting; realistically, however, most of the data we work with will be
multi-dimensional. If possible, we can use domain knowledge or work with a subject matter expert to determine which features we
should include and how many clusters are reasonable. We can also use scores such as the Silhouette Score to determine how
well-defined our clusters are -- more on this later.

In [None]:
# Scatter the two selected features against each other to eyeball the groupings
fig, ax = plt.subplots()
ax.scatter(df['malic_acid'], df['flavanoids'])
ax.set_xlabel('Malic Acid')
ax.set_ylabel('Flavanoids');

In [None]:
# Instantiate KMeans with two clusters.
# random_state pins the random centroid initialization so the labels are the
# same on every Restart & Run All; n_init is set explicitly because its default
# has changed across scikit-learn versions.
kmeans = KMeans(n_clusters = 2, n_init = 10, random_state = 42)
kmeans.fit(scaled_df)
# Save the cluster labels to the dataframe. assign() builds a new frame, so this
# never writes into a slice of `wine` (no SettingWithCopyWarning) and produces
# the same result if the cell is re-run.
df = df.assign(cluster = kmeans.labels_)
# Visualize the clusters on the original (unscaled) axes, colored by label
fig, ax = plt.subplots()
ax.scatter(df['malic_acid'], df['flavanoids'], c = df['cluster'])
ax.set_xlabel('Malic Acid')
ax.set_ylabel('Flavanoids')
ax.set_title('Clusters of Wine Varieties');

In [None]:
# Nice! We've just used the KMeans algorithm to cluster our data into two different groups of wine! One important thing to note is that you
# may occasionally get data points in unexpected clusters (like the two yellow points in the graph above that appear to fit better in the
# purple cluster). This can happen because of the random initialization of the centroids in the algorithm! Passing a fixed `random_state`
# to KMeans makes the result reproducible from run to run.