In [26]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans


In [27]:
file_path = "Resources/wineQualityWhites.csv"
whiteWine_df = pd.read_csv(file_path)
whiteWine_df.head()

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,2,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,3,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,5,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [28]:
# Columns
whiteWine_df.columns

Index(['Unnamed: 0', 'fixed.acidity', 'volatile.acidity', 'citric.acid',
       'residual.sugar', 'chlorides', 'free.sulfur.dioxide',
       'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [29]:
# List DataFrame data types
whiteWine_df.dtypes

Unnamed: 0                int64
fixed.acidity           float64
volatile.acidity        float64
citric.acid             float64
residual.sugar          float64
chlorides               float64
free.sulfur.dioxide     float64
total.sulfur.dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [30]:
# Find null values
for column in whiteWine_df.columns:
    print(f"Column {column} has {whiteWine_df[column].isnull().sum()} null values")

Column Unnamed: 0 has 0 null values
Column fixed.acidity has 0 null values
Column volatile.acidity has 0 null values
Column citric.acid has 0 null values
Column residual.sugar has 0 null values
Column chlorides has 0 null values
Column free.sulfur.dioxide has 0 null values
Column total.sulfur.dioxide has 0 null values
Column density has 0 null values
Column pH has 0 null values
Column sulphates has 0 null values
Column alcohol has 0 null values
Column quality has 0 null values


In [31]:
# Find Duplicate entries
print(f"Duplicate entries: {whiteWine_df.duplicated().sum()}")

Duplicate entries: 0


# Clustering
# Clustering is a type of unsupervised learning that groups data points together

In [32]:
#The K-means algorithm groups the data into K clusters, where belonging to a cluster is based on some similarity or distance measure to a centroid.
# Initializing model with K = 5 (since we already know there are 11 elements)
model = KMeans(n_clusters=3, random_state=5)
model


KMeans(n_clusters=3, random_state=5)

In [33]:
# Fitting model
model.fit(whiteWine_df)

KMeans(n_clusters=3, random_state=5)

In [34]:
# Get the predictions
predictions = model.predict(whiteWine_df)
print(predictions)

[1 1 1 ... 2 2 2]


In [35]:
# Add a new class column to the whiteWine_df
whiteWine_df["class"] = model.labels_
whiteWine_df.head(10)

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,class
0,1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,1
1,2,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,1
2,3,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,1
3,4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1
4,5,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1
5,6,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,1
6,7,6.2,0.32,0.16,7.0,0.045,30.0,136.0,0.9949,3.18,0.47,9.6,6,1
7,8,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,1
8,9,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,1
9,10,8.1,0.22,0.43,1.5,0.044,28.0,129.0,0.9938,3.22,0.45,11.0,6,1


In [36]:
# Plotting the clusters with two features
# *dont know which elements to use yet*
whiteWine_df.hvplot.scatter(x="sulphates" , y="pH" , by="quality")

In [None]:
# Pandas to_csv() method to export the DataFrame to a CSV 
output_file_path = "<path to folder>/nameOfFile.csv"
name_of_df.to_csv(output_file_path, index=False)