In [5]:
#%%
import pandas as pd
import numpy as np
import matplotlib as plt
from pyspark.ml.clustering import BisectingKMeans
from pyspark.sql import SparkSession
import itertools
from sklearn import mixture
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA



spark = SparkSession.builder.appName('Spark').getOrCreate()

# Loads data.
dataset = spark.read.format("com.databricks.spark.csv").option("header", "true").load("data.csv")

data = pd.DataFrame({
   c: np.random.randn(10000) for c in ['Fresh','Milk','Grocery' ,'Frozen','Detergents_Paper','Delicatessen','Region','Channel']
})

data.drop(['Region', 'Channel'], axis = 1, inplace = True)
#slicing the data. but first eplore the data for best slicing points
indices = [43, 12, 39]

# Create a DataFrame of the chosen samples
samples = pd.DataFrame(data.loc[indices], columns = data.columns).reset_index(drop = True)
print ("Chosen samples of wholesale customers dataset:")
display(samples)

log_data = data
outliers_lst  = []

# For each feature find the data points with extreme high or low values
for feature in log_data.columns:
    Q1 = np.percentile(log_data.loc[:, feature], 25)

    Q3 = np.percentile(log_data.loc[:, feature], 75)
    step = 1.5 * (Q3 - Q1)

    # So here, we're finding any points outside of Q1 - step and Q3 + step
    outliers_rows = log_data.loc[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step)), :]
    # display(outliers_rows)

    outliers_lst.append(list(outliers_rows.index))

outliers = list(itertools.chain.from_iterable(outliers_lst))

# List of unique outliers
# We use set()
# Sets are lists with no duplicate entries
uniq_outliers = list(set(outliers))

# List of duplicate outliers
dup_outliers = list(set([x for x in outliers if outliers.count(x) > 1]))


# Remove duplicate outliers
# Only 5 specified
good_data = log_data.drop(log_data.index[dup_outliers]).reset_index(drop = True)

# Original Data 

# Processed Data
print ('shape of data before dropping outliers:\n',data.shape)
print ('New shape of data:\n', good_data.shape)

pca = PCA(n_components=2)
pca.fit(good_data)

# TODO: Transform the good data using the PCA fit above
reduced_data = pca.transform(good_data)

# TODO: Transform the sample log-data using the PCA fit above
pca_samples = pca.transform(samples)

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2'])

# Display sample log-data after applying PCA transformation in two dimensions
display(pd.DataFrame(np.round(pca_samples, 4), columns = ['Dimension 1', 'Dimension 2']))

# Create range of clusters 
range_n_clusters = list(range(2,11))

for n_clusters in range_n_clusters:
    # TODO: Apply your clustering algorithm of choice to the reduced data 
    clusterer = KMeans(n_clusters=n_clusters).fit(reduced_data)

    # TODO: Predict the cluster for each data point
    preds = clusterer.predict(reduced_data)

    # TODO: Find the cluster centers
    centers = clusterer.cluster_centers_

    # TODO: Predict the cluster for each transformed sample data point
    sample_preds = clusterer.predict(pca_samples)

    # TODO: Calculate the mean silhouette coefficient for the number of clusters chosen
    score = silhouette_score(reduced_data, preds, metric='euclidean')
    print ("For n_clusters = {}. The average silhouette_score is : {}".format(n_clusters, score))

log_centers = pca.inverse_transform(centers)

# TODO: Exponentiate the centers
true_centers = np.exp(log_centers)

# Display the true centers
segments = ['Segment {}'.format(i) for i in range(0,len(centers))]
true_centers = pd.DataFrame(np.round(true_centers), columns = data.columns)
true_centers.index = segments
display(true_centers)

display(true_centers - data.median())

# Clusters' deviation from mean
# As you can see, this is not a meaningful comparison for Segment 1 where everything is negative
display(true_centers - data.mean())

for i, pred in enumerate(sample_preds):
    print ("Sample point", i, "predicted to be in Cluster", pred)
    



Chosen samples of wholesale customers dataset:


Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
0,1.253365,-1.466152,1.013531,1.101968,-0.860432,0.779702
1,-1.494481,1.156079,-1.740714,0.837891,-1.006927,-0.446752
2,1.194946,1.169885,0.141246,-0.207244,-1.007972,-1.207429


shape of data before dropping outliers:
 (10000, 6)
New shape of data:
 (9988, 6)


Unnamed: 0,Dimension 1,Dimension 2
0,1.2633,-1.7238
1,-0.6042,-0.676
2,-0.9374,-0.7638


For n_clusters = 2. The average silhouette_score is : 0.30545418803646207
For n_clusters = 3. The average silhouette_score is : 0.3291692808900718
For n_clusters = 4. The average silhouette_score is : 0.3057961439929006
For n_clusters = 5. The average silhouette_score is : 0.30675291225753354
For n_clusters = 6. The average silhouette_score is : 0.3262382189809696
For n_clusters = 7. The average silhouette_score is : 0.3217162533416576
For n_clusters = 8. The average silhouette_score is : 0.3088552521090658
For n_clusters = 9. The average silhouette_score is : 0.31220266982501016
For n_clusters = 10. The average silhouette_score is : 0.3147980365555416


Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
Segment 0,3.0,2.0,0.0,3.0,1.0,1.0
Segment 1,0.0,1.0,2.0,0.0,2.0,1.0
Segment 2,2.0,2.0,0.0,1.0,2.0,1.0
Segment 3,1.0,0.0,2.0,2.0,0.0,2.0
Segment 4,2.0,1.0,1.0,3.0,0.0,1.0
Segment 5,1.0,1.0,1.0,0.0,3.0,1.0
Segment 6,1.0,1.0,1.0,1.0,1.0,1.0
Segment 7,2.0,1.0,1.0,1.0,1.0,1.0
Segment 8,0.0,0.0,2.0,1.0,1.0,2.0
Segment 9,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
Segment 0,2.975094,1.994333,0.002317,2.989372,1.008482,0.979341
Segment 1,-0.024906,0.994333,2.002317,-0.010628,2.008482,0.979341
Segment 2,1.975094,1.994333,0.002317,0.989372,2.008482,0.979341
Segment 3,0.975094,-0.005667,2.002317,1.989372,0.008482,1.979341
Segment 4,1.975094,0.994333,1.002317,2.989372,0.008482,0.979341
Segment 5,0.975094,0.994333,1.002317,-0.010628,3.008482,0.979341
Segment 6,0.975094,0.994333,1.002317,0.989372,1.008482,0.979341
Segment 7,1.975094,0.994333,1.002317,0.989372,1.008482,0.979341
Segment 8,-0.024906,-0.005667,2.002317,0.989372,1.008482,1.979341
Segment 9,0.975094,0.994333,1.002317,0.989372,1.008482,0.979341


Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
Segment 0,2.967087,1.990984,0.002099,2.997624,1.016181,0.988469
Segment 1,-0.032913,0.990984,2.002099,-0.002376,2.016181,0.988469
Segment 2,1.967087,1.990984,0.002099,0.997624,2.016181,0.988469
Segment 3,0.967087,-0.009016,2.002099,1.997624,0.016181,1.988469
Segment 4,1.967087,0.990984,1.002099,2.997624,0.016181,0.988469
Segment 5,0.967087,0.990984,1.002099,-0.002376,3.016181,0.988469
Segment 6,0.967087,0.990984,1.002099,0.997624,1.016181,0.988469
Segment 7,1.967087,0.990984,1.002099,0.997624,1.016181,0.988469
Segment 8,-0.032913,-0.009016,2.002099,0.997624,1.016181,1.988469
Segment 9,0.967087,0.990984,1.002099,0.997624,1.016181,0.988469


Sample point 0 predicted to be in Cluster 3
Sample point 1 predicted to be in Cluster 7
Sample point 2 predicted to be in Cluster 7
