In [1]:
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, MeanShift, estimate_bandwidth
from sklearn.decomposition import PCA
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.vq import vqalias



ModuleNotFoundError: No module named '_tkinter'

In [0]:
# Load the pre-processed data set and draw a 1% working sample.
# The sample is seeded so that every downstream fit and score is reproducible
# across kernel restarts, like the seeded models further down.
data = pd.read_csv('./processed_data1.csv')
frac_data = data.sample(frac=0.01, random_state=1)


In [0]:
# Column-wise mean and standard deviation of the full data set; the same
# parameters are applied to the full data and to the sample.
MEAN = data.mean()
STD = data.std()

normalized_data = data.sub(MEAN).div(STD)
normalized_frac_data = frac_data.sub(MEAN).div(STD)

# Pairwise correlation of the normalised sample columns.
correlation_matrix = normalized_frac_data.corr()
print(correlation_matrix)

In [0]:
# Draw the correlation matrix on an explicit axes object (one line per column).
correlation_fig, correlation_ax = plt.subplots()
correlation_ax.plot(correlation_matrix)
plt.show()


In [0]:
# Fit an 8-component PCA on the normalised sample.
pca = PCA(random_state=1, n_components=8)
pca.fit(normalized_frac_data.values)

# Cumulative explained variance, expressed in percent so it matches the axis
# label, plotted against the number of retained components (starting at 1,
# not at the zero-based array index).
cumulative_variance_pct = np.cumsum(pca.explained_variance_ratio_) * 100
component_counts = np.arange(1, len(cumulative_variance_pct) + 1)

plt.figure()
plt.plot(component_counts, cumulative_variance_pct)
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)')  # cumulative share of the total variance
plt.title('eCommerce Dataset Explained Variance')
plt.show()

In [0]:
# Elbow analysis: record the k-means inertia for every cluster count
# in range(1, max_clusters).
max_clusters = 100
cluster_counts = range(1, max_clusters)
kmeans_settings = dict(init='random', n_init=10, max_iter=300, tol=1e-04, random_state=1)

distortions = []
for n_clusters in cluster_counts:
    km = KMeans(n_clusters=n_clusters, **kmeans_settings)
    km.fit(normalized_frac_data)
    distortions.append(km.inertia_)

# Inertia against the number of clusters.
plt.plot(cluster_counts, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [0]:
# K-means with 20 clusters, fitted on the normalised sample.
km_norm = KMeans(n_clusters=20, init='random', n_init=10, max_iter=300, tol=1e-04, random_state=0)
km_norm.fit(normalized_frac_data.values)


In [0]:
# DBSCAN on the normalised sample, followed by the size of every label group.
db_scan_model = DBSCAN(eps=0.5, min_samples=11)
db_scan_model.fit(normalized_frac_data.values)
db_norm = db_scan_model  # fit() hands back the estimator itself
dbscan_labels, dbscan_label_counts = np.unique(db_norm.labels_, return_counts=True)
print((dbscan_labels, dbscan_label_counts))

In [0]:
# Hierarchical (agglomerative) clustering into 20 clusters with Ward linkage.
# Ward linkage only works with Euclidean distances, which is also the default,
# so the metric is left implicit. The former `affinity` keyword was deprecated
# in scikit-learn 1.2 and removed in 1.4 (renamed to `metric`); omitting it
# keeps this cell working on both old and new releases.
hierachical = AgglomerativeClustering(n_clusters=20, linkage='ward')\
                    .fit_predict(normalized_frac_data.values)


In [0]:
# Estimate a MeanShift bandwidth from the normalised sample, using all cores.
estimated_bandwidth = estimate_bandwidth(normalized_frac_data, n_jobs=-1)
print(estimated_bandwidth)

In [0]:
# MeanShift with default parameters, fitted on the normalised sample.
ms = MeanShift()
ms.fit(normalized_frac_data.values)

# Distinct cluster labels that were found.
print(np.unique(ms.labels_))


In [0]:
# Davies-Bouldin score of every clustering, all evaluated on the same sample matrix.
# NOTE(review): DBSCAN marks noise points with the label -1; they are scored
# here as if they formed one more cluster - confirm that this is intended.
sample_matrix = normalized_frac_data.values

km_score_norm = davies_bouldin_score(sample_matrix, km_norm.labels_)
dbscan_score_norm = davies_bouldin_score(sample_matrix, db_norm.labels_)
hierachical_score = davies_bouldin_score(sample_matrix, hierachical)
ms_score_norm = davies_bouldin_score(sample_matrix, ms.labels_)

In [0]:
# Report the Davies-Bouldin scores, one line per algorithm.
davies_bouldin_report = (
    "Scores davies bouldin:",
    f'MeanShift    | {ms_score_norm}',
    f'kMeans       | {km_score_norm}',
    f'DBSCAN       | {dbscan_score_norm}',
    f'hierarchical | {hierachical_score}',
)
print(*davies_bouldin_report, sep="\n")


In [0]:
# Silhouette score of every clustering, all evaluated on the same sample matrix.
# NOTE(review): DBSCAN marks noise points with the label -1; they are scored
# here as if they formed one more cluster - confirm that this is intended.
silhouette_matrix = normalized_frac_data.values

sil_km_score = silhouette_score(silhouette_matrix, km_norm.labels_)
sil_dbscan_score = silhouette_score(silhouette_matrix, db_norm.labels_)
sil_hierachical_score = silhouette_score(silhouette_matrix, hierachical)
sil_ms_score = silhouette_score(silhouette_matrix, ms.labels_)

In [0]:
# Report the silhouette scores, one line per algorithm.
silhouette_report = (
    "Scores silhouette:",
    f'MeanShift    |  {sil_ms_score}',
    f'kMeans       |  {sil_km_score}',
    f'hierarchical | {sil_hierachical_score}',
    f'DBSCAN       | {sil_dbscan_score}',
)
print(*silhouette_report, sep="\n")

In [0]:
# Correlation of the k-means cluster label with every feature column.
#
# Work on an explicit copy: pd.DataFrame(existing_frame) does not copy by
# default, so adding the "cluster" column to it can (depending on the pandas
# version) also add it to normalized_frac_data and silently change the input
# of every clustering / scoring cell on re-run.
corr_df = normalized_frac_data.copy()
corr_df["cluster"] = km_norm.labels_

print(corr_df.corr())



In [0]:
# Show the feature/cluster correlation matrix as an image.
cluster_correlation = corr_df.corr()
plt.matshow(cluster_correlation)
plt.show()

In [0]:
# denormalisation of data

def denormalize_data(df:pd.DataFrame, mean, std):
    """Undo a z-score normalisation.

    Parameters
    ----------
    df : pd.DataFrame
        Normalised values.
    mean, std
        Offset and scale to re-apply; combined with ``df`` through ordinary
        pandas ``*`` and ``+`` arithmetic.

    Returns
    -------
    The result of ``df * std + mean``.
    """
    rescaled = df * std
    return rescaled + mean

In [0]:
# assigning all the data to the clusters

def assign_cluster(observations: pd.DataFrame, centroids: pd.DataFrame):
    """Label every observation with the index of its nearest centroid.

    Parameters
    ----------
    observations : pd.DataFrame
        One row per observation; passed to ``vq`` as ``observations.values``.
    centroids : pd.DataFrame
        One row per centroid, with the same number of columns as
        ``observations``.

    Returns
    -------
    pd.DataFrame
        A copy of ``observations`` with an additional ``"cluster"`` column
        holding the row position of the closest centroid. The input frame is
        not modified.
    """
    # Imported here because the notebook's import cell never binds the name
    # `vq`, so the original call failed with NameError.
    from scipy.cluster.vq import vq

    # vq also returns the distance to the chosen centroid; it is not needed.
    codes, _distances = vq(observations.values, centroids.values)
    result = observations.copy()
    result["cluster"] = codes
    return result

In [0]:
# Label every row of the full normalised data set with the nearest k-means centroid.
centroids = pd.DataFrame(km_norm.cluster_centers_)
observations = assign_cluster(normalized_data.copy(), centroids)


In [0]:
# Size of every cluster, and how many distinct clusters actually received rows.
assigned_labels = observations["cluster"].values
unique, counts = np.unique(assigned_labels, return_counts=True)
print(unique, counts)
print(unique.shape)

In [0]:
# Attach the cluster label to a copy of the original (non-normalised) data
# and persist the result as CSV without the index column.
data_with_cluster = data.assign(cluster=observations["cluster"])
data_with_cluster.to_csv("processed_data_with_cluster.csv", index=False)

In [0]:
def print_statistic(df: pd.DataFrame, cluster_value:int):
    """Print summary statistics for the rows of one cluster.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``"cluster"``, ``"event_type"``, ``"brand"``
        and ``"price"``.
    cluster_value : int
        Only rows whose ``"cluster"`` equals this value are summarised.

    Prints, in order: the number of rows per event type, the five most
    frequent brands with their counts, the overall min / average / max price,
    and the min / mean / max price per event type. Returns ``None``.
    """
    dfc = df[df["cluster"] == cluster_value]

    # Rows per event type (one table row, one column per event type).
    event_types, event_ocurences = np.unique(dfc["event_type"].values, return_counts=True)
    print("event_type occurences")
    print(pd.DataFrame([event_ocurences], columns=event_types))

    # Five most frequent brands: sort the counts in descending order.
    brand_types, brand_occurences = np.unique(dfc["brand"].values, return_counts=True)
    top_brands = np.argsort(-brand_occurences)[0:5]
    print("brand occurences")
    print(pd.DataFrame([brand_occurences[top_brands]], columns=brand_types[top_brands]))

    # Overall price range of the cluster.
    min_price = dfc["price"].min()
    average_price = dfc["price"].mean()
    max_price = dfc["price"].max()
    print(pd.DataFrame(data=[[min_price, average_price, max_price]], columns=["min", "average", "max"]))

    # The table holds min, mean and max, so the header says so (it used to
    # announce only the minimum price).
    print("price statistics (min / mean / max) by event type")
    print(dfc[["event_type", "price"]].groupby(["event_type"]).agg(['min', 'mean', 'max']))


In [0]:
# Summarise cluster 0, passing only the columns that print_statistic reads.
statistic_columns = ["event_type", "brand", "price", "cluster"]
sub_df = data_with_cluster[statistic_columns]
print_statistic(sub_df, 0)