In [None]:
import pandas as pd
# Read the 2017 matrix from disk with pandas' default CSV settings.
# NOTE(review): presumably a square week-by-week Euclidean similarity matrix
# whose column headers are week-of-year numbers — confirm against the file.
fp = "../data/olist_prepared/SP_ED_2017.csv"
df = pd.read_csv(filepath_or_buffer=fp)

The distance metric analyzed here is the Euclidean distance. The Euclidean distance between two weeks is small when their vector representations lie close to each other, which requires the vectors to have both a similar direction and a similar length. Cosine similarity, in contrast, emphasizes direction only: as long as two vectors point the same way (their components are in the same proportions), their similarity is high, regardless of their lengths. Euclidean similarity, defined here as 1 - Euclidean distance, is high when the Euclidean distance between the two points is small.

In [None]:
# Column headers of the matrix, kept as a plain list; later cells use them as
# the "woy" (week-of-year) labels for the clustered rows.
week_cols = df.columns.to_list()

In [None]:
# Flatten every entry of the matrix (row by row) into a single "ED" column so
# the values can be plotted as one distribution.
# ravel() works for any frame shape; the earlier
# reshape(df.shape[0] * df.shape[0], 1) used the row count twice and therefore
# raised unless the frame happened to be square.
df_unrolled = pd.DataFrame({"ED": df.to_numpy().ravel()})

In [None]:
import plotly.express as px

# Histogram of every entry of the flattened matrix.
# The frame itself is passed so that x="ED" is resolved as a column name,
# rather than relying on plotly coercing a Series into a one-column frame.
fig = px.histogram(
    df_unrolled,
    x="ED",
    title="Distribution of Euclidean Similarity for 2017",
)
fig.show()

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
# Split the weeks into two groups with spectral clustering.
# affinity="precomputed" makes scikit-learn treat the input to fit() as the
# affinity matrix itself instead of building one from feature vectors.
# NOTE(review): this assumes df is a square, symmetric, non-negative similarity
# matrix — confirm, since "1 - distance" can go negative for distances above 1.
clustering = SpectralClustering(
    n_clusters=2,
    affinity="precomputed",
    assign_labels="cluster_qr",
    random_state=0,  # fixed seed so reruns give the same labels
)
clustering.fit(df)  # fit() returns the estimator itself; labels_ is set on it

In [None]:
# Pair each matrix column label ("woy") with the cluster label assigned to the
# row at the same position.
cluster_info = dict(woy=week_cols, cluster=clustering.labels_)

In [None]:
# One row per week: "woy" as an integer merge key, "cluster" as a string so
# plotly treats it as a discrete category rather than a continuous number.
df_cluster_info = pd.DataFrame(cluster_info).astype({"cluster": str, "woy": int})

In [None]:
# Read the weekly revenue table with pandas' default CSV settings; later cells
# use its "year", "woy" and "weekly_revenue" columns.
fp = "../data/olist_prepared/SP_weekly_revenue.csv"
df_weekly_rev = pd.read_csv(filepath_or_buffer=fp)

In [None]:
# Keep only the 2017 rows and cast the week-of-year key to int so it has the
# same dtype as df_cluster_info["woy"] when the two frames are merged.
filter_2017 = df_weekly_rev["year"] == 2017

# .assign() returns a new frame. Writing `.loc[:, "woy"] = ...` on the filtered
# slice raised SettingWithCopyWarning, and because that form writes into the
# existing column in place, the int cast was not guaranteed to change its dtype.
df_weekly_rev_2017 = df_weekly_rev.loc[filter_2017].assign(
    woy=lambda rows: rows["woy"].astype(int)
)

In [None]:
# Attach each week's cluster label to its 2017 revenue row. This is an inner
# join on "woy", so weeks present in only one of the two frames are dropped.
df_result = df_cluster_info.merge(df_weekly_rev_2017, how="inner", on="woy")

In [None]:
# Compare the distribution of weekly revenue between the clusters.
# box=True overlays a box plot; points="all" also draws every individual week.
# A title is set so the figure can be read on its own.
fig = px.violin(
    df_result,
    x="cluster",
    y="weekly_revenue",
    box=True,
    points="all",
    title="Weekly Revenue by Cluster for 2017",
)
fig.show()

In [None]:
# Weekly revenue across the year: one marker per week, coloured and labelled
# with its cluster. A title is set so the figure can be read on its own.
fig = px.scatter(
    df_result,
    x="woy",
    y="weekly_revenue",
    text="cluster",
    color="cluster",
    title="Weekly Revenue by Week of Year and Cluster for 2017",
)

# Draw each cluster label above its marker so the text does not cover the point
fig.update_traces(textposition="top center")

fig.show()