In [None]:
import pandas as pd
# Load SP_ED_2017.csv into `df`; later cells treat this table as a
# distance matrix.
fp = "../data/olist_prepared/SP_ED_2017.csv"
df = pd.read_csv(filepath_or_buffer=fp)

In [None]:
# Show the (rows, columns) dimensions of the table loaded into `df`.
df.shape

In [None]:
import numpy as np
def euc_sim(x):
  """
  Convert Euclidean distance(s) into a similarity score.

  For non-negative distances the result lies in (0, 1]: a distance of 0
  gives 1 and the score shrinks towards 0 as the distance grows. The
  arithmetic is vectorised, so ``x`` may be a scalar, a NumPy array or a
  pandas object.
  """
  return 1/(1+x)

# Apply the distance-to-similarity transform exactly once, straight to the
# distances held in `df`. Stacking it on top of a Gaussian kernel (which is
# already a similarity) flips the ordering: identical weeks would score 0.5
# and very distant weeks would approach 1.
df_trans = euc_sim(df)
fp_trans = "../data/olist_prepared/SP_ES_2017.csv"
df_trans.to_csv(fp_trans, index=False)

In [None]:
# Largest entry anywhere in the transformed matrix.
df_trans.to_numpy().max()

In [None]:
# Load the parquet file freq_prod_weekly_sale_SP_2017 into `dfpp`.
fpp = "../data/olist_prepared/freq_prod_weekly_sale_SP_2017.parquet"
dfpp = pd.read_parquet(path=fpp)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
# Similarity between every row of `dfpp` and the row just before it,
# computed as 1 / (Euclidean distance + 1). Row 0 has no predecessor, so
# the walk starts at row 1.
succ_vals = []
for curr in range(1, dfpp.shape[0]):
    prev_row = dfpp.iloc[curr - 1, :].values.reshape(1, -1)
    curr_row = dfpp.iloc[curr, :].values.reshape(1, -1)
    # euclidean_distances returns a 1x1 matrix for a single pair of rows.
    gap = euclidean_distances(prev_row, curr_row).flat[0]
    succ_vals.append(1 / (gap + 1))
        

In [None]:
# One row per successive pair; `pair` numbers the pairs starting at 1.
df_succ = pd.DataFrame({"succ_es": succ_vals}).assign(
    pair=lambda frame: frame.index + 1
)

In [None]:
# Kernel density estimate of the successive-pair similarity scores.
df_succ["succ_es"].plot(kind="kde")

In [None]:
import plotly.express as px
# Scatter of the similarity score against the pair number.
fig = px.scatter(data_frame=df_succ, x="pair", y="succ_es")

# Text-label placement; no `text` column is passed to this figure.
fig.update_traces(textposition="top center")

# Render the figure.
fig.show()

The distance metric analyzed here is the Euclidean distance. The distance between two weeks is small when their vector representations lie close to each other, that is, when the vectors are similar in both direction and length. Cosine similarity, in contrast, emphasizes direction: as long as two vectors point in the same direction (their components are in the same proportions), their similarity is high, regardless of their lengths. Euclidean similarity, computed here as 1 / (1 + Euclidean distance), is high when the Euclidean distance between the points is small.

In [None]:
# Column labels of the transformed matrix as a plain Python list.
week_cols = df_trans.columns.to_list()

In [None]:
# Show the (rows, columns) dimensions of `df` again for reference.
df.shape

In [None]:
# Flatten every entry of df_trans (row by row) into one long "est" column.
df_unrolled = pd.DataFrame({"est": df_trans.values.ravel()})

In [None]:
# Display the single-column frame holding the flattened df_trans values.
df_unrolled

In [None]:
from matplotlib import pyplot as plt
# Histogram of all flattened values, with grid lines switched on.
ax = df_unrolled["est"].plot.hist()
ax.grid(True)

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
# affinity="precomputed" makes SpectralClustering read its input as a
# similarity matrix (larger value = more alike). `df` holds distances
# (smaller value = more alike), so convert it with euc_sim before fitting
# instead of passing the raw distances.
affinity_matrix = euc_sim(df)
clustering = SpectralClustering(n_clusters=2, affinity="precomputed",
        assign_labels='cluster_qr',
        random_state=0).fit(affinity_matrix)

In [None]:
# Pair each column label from week_cols with the cluster label it received.
cluster_info = dict(woy=week_cols, cluster=clustering.labels_)

In [None]:
# Tabulate the assignments: cluster labels as strings, `woy` as integers,
# then save them to CSV.
df_cluster_info = pd.DataFrame.from_dict(cluster_info, orient="columns").astype(
    {"cluster": str, "woy": int}
)
fp = "../data/olist_prepared/SP_2017_es_cluster_info.csv"
df_cluster_info.to_csv(path_or_buf=fp, index=False)

In [None]:
# Load SP_weekly_revenue.csv into `df_weekly_rev`.
fp = "../data/olist_prepared/SP_weekly_revenue.csv"
df_weekly_rev = pd.read_csv(filepath_or_buffer=fp)

In [None]:
# Keep only the 2017 rows and cast the `woy` column to int.
# .assign builds a new frame, so nothing is written into a slice of
# df_weekly_rev (no SettingWithCopyWarning) and the integer dtype is
# guaranteed to take effect rather than being set in place under the
# column's old dtype.
filter_2017 = df_weekly_rev["year"] == 2017
df_weekly_rev_2017 = df_weekly_rev.loc[filter_2017].assign(
    woy=lambda weekly: weekly["woy"].astype(int)
)

In [None]:
# Attach the 2017 weekly revenue rows to the cluster assignments via `woy`.
df_result = df_cluster_info.merge(df_weekly_rev_2017, on="woy")

In [None]:
# Weekly revenue distribution per cluster: violin with an inner box plot
# and every observation drawn as a point.
fig = px.violin(
    data_frame=df_result,
    x="cluster",
    y="weekly_revenue",
    points="all",
    box=True,
)
fig.show()

In [None]:
# Weekly revenue by week of year; each point is coloured and labelled with
# its cluster.
fig = px.scatter(
    data_frame=df_result,
    x="woy",
    y="weekly_revenue",
    color="cluster",
    text="cluster",
)

# Place each cluster label above its marker.
fig.update_traces(textposition="top center")

fig.show()