In [None]:
import pandas as pd

# Week-by-week table for SP, 2017. Later cells treat its entries as cosine
# similarities between weeks (presumably precomputed upstream — confirm).
fp = "../data/olist_prepared/SP_CS_2017.csv"
df = pd.read_csv(fp)

# The column headers double as the week labels; keep them as a plain list.
week_cols = list(df.columns)

In [None]:
# Label the rows with the same week labels as the columns, so an entry can be
# addressed as df.loc[row_week, col_week]. This needs as many rows as columns.
df.index = week_cols


In [None]:
# Spot check: show the entry stored at row label "2", column label "3".
df.loc["2","3"]

In [None]:
# Collect the entries just above the main diagonal: position (k, k + 1) pairs
# row k with the column that follows it, i.e. each week with the next week.
# The last row has no successor, so there is one value fewer than there are
# rows; a table with fewer than two rows simply yields an empty list.
succ_vals = [df.iloc[pos, pos + 1] for pos in range(len(df.index) - 1)]
    

In [None]:
# One row per consecutive-week pair. "week-num" is a 1-based running number,
# so row k describes week k compared with week k + 1.
df_succ = pd.DataFrame(
    {
        "succ_cs": succ_vals,
        "week-num": range(1, len(succ_vals) + 1),
    }
)

# Persist the series for reuse outside this notebook (row index not written).
fp = "../data/olist_prepared/SP_2017_cs_succ_weeks.csv"
df_succ.to_csv(fp, index=False)

In [None]:
# Quartile labels, lowest similarity bucket first.
custom_labels = ['Q1', 'Q2', 'Q3', 'Q4']
num_quantiles = 4

# Split succ_cs into equal-frequency buckets and tag every row with its bucket.
df_succ["bins"] = pd.qcut(
    df_succ["succ_cs"],
    q=num_quantiles,
    labels=custom_labels,
)


In [None]:
# Round the similarities to two decimals. This cell sits after the CSV export
# and the quantile binning, so in top-to-bottom order those steps use the
# unrounded values; only the cells below see the rounded ones.
df_succ["succ_cs"] = df_succ["succ_cs"].round(2)

In [None]:
import plotly.express as px

# Scatter of each week's similarity with the following week, coloured by the
# quantile bucket the value falls into.
fig = px.scatter(
    df_succ,
    x="week-num",
    y="succ_cs",
    color="bins",
    labels={
        "week-num": "Week of 2017",
        "succ_cs": "Cosine Similarity with Next Week",
    },
    title="Cosine Similarity of Consecutive Weeks of 2017",
)

# Sets the text position on the traces (this is a trace update, not a layout
# update). NOTE(review): no `text` column is passed to px.scatter above, so
# this presumably has no visible effect — confirm.
fig.update_traces(textposition="top center")

fig.show()

In [None]:
def quarter_label(x):
    """Return the quarter label for week number ``x``.

    Weeks are bucketed in fixed blocks of 13: values up to 13 map to
    "Qtr-1", up to 26 to "Qtr-2", up to 39 to "Qtr-3", and anything else
    (including values for which every comparison is false) to "Qtr-4".
    """
    # Upper bounds are checked in ascending order, so the first match wins.
    cutoffs = ((13, "Qtr-1"), (26, "Qtr-2"), (39, "Qtr-3"))
    for upper_bound, label in cutoffs:
        if x <= upper_bound:
            return label
    return "Qtr-4"
        
# Tag every row with the quarter its week number falls into.
df_succ["qtr"] = df_succ["week-num"].map(quarter_label)


In [None]:
# Rolling mean of succ_cs over a window of 4 rows (four consecutive weeks).
# NOTE(review): the first three rows presumably come out as NaN because the
# window is not yet full — confirm that this is acceptable for the plot below.
df_succ["SMA_4"] = df_succ['succ_cs'].rolling(window=4).mean()

In [None]:
import plotly.express as px

# Scatter of the 4-week moving average per week, coloured by the quantile
# bucket of the underlying (non-averaged) similarity.
fig = px.scatter(
    df_succ,
    x="week-num",
    y="SMA_4",
    color="bins",
    labels={
        "week-num": "Week of 2017",
        "succ_cs": "Cosine Similarity with Next Week",
        "qtr": "Quarter of 2017",
        "SMA_4": "Monthly moving avg of consecutive weeks",
    },
    title="Moving Average Cosine Similarity of Consecutive Weeks of 2017",
)

# Sets the text position on the traces (a trace update, not a layout update).
# NOTE(review): no `text` column is passed to px.scatter above, so this
# presumably has no visible effect — confirm.
fig.update_traces(textposition="top center")

fig.show()

Context:
This dataset contains weekly purchases of frequently purchased inventory items. Each row of the dataset holds the revenue from the sale of a particular inventory item in SP, so each column represents the sale amount for one week. If two weeks sell the same items, then these weeks have high cosine similarity. Note that this does not mean the weeks must have the same revenue for these items; it simply means that weeks with high cosine similarity are weeks in which similar inventory items were sold. This is useful for many reasons, for example:
1. Demand planners know when a group of inventory items is in demand.
2. Price setters can set prices appropriately at that time.

Weeks with high cosine similarity are events that signal an affinity for groups of inventory items (columns) for a particular group of weeks (rows). The presence of weeks with high cosine similarity indicates that we have such affinities in our dataset. We will exploit this point later.

In [None]:
import plotly.express as px

# Heatmap of the full week-by-week table on a fixed 600 x 600 canvas.
fig = px.imshow(
    df,
    height=600,
    width=600,
)
fig.show()

A review of the heatmap shows many square regions with a cosine similarity of roughly 0.4 as you move your eyes along the diagonal of the heatmap. These are groups of weeks that have high cosine similarity with one another. I see three clusters for sure, and a case can be made for a fourth cluster.

In [None]:
from sklearn.cluster import SpectralClustering

# The week-by-week table is handed over as a ready-made affinity matrix
# (affinity="precomputed"), asking for four clusters with a fixed seed.
clustering = SpectralClustering(
    n_clusters=4,
    affinity="precomputed",
    assign_labels="cluster_qr",
    random_state=0,
)
clustering.fit(df)

In [None]:
# Show the estimator's n_neighbors setting. It was not passed to the
# constructor, so this is whatever default the library uses.
# NOTE(review): with affinity="precomputed" this parameter is presumably not
# used — confirm, and consider dropping this cell if it is leftover exploration.
clustering.n_neighbors

In [None]:
# Pair every week label with the cluster label assigned to it by the fit.
cluster_info = dict(woy=week_cols, cluster=clustering.labels_)

In [None]:
# Tabulate the week -> cluster assignment. "cluster" is stored as text and
# "woy" as integer (presumably so that plots treat the cluster as a category
# and the week can be used as a numeric key — confirm against later cells).
df_cluster_info = pd.DataFrame.from_dict(cluster_info, orient="columns").astype(
    {"cluster": "str", "woy": int}
)

# Persist the assignment for reuse outside this notebook (row index not written).
fp_c = "../data/olist_prepared/SP_2017_cs_cluster_info.csv"
df_cluster_info.to_csv(fp_c, index=False)

In [None]:
# Load the SP weekly revenue table. Later cells read its "year", "woy" and
# "weekly_revenue" columns.
fp = "../data/olist_prepared/SP_weekly_revenue.csv"
df_weekly_rev = pd.read_csv(fp)

In [None]:
# Keep only the 2017 rows. The explicit .copy() makes the result an
# independent frame, so the column assignment below is not a write into a
# slice of df_weekly_rev (which raises SettingWithCopyWarning).
filter_2017 = df_weekly_rev["year"] == 2017
df_weekly_rev_2017 = df_weekly_rev[filter_2017].copy()

# Replace the column outright rather than assigning through .loc[:, "woy"]:
# .loc writes into the existing column and may leave its dtype unchanged,
# whereas plain column assignment guarantees "woy" really becomes integer.
df_weekly_rev_2017["woy"] = df_weekly_rev_2017["woy"].astype(int)

In [None]:
# Attach each 2017 week's revenue row to its cluster label, matching on the
# week-of-year key; weeks present in only one of the two tables are dropped.
df_result = df_cluster_info.merge(df_weekly_rev_2017, on="woy")

In [None]:
# Weekly revenue per cluster as violins, with an inner box plot and every
# individual week drawn as a point.
fig = px.violin(
    df_result,
    x="cluster",
    y="weekly_revenue",
    box=True,
    points="all",
)
fig.show()

In [None]:
# Weekly revenue across the year; each point is coloured by its cluster and
# also carries the cluster label as text.
fig = px.scatter(
    df_result,
    x="woy",
    y="weekly_revenue",
    color="cluster",
    text="cluster",
)

# Position each text label above its marker.
fig.update_traces(textposition="top center")

fig.show()