In [None]:
import pandas as pd
# Load the prepared daily-orders extract into the working frame.
fp = "../data/olist_prepared/olist_daily_orders_prepared.csv"
df = pd.read_csv(filepath_or_buffer=fp)

In [None]:
# Keep only the order items flagged as frequently purchased products and
# renumber the rows from zero.
is_frequent_purchase = df["freq_purch_prod"]
df = df.loc[is_frequent_purchase].reset_index(drop=True)

In [None]:
# Set up the temporal attributes for computation.
#
# `woy` is an ISO-8601 week number, so `year` has to be the matching ISO
# week-year rather than the calendar year. Otherwise days at a year boundary
# are misfiled: 2017-01-01 belongs to ISO week 52 of 2016, and pairing that
# week number with calendar year 2017 would silently merge the day into the
# last week of December 2017 in every (year, woy) aggregation.
# `month` remains the calendar month of the purchase timestamp.
df["order_purchase_timestamp"] = pd.to_datetime(df["order_purchase_timestamp"])
iso_calendar = df["order_purchase_timestamp"].dt.isocalendar()  # columns: year, week, day
df["year"] = iso_calendar["year"]
df["month"] = df["order_purchase_timestamp"].dt.month
df["woy"] = iso_calendar["week"]

In [None]:
# Keep only the regions of interest: SP, RJ and MG.
geo_filter = df["customer_state"].isin(("SP", "RJ", "MG"))
df = df.loc[geo_filter]

In [None]:
# Boolean mask that is True for every row whose year is anything but 2016.
not_year_2016 = df["year"].ne(2016)

In [None]:
# Discard the 2016 rows by applying the precomputed mask.
df = df.loc[not_year_2016]

In [None]:
# Show the distinct year values currently present in the frame.
df.year.unique()

In [None]:
# Narrow the frame to the order/product identifiers, the price, the customer
# location fields and the derived time keys.
cols_needed = [
    "order_id",
    "product_id",
    "price",
    "customer_zip_code_prefix",
    "customer_city",
    "customer_state",
    "year",
    "month",
    "woy",
]
df = df.loc[:, cols_needed]

In [None]:
# One boolean mask per state ...
filter_SP = df["customer_state"].eq("SP")
filter_RJ = df["customer_state"].eq("RJ")
filter_MG = df["customer_state"].eq("MG")

# ... and one freshly re-indexed frame per state.
df_SP = df.loc[filter_SP].reset_index(drop=True)
df_RJ = df.loc[filter_RJ].reset_index(drop=True)
df_MG = df.loc[filter_MG].reset_index(drop=True)

In [None]:
# SP orders placed in 2017, pivoted into a week-of-year x product matrix whose
# cells hold the summed price (0 where a product had no sales that week).
# NOTE(review): df_SP_2017 and df_SP_weekly_FPS_2017 are built again in later
# cells of this notebook; one of the two copies looks redundant.
sp_is_2017 = df_SP["year"].eq(2017)
df_SP_2017 = df_SP.loc[sp_is_2017]

df_SP_weekly_FPS_2017 = df_SP_2017.pivot_table(
    index="woy",
    columns="product_id",
    values="price",
    aggfunc="sum",
    fill_value=0,
)

In [None]:
# Total SP revenue per (year, week-of-year), stored as a flat three-column
# table and written to CSV without the row index.
sp_revenue_by_week = df_SP.groupby(["year", "woy"])["price"].sum()
df_SP_weekly_rev = sp_revenue_by_week.rename("weekly_revenue").reset_index()
fp = "../data/olist_prepared/SP_weekly_revenue.csv"
df_SP_weekly_rev.to_csv(fp, index=False)

In [None]:
# Display the SP weekly revenue table (columns: year, woy, weekly_revenue).
df_SP_weekly_rev

In [None]:
# Subset of the SP orders whose year is 2017.
df_SP_2017 = df_SP.loc[df_SP["year"].eq(2017)]

In [None]:
# Week-of-year x product matrix for SP in 2017: each cell is the summed price
# of that product in that week, with 0 where there were no sales.
df_SP_weekly_FPS_2017 = df_SP_2017.pivot_table(
    index="woy",
    columns="product_id",
    values="price",
    aggfunc="sum",
    fill_value=0,
)

In [None]:
# Display the SP 2017 pivot: rows are weeks of the year, columns are product
# ids and cells are the summed price.
df_SP_weekly_FPS_2017

In [None]:
# Subset of the SP orders whose year is 2018.
df_SP_2018 = df_SP.loc[df_SP["year"].eq(2018)]

In [None]:
# Week-of-year x product matrix for SP in 2018: each cell is the summed price
# of that product in that week, with 0 where there were no sales.
df_SP_weekly_FPS_2018 = df_SP_2018.pivot_table(
    index="woy",
    columns="product_id",
    values="price",
    aggfunc="sum",
    fill_value=0,
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [None]:
# Similarity Computation for SP, RJ and MG

In [None]:
# Week-to-week similarity for SP: every row of a weekly product-sales pivot is
# one week's sales vector, and each matrix below compares all pairs of weeks
# (cosine similarity and Euclidean distance, per year).
#
# Both axes are labelled with the pivot's week-of-year index. Labelling only
# the columns leaves the rows on a 0-based positional index while the columns
# carry week numbers, so a label lookup such as `.loc[1, 1]` would pair the
# second week on the row axis with week 1 on the column axis.
sp_weeks_2017 = df_SP_weekly_FPS_2017.index
sp_weeks_2018 = df_SP_weekly_FPS_2018.index

df_SP_weekly_CS_2017 = pd.DataFrame(
    cosine_similarity(df_SP_weekly_FPS_2017), index=sp_weeks_2017, columns=sp_weeks_2017
)
df_SP_weekly_CS_2018 = pd.DataFrame(
    cosine_similarity(df_SP_weekly_FPS_2018), index=sp_weeks_2018, columns=sp_weeks_2018
)

df_SP_weekly_ED_2017 = pd.DataFrame(
    euclidean_distances(df_SP_weekly_FPS_2017), index=sp_weeks_2017, columns=sp_weeks_2017
)
df_SP_weekly_ED_2018 = pd.DataFrame(
    euclidean_distances(df_SP_weekly_FPS_2018), index=sp_weeks_2018, columns=sp_weeks_2018
)

In [None]:
# Persist the SP artefacts. The week-by-week distance / similarity matrices go
# to CSV without their row index; the weekly product-sales pivots go to
# parquet with the week index kept.
sp_csv_outputs = [
    (df_SP_weekly_ED_2017, "../data/olist_prepared/SP_ED_2017.csv"),
    (df_SP_weekly_ED_2018, "../data/olist_prepared/SP_ED_2018.csv"),
    (df_SP_weekly_CS_2017, "../data/olist_prepared/SP_CS_2017.csv"),
    (df_SP_weekly_CS_2018, "../data/olist_prepared/SP_CS_2018.csv"),
]
for sp_matrix, fp in sp_csv_outputs:
    sp_matrix.to_csv(fp, index=False)

sp_parquet_outputs = [
    (df_SP_weekly_FPS_2017, "../data/olist_prepared/freq_prod_weekly_sale_SP_2017.parquet"),
    (df_SP_weekly_FPS_2018, "../data/olist_prepared/freq_prod_weekly_sale_SP_2018.parquet"),
]
for sp_pivot, fp in sp_parquet_outputs:
    sp_pivot.to_parquet(fp, index=True)

In [None]:
# Check the dimensions of the SP 2017 distance matrix; it is expected to be
# square, with one row and one column per week present in the 2017 pivot.
df_SP_weekly_ED_2017.shape

In [None]:
# Split the RJ orders by year.
df_RJ_2017 = df_RJ.loc[df_RJ["year"].eq(2017)]
df_RJ_2018 = df_RJ.loc[df_RJ["year"].eq(2018)]

In [None]:
# Weekly product-sales pivots for RJ: rows are weeks of the year, columns are
# product ids, cells are the summed price (0 where nothing was sold).
df_RJ_weekly_FPS_2017 = df_RJ_2017.pivot_table(
    index="woy", columns="product_id", values="price", aggfunc="sum", fill_value=0
)
df_RJ_weekly_FPS_2018 = df_RJ_2018.pivot_table(
    index="woy", columns="product_id", values="price", aggfunc="sum", fill_value=0
)

# Week-to-week cosine similarity and Euclidean distance between the weekly
# sales vectors. Both axes are labelled with the pivot's week-of-year index:
# labelling only the columns leaves the rows on a 0-based positional index, so
# label lookups such as `.loc[1, 1]` would compare mismatched weeks.
rj_weeks_2017 = df_RJ_weekly_FPS_2017.index
rj_weeks_2018 = df_RJ_weekly_FPS_2018.index

df_RJ_weekly_CS_2017 = pd.DataFrame(
    cosine_similarity(df_RJ_weekly_FPS_2017), index=rj_weeks_2017, columns=rj_weeks_2017
)
df_RJ_weekly_CS_2018 = pd.DataFrame(
    cosine_similarity(df_RJ_weekly_FPS_2018), index=rj_weeks_2018, columns=rj_weeks_2018
)

df_RJ_weekly_ED_2017 = pd.DataFrame(
    euclidean_distances(df_RJ_weekly_FPS_2017), index=rj_weeks_2017, columns=rj_weeks_2017
)
df_RJ_weekly_ED_2018 = pd.DataFrame(
    euclidean_distances(df_RJ_weekly_FPS_2018), index=rj_weeks_2018, columns=rj_weeks_2018
)

In [None]:
# Persist the RJ artefacts. The week-by-week distance / similarity matrices go
# to CSV without their row index; the weekly product-sales pivots go to
# parquet with the week index kept.
rj_csv_outputs = [
    (df_RJ_weekly_ED_2017, "../data/olist_prepared/RJ_ED_2017.csv"),
    (df_RJ_weekly_ED_2018, "../data/olist_prepared/RJ_ED_2018.csv"),
    (df_RJ_weekly_CS_2017, "../data/olist_prepared/RJ_CS_2017.csv"),
    (df_RJ_weekly_CS_2018, "../data/olist_prepared/RJ_CS_2018.csv"),
]
for rj_matrix, fp in rj_csv_outputs:
    rj_matrix.to_csv(fp, index=False)

rj_parquet_outputs = [
    (df_RJ_weekly_FPS_2017, "../data/olist_prepared/freq_prod_weekly_sale_RJ_2017.parquet"),
    (df_RJ_weekly_FPS_2018, "../data/olist_prepared/freq_prod_weekly_sale_RJ_2018.parquet"),
]
for rj_pivot, fp in rj_parquet_outputs:
    rj_pivot.to_parquet(fp, index=True)

In [None]:
# Split the MG orders by year.
df_MG_2017 = df_MG.loc[df_MG["year"].eq(2017)]
df_MG_2018 = df_MG.loc[df_MG["year"].eq(2018)]

In [None]:
# Weekly product-sales pivots for MG: rows are weeks of the year, columns are
# product ids, cells are the summed price (0 where nothing was sold).
df_MG_weekly_FPS_2017 = df_MG_2017.pivot_table(
    index="woy", columns="product_id", values="price", aggfunc="sum", fill_value=0
)
df_MG_weekly_FPS_2018 = df_MG_2018.pivot_table(
    index="woy", columns="product_id", values="price", aggfunc="sum", fill_value=0
)

# Week-to-week cosine similarity and Euclidean distance between the weekly
# sales vectors. Both axes are labelled with the pivot's week-of-year index:
# labelling only the columns leaves the rows on a 0-based positional index, so
# label lookups such as `.loc[1, 1]` would compare mismatched weeks.
mg_weeks_2017 = df_MG_weekly_FPS_2017.index
mg_weeks_2018 = df_MG_weekly_FPS_2018.index

df_MG_weekly_CS_2017 = pd.DataFrame(
    cosine_similarity(df_MG_weekly_FPS_2017), index=mg_weeks_2017, columns=mg_weeks_2017
)
df_MG_weekly_CS_2018 = pd.DataFrame(
    cosine_similarity(df_MG_weekly_FPS_2018), index=mg_weeks_2018, columns=mg_weeks_2018
)

df_MG_weekly_ED_2017 = pd.DataFrame(
    euclidean_distances(df_MG_weekly_FPS_2017), index=mg_weeks_2017, columns=mg_weeks_2017
)
df_MG_weekly_ED_2018 = pd.DataFrame(
    euclidean_distances(df_MG_weekly_FPS_2018), index=mg_weeks_2018, columns=mg_weeks_2018
)

In [None]:
# Persist the MG artefacts. The week-by-week distance / similarity matrices go
# to CSV without their row index; the weekly product-sales pivots go to
# parquet with the week index kept.
mg_csv_outputs = [
    (df_MG_weekly_ED_2017, "../data/olist_prepared/MG_ED_2017.csv"),
    (df_MG_weekly_ED_2018, "../data/olist_prepared/MG_ED_2018.csv"),
    (df_MG_weekly_CS_2017, "../data/olist_prepared/MG_CS_2017.csv"),
    (df_MG_weekly_CS_2018, "../data/olist_prepared/MG_CS_2018.csv"),
]
for mg_matrix, fp in mg_csv_outputs:
    mg_matrix.to_csv(fp, index=False)

mg_parquet_outputs = [
    (df_MG_weekly_FPS_2017, "../data/olist_prepared/freq_prod_weekly_sale_MG_2017.parquet"),
    (df_MG_weekly_FPS_2018, "../data/olist_prepared/freq_prod_weekly_sale_MG_2018.parquet"),
]
for mg_pivot, fp in mg_parquet_outputs:
    mg_pivot.to_parquet(fp, index=True)