In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as ss
import pandas as pd
import numpy as np
import datetime


def _read_jsonl(path):
    """Load a JSON Lines file (one JSON record per line) into a DataFrame."""
    return pd.read_json(path, lines=True)


# Fact table: one row per session event.
sessions_df = _read_jsonl("data/sessions.jsonl")

# Dimension tables joined onto the session events later in the notebook.
deliveries_df = _read_jsonl("data/deliveries.jsonl")
products_df = _read_jsonl("data/products.jsonl")
users_df = _read_jsonl("data/users.jsonl")

In [None]:
# --- Notebook configuration (commented-out entries are currently unused) ---
#MAKE_PLOTS = True
#MAKE_PAIRPLOT = True
# strptime pattern handed to pd.to_datetime when parsing the delivery timestamps.
DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
PRICE_MAX = 100_000    # upper price bound: rows above it are left out of the price histogram as outliers
#WEIGHT_TRESHOLD = 50        # for outliers
#NUM_OF_HOURS = 12
#SEED = 42
# Warning verbosity, read by the warnings-filter cell below:
#   SHOW_ONLY_ONE_WARNING=True  -> the "once" filter is installed (checked first)
#   SHOW_ALL_WARNINGS=True      -> no filter is installed, Python's defaults apply
#   both False                  -> every warning is ignored
SHOW_ALL_WARNINGS = False
SHOW_ONLY_ONE_WARNING = False

In [None]:
import warnings

# Resolve the two config flags into a single filter action first, then install
# it. SHOW_ONLY_ONE_WARNING wins over SHOW_ALL_WARNINGS; when neither flag is
# set every warning is silenced.
if SHOW_ONLY_ONE_WARNING:
    warning_action = 'once'
elif SHOW_ALL_WARNINGS:
    warning_action = None  # leave Python's default warning filters untouched
else:
    warning_action = 'ignore'

if warning_action is not None:
    warnings.filterwarnings(warning_action)

In [None]:
# Preview the raw sessions table (the notebook's fact table) as loaded from disk.
sessions_df

## Adding a column with the delivery time

In [None]:
# 1. Parse both timestamp columns into datetime values.
for timestamp_column in ("purchase_timestamp", "delivery_timestamp"):
    deliveries_df[timestamp_column] = pd.to_datetime(deliveries_df[timestamp_column], format=DATE_FORMAT)

# 2. Add column time_of_delivery (a timedelta: delivery minus purchase).
deliveries_df["time_of_delivery"] = deliveries_df["delivery_timestamp"] - deliveries_df["purchase_timestamp"]

# 3. Drop rows without a delivery time (either timestamp missing).
#    .copy() makes the filtered frame independent, so the assignment in step 4
#    does not write into a slice of the original (SettingWithCopyWarning).
deliveries_df = deliveries_df[deliveries_df["time_of_delivery"].notna()].copy()

# 4. Express time_of_delivery in seconds (float), using the vectorised
#    timedelta accessor instead of a per-row Python call.
deliveries_df["time_of_delivery"] = deliveries_df["time_of_delivery"].dt.total_seconds()


## Join tables

In [None]:
# Keep only the BUY_PRODUCT session events; these are the rows that get joined.
sessions_df = sessions_df.loc[sessions_df["event_type"] == "BUY_PRODUCT"]

# Attach product attributes, then delivery data, then user data.
# The join with deliveries is a RIGHT join: every row of deliveries_df is kept,
# even if no matching purchase event exists in sessions_df.
df = (
    sessions_df
    .merge(products_df, on="product_id", how="left")
    .merge(deliveries_df, on="purchase_id", how="right")
    .merge(users_df, on="user_id", how="left")
)

# Export as a regular comma-separated file. The second positional parameter of
# DataFrame.to_csv is `sep`, not a file mode, so passing 'w' there wrote the
# file with the letter "w" as the delimiter; write mode is already the default.
df.to_csv('rrr.csv')
df

## Add a binary (0/1) column indicating whether the purchase was returned

In [None]:
# Re-read the raw sessions file: `sessions_df` was narrowed to BUY_PRODUCT
# events in the join cell, so the RETURN_PRODUCT events are no longer in it.
sessions_df_2 = (
    pd.read_json("data/sessions.jsonl", lines=True)
    .loc[lambda sessions: sessions["event_type"] == "RETURN_PRODUCT"]
    .drop(columns=["session_id", "timestamp", "user_id", "product_id", "offered_discount"])
    .rename(columns={'event_type': 'return_label'})
    # One row per purchase is enough for a binary label; this also keeps the
    # left join below from duplicating a purchase that has several return events.
    .drop_duplicates(subset="purchase_id")
)

# Dropping a previous `return_label` first makes the cell safe to re-run
# (otherwise the merge would produce return_label_x / return_label_y).
df = df.drop(columns="return_label", errors="ignore").merge(sessions_df_2, on="purchase_id", how="left")

# After the left join the label is "RETURN_PRODUCT" for returned purchases and
# NaN otherwise. Encode that directly as an integer 1/0 column instead of
# chaining `replace` calls, which relies on pandas silently downcasting an
# object column.
df['return_label'] = df['return_label'].notna().astype(int)

df.to_json('rrr.json')
df

In [None]:
# Histogram of prices restricted to the range (0, PRICE_MAX]; non-positive and
# very large prices are left out of the plot.
price_in_range = (df["price"] > 0) & (df["price"] <= PRICE_MAX)
to_show_price_df = df[price_in_range]
to_show_price_df.hist(column="price")

In [None]:
# Histogram of product weights, leaving out items heavier than 50 kg.
weight_in_range = df["weight_kg"] <= 50
to_show_weight_df = df[weight_in_range]
to_show_weight_df.hist(column="weight_kg")

In [None]:
# Distribution of delivery times; time_of_delivery was converted to seconds above.
df.hist(column="time_of_delivery")

## Correlation and mutual information analysis

In [None]:
def update_list_of_columns(frame=None, banned_list_of_columns=()):
    """Return the column names of a frame, minus an optional ban list.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose columns are listed. When omitted, the notebook-level
        ``df`` is used, which is what a zero-argument call has always done.
    banned_list_of_columns : iterable of str, optional
        Column names to leave out. Empty by default, so every column is kept.

    Returns
    -------
    list
        The remaining column names, in their original order.
    """
    if frame is None:
        frame = df  # fall back to the merged notebook frame
    banned = set(banned_list_of_columns)
    return [col for col in frame.columns.tolist() if col not in banned]

# Columns used by the correlation heatmap in the next cell.
columns_list = update_list_of_columns()
columns_list

In [None]:
# Pearson correlation is only defined for numeric data. The merged frame also
# holds string and datetime columns, and pandas >= 2.0 no longer drops those
# silently inside DataFrame.corr (it raises instead), so select the numeric and
# boolean columns explicitly. This form works on older pandas versions as well.
# Use method="spearman" below for a rank correlation instead.
numeric_df = df[columns_list].select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(method="pearson"), square=True);

In [None]:
# Inspect the merged frame before it is split into features and target below.
df

In [None]:

# Split the merged frame into a feature matrix X and the target y.
# Work on a copy so that popping the label leaves `df` itself untouched.
X = df.copy()
y = X.pop("return_label")

# TODO: categorical (object) columns are not label-encoded yet, so only the
# columns below are previewed here and scored in the next cell.
# Each column is listed once; the earlier list named "time_of_delivery" twice,
# which produced a duplicated column in the preview.
X[["price", "time_of_delivery", "session_id", "user_id", "product_id", "offered_discount", "weight_kg"]]

In [None]:
from sklearn.feature_selection import mutual_info_classif


def make_mi_scores(X, y, discrete_features="auto", random_state=None):
    """Compute mutual-information scores between each feature and the target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the index of the result.
    y : array-like
        Class labels, one per row of ``X``.
    discrete_features : 'auto', bool or array-like, default 'auto'
        Forwarded to ``mutual_info_classif``. Previously this argument was
        accepted but silently ignored, so the estimator always used 'auto'.
    random_state : int or None, default None
        Forwarded to ``mutual_info_classif``; pass an int for repeatable scores.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature name, sorted descending.
    """
    mi_scores = mutual_info_classif(
        X, y, discrete_features=discrete_features, random_state=random_state
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

# Features scored against the return label. Each column appears once: the
# earlier list named "time_of_delivery" twice, so it was scored twice.
mi_feature_columns = [
    "price", "time_of_delivery", "session_id", "user_id", "product_id",
    "offered_discount", "purchase_id", "weight_kg",
]

# NOTE(review): mutual_info_classif rejects NaN values; this assumes the
# selected columns have no missing entries after the joins - confirm.
# Going through make_mi_scores yields a Series labelled by feature name and
# sorted by score instead of a bare, unlabelled array.
mi_scores = make_mi_scores(X[mi_feature_columns], y.astype(int), discrete_features="auto")
mi_scores