# Data Analysis

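This notebook loads the Amazon dataset (`Amazon_data.jsonl`), draws a 10% sample from each `main_cat` category, replaces empty `brand`, `feature`, and `title` values with placeholders, plots how the records are distributed across categories, and saves the selected columns to a CSV file.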

## Imports

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Analysis

### Data Load

In [None]:
# Load the line-delimited JSON dataset (one record per line).
file_path = "../data/datasets/Amazon_data.jsonl"
df = pd.read_json(file_path, lines=True)

# Stratified 10% sample: draw from every `main_cat` group separately, seeding
# each draw with 42 so the selection is reproducible.
# The groups are iterated explicitly instead of going through
# `groupby(...).apply(...)`: having `DataFrameGroupBy.apply` operate on the
# grouping column is deprecated (pandas >= 2.2), and once the column is no
# longer passed to the function `main_cat` would disappear from the result.
# Seeding each group's draw individually keeps the sampled rows identical to
# what the `apply` version selected.
df = pd.concat(
    [group.sample(frac=0.10, random_state=42) for _, group in df.groupby("main_cat")]
)

# Remove the `image` and `category` columns.
df = df.drop(columns=["image", "category"])
df

In [None]:
# Number of rows whose `brand` value has length zero.
df["brand"].map(len).eq(0).sum()

In [4]:
# Replace zero-length values with an explicit placeholder, column by column.
# `feature` gets a one-element list as its placeholder; the other two get a
# plain string.
df["brand"] = df["brand"].map(
    lambda brand: brand if len(brand) != 0 else "Brand Missing"
)
df["feature"] = df["feature"].map(
    lambda features: features if len(features) != 0 else ["Feature Missing"]
)
df["title"] = df["title"].map(
    lambda title: title if len(title) != 0 else "Title Missing"
)

In [None]:
# Re-count the rows whose `brand` value has length zero.
df["brand"].map(len).eq(0).sum()

In [None]:
# Display the DataFrame in its current state (rendered as the cell's output).
df

### Data Plotting

In [None]:
# Horizontal histogram of the category column: because the data is passed as
# `y=`, the categories run along the y-axis and the counts along the x-axis.
ax = sns.histplot(
    y=df["main_cat"], discrete=True, shrink=0.8, edgecolor="white", linewidth=0.5
)
# Label each axis with what it actually shows (counts on x, categories on y).
ax.set_xlabel("Class Count")
ax.set_ylabel("Category")
ax.set_title("Histogram of Category Values")
plt.show()

### Data Save

In [8]:
# Select the five columns to export, renumber the rows from 0, and write the
# result to CSV without an index column.
filtered_df = df.loc[
    :, ["brand", "description", "feature", "title", "main_cat"]
].reset_index(drop=True)
filtered_df.to_csv("../data/datasets/new_filtered_amazon_data.csv", index=False)