In [None]:
# Exploratory Data Analysis for Tide Dynamic Pricing

# This notebook contains exploratory data analysis (EDA) for the dynamic pricing model of Tide at GlobalMart.
# The goal is to understand the data, identify patterns, and derive insights that can inform the pricing strategy.

# --- Import necessary libraries ---
import getpass
import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from azure.storage.blob import BlobServiceClient

# Set visualization style
sns.set(style='whitegrid')

# --- Load raw data files directly from Azure Blob Storage ---
# Connection settings for the storage account that holds the raw CSV files.
account_name = "globalmartmlsa"
# The account key is a secret and must never be pasted into the notebook source.
# It is read from the AZURE_STORAGE_KEY environment variable; when that is unset
# or empty, the user is prompted for it (input is hidden and not stored in the file).
account_key = os.environ.get("AZURE_STORAGE_KEY") or getpass.getpass(
    "Azure Storage account key: "
)
container_name = "source"

# Service-level client for the account, then a client scoped to the container
# that load_csv_from_blob() reads from.
blob_service_client = BlobServiceClient(
    f"https://{account_name}.blob.core.windows.net",
    credential=account_key
)
container_client = blob_service_client.get_container_client(container_name)

def load_csv_from_blob(blob_name, **read_csv_kwargs):
    """Download a CSV blob and parse it into a pandas DataFrame.

    The blob is looked up through the module-level ``container_client``, so
    that client must be created before this function is called. The whole
    blob is read into memory before parsing.

    Parameters
    ----------
    blob_name : str
        Name of the blob inside the container.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pandas.read_csv``
        (for example ``parse_dates=['Date']`` or ``usecols=[...]``). When none
        are given, the file is parsed with pandas' defaults.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the blob.
    """
    blob_client = container_client.get_blob_client(blob_name)
    payload = blob_client.download_blob().readall()
    return pd.read_csv(io.BytesIO(payload), **read_csv_kwargs)

# Fetch the four raw inputs from the container, one DataFrame per CSV blob.
sales_data = load_csv_from_blob("sales_data_dictionary.csv")
inventory_data = load_csv_from_blob("inventory_data_dictionary.csv")
customer_behavior_data = load_csv_from_blob("customer_behavior_data_dictionary.csv")
competitor_data = load_csv_from_blob("competitor_data_dictionary.csv")

# --- Display the first few rows of each dataset ---
# Each preview is a printed heading followed by the rich rendering of .head().
for heading, preview_source in (
    ("Sales Data:", sales_data),
    ("Inventory Data:", inventory_data),
    ("Customer Behavior Data:", customer_behavior_data),
    ("Competitor Data:", competitor_data),
):
    print(heading)
    display(preview_source.head())

# --- Summary statistics for sales data ---
# A bare expression is rendered only when it is the last statement of a cell.
# More statements follow here, so the table must be passed to display()
# explicitly or it is computed and silently discarded.
display(sales_data.describe(include='all'))

# --- Check for missing values in all datasets ---
# For each raw dataset, print the number of null entries per column.
print("Sales Data missing values:")
print(sales_data.isnull().sum())
print("\nInventory Data missing values:")
print(inventory_data.isnull().sum())
print("\nCustomer Behavior Data missing values:")
print(customer_behavior_data.isnull().sum())
print("\nCompetitor Data missing values:")
print(competitor_data.isnull().sum())

# --- Visualize the distribution of SellingPrice in sales data ---
# Histogram (30 bins) with a KDE overlay, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(sales_data['SellingPrice'], bins=30, kde=True, ax=ax)
ax.set_title('Selling Price Distribution of Tide Products')
ax.set_xlabel('Selling Price')
ax.set_ylabel('Frequency')
plt.show()

# --- Correlation heatmap for numeric columns in sales data ---
# Pairwise correlations are computed over the numeric columns only.
fig, ax = plt.subplots(figsize=(8, 6))
numeric_sales = sales_data.select_dtypes(include='number')
corr = numeric_sales.corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap - Sales Data')
plt.show()

# --- Merge all datasets for joint analysis (if needed) ---
def _standardize_date_column(frame, label):
    """Return `frame` with its date column named exactly 'Date'.

    A column already called 'Date' is left untouched. Otherwise the first
    column whose name equals 'date' (ignoring case and surrounding whitespace)
    is renamed. The input frame is never modified in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Dataset to normalize.
    label : str
        Human-readable dataset name used in the error message.

    Raises
    ------
    KeyError
        If no column matches, so the failure names the offending dataset
        instead of surfacing later as a bare KeyError inside merge().
    """
    if 'Date' in frame.columns:
        return frame
    candidates = [col for col in frame.columns if str(col).strip().lower() == 'date']
    if not candidates:
        raise KeyError(
            f"{label}: no 'Date' column found (columns: {list(frame.columns)})"
        )
    return frame.rename(columns={candidates[0]: 'Date'})


# Standardize date columns for merging.
# The names are rebound rather than mutated in place, so re-running this step is
# idempotent and later code still sees a 'Date' column on every dataset.
sales_data = _standardize_date_column(sales_data, 'sales_data')
competitor_data = _standardize_date_column(competitor_data, 'competitor_data')
customer_behavior_data = _standardize_date_column(
    customer_behavior_data, 'customer_behavior_data'
)
inventory_data = _standardize_date_column(inventory_data, 'inventory_data')

# Merge on 'Date': left-join every other dataset onto the sales rows.
# Each join uses its own right-hand suffix and an empty left-hand suffix, so
# sales columns keep their original names (e.g. 'SellingPrice') and overlapping
# columns stay traceable to their source. With the default '_x'/'_y' suffixes, a
# column present in all four datasets yields duplicate names on the third merge,
# which newer pandas versions reject with a MergeError.
final_df = sales_data
for right_suffix, right_frame in (
    ('_competitor', competitor_data),
    ('_customer', customer_behavior_data),
    ('_inventory', inventory_data),
):
    final_df = final_df.merge(
        right_frame, on='Date', how='left', suffixes=('', right_suffix)
    )

# A left join preserves the sales row count only when 'Date' is unique in each
# joined dataset. Surface row multiplication instead of letting it silently
# skew the statistics and plots computed from final_df.
if len(final_df) > len(sales_data):
    print(
        f"Warning: merge expanded sales_data from {len(sales_data)} to "
        f"{len(final_df)} rows; 'Date' is not unique in at least one joined dataset."
    )

# Display merged DataFrame
print("Merged DataFrame:")
display(final_df.head())

# --- Summary statistics for merged data ---
# Passed to display() explicitly: as a bare expression followed by further
# statements in the same cell, the table would never be rendered.
display(final_df.describe(include='all'))

# --- Check for missing values in merged data ---
# Per-column null counts. Columns brought in by the left joins are null for
# every sales row that had no matching 'Date' in the joined dataset.
print("Merged DataFrame missing values:")
print(final_df.isnull().sum())

# --- Visualize the distribution of SellingPrice in merged data ---
# Drawn only when the merged frame actually has a 'SellingPrice' column;
# null prices are dropped before plotting.
if 'SellingPrice' in final_df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    merged_prices = final_df['SellingPrice'].dropna()
    sns.histplot(merged_prices, bins=30, kde=True, ax=ax)
    ax.set_title('Selling Price Distribution (Merged Data)')
    ax.set_xlabel('Selling Price')
    ax.set_ylabel('Frequency')
    plt.show()

# --- Correlation heatmap for merged data ---
# Pairwise correlations over the numeric columns of the merged frame.
fig, ax = plt.subplots(figsize=(12, 8))
numeric_merged = final_df.select_dtypes(include='number')
corr_matrix = numeric_merged.corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap (Merged Data)')
plt.show()

# --- Insights and Next Steps ---
# Based on the exploratory data analysis of the raw data files, we can derive insights that will help in refining our dynamic pricing model.
# The next steps will involve feature engineering and model training.