# Exploratory Data Analysis on Superstore Dataset
This project is part of my data analysis portfolio.  
The goal is to explore sales and profitability trends in the Superstore dataset using Python, pandas, Matplotlib, and Seaborn.

## Project Steps
1. Data Loading
2. Data Overview
3. Data Cleaning (if needed)
4. Exploratory Data Analysis (EDA)
5. Visualization
6. Insights and Conclusion

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Pandas display: never truncate the column list, and render floats with
# thousands separators and two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = "{:,.2f}".format

# Seaborn theme shared by the plots in this notebook.
sns.set(style="whitegrid", palette="pastel")

# Input CSV and output folder for saved figures (relative to the notebook).
DATA_PATH = "../data/Superstore.csv"  # adjust filename if needed
IMAGES_DIR = "../images"
os.makedirs(IMAGES_DIR, exist_ok=True)  # no error if the folder already exists

print("Setup ready ✅")

In [None]:
# Read the CSV into a DataFrame. latin1 maps every byte to a character, so
# special characters in the file cannot trigger a decode error.
df = pd.read_csv(filepath_or_buffer=DATA_PATH, encoding="latin1")

# Preview the first five rows (the notebook renders this last expression).
df.head(n=5)


In [None]:
# Basic info: dataset dimensions and the dtype pandas assigned to each column.
print("Shape (rows, columns):", df.shape)

# Print the heading and the Series separately: passing both to a single
# print() inserts the default " " separator before the Series, which
# shifts its first row one column to the right.
print("\nColumn types:")
print(df.dtypes)

# Missing values: number of NaN/None entries in each column.
print("\nMissing values per column:")
print(df.isna().sum())

# Duplicates: rows that are exact copies of an earlier row.
print("\nDuplicate rows:", df.duplicated().sum())

## Exploratory Data Analysis (EDA)
We will explore numerical and categorical variables, distributions, correlations, and key patterns in the dataset.

# Names of all numeric columns, then their summary statistics
# (count, mean, std, min, quartiles, max).
numeric_frame = df.select_dtypes(include=[np.number])
num_cols = list(numeric_frame.columns)
df[num_cols].describe()

In [None]:
# Select categorical columns (text and pandas "category" dtypes).
cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()

# Select numerical columns (used by the histogram, boxplot and correlation
# cells further down).
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Keep describe() as the final statement: a notebook cell only renders the
# value of its last expression, so placing an assignment after it would
# silently discard this summary (count, unique, top, freq per column).
df[cat_cols].describe()


In [None]:
# One histogram per numerical column, with a KDE curve overlaid.
for num_col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 5))  # larger figure
    sns.histplot(df[num_col], bins=30, kde=True, color="#4C72B0", ax=ax)  # custom colour
    ax.set_title(f'Distribution of {num_col}', fontsize=16)
    ax.set_xlabel(num_col, fontsize=12)
    ax.set_ylabel("Count", fontsize=12)
    plt.xticks(rotation=45)  # tilt tick labels so long numbers do not overlap
    fig.tight_layout()
    plt.show()


In [None]:
# One horizontal boxplot per numerical column to expose spread and outliers.
for num_col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 3))
    sns.boxplot(x=df[num_col], color="#55A868", ax=ax)
    ax.set_title(f'Boxplot of {num_col}', fontsize=16)
    ax.set_xlabel(num_col, fontsize=12)
    fig.tight_layout()
    plt.show()


In [None]:
# Horizontal bar chart of the 10 most frequent values of each categorical
# column, with the count written next to every bar.
for col in cat_cols:
    # value_counts() ignores missing values and sorts by frequency, descending.
    top_values = df[col].value_counts().head(10)
    if top_values.empty:
        # Column holds no non-missing values: nothing to plot.
        continue

    plt.figure(figsize=(10, 5))
    sns.barplot(x=top_values.values, y=top_values.index, palette="pastel")
    plt.title(f'Top 10 Categories in {col}', fontsize=16)
    plt.xlabel("Count", fontsize=12)
    plt.ylabel(col, fontsize=12)

    # Show the value at the end of each bar. The gap is proportional to the
    # longest bar so it looks the same whether counts are in the tens or in
    # the thousands (a fixed data-unit offset does not scale).
    label_offset = 0.01 * top_values.max()
    for i, v in enumerate(top_values.values):
        plt.text(v + label_offset, i, str(v), color='black', va='center', fontsize=10)

    plt.tight_layout()
    plt.show()



In [None]:
# Correlation heatmap of the numerical columns; a correlation needs at
# least two of them, otherwise the cell does nothing.
if len(num_cols) >= 2:
    corr_matrix = df[num_cols].corr()
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", ax=ax)
    ax.set_title("Correlation Heatmap (Numerical Features)")
    fig.tight_layout()
    # Write the PNG before show() so the saved file contains the drawn figure.
    fig.savefig(f"{IMAGES_DIR}/correlation_heatmap.png")
    plt.show()


## Insights & Next Steps
- Summarize the main findings here (top products, most profitable regions, patterns in sales)
- Optionally, create a mini-dashboard using Streamlit
- Consider adding predictive models for future sales or profit
