In [None]:
# 1. Import libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# 2. Load data
# Read the raw sales records; the path is relative to the working directory.
df = pd.read_csv("data/sales_data.csv")

# 3. Basic Info
print(df.shape)
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# 4. Missing values
# Number of null entries in each column.
print(df.isnull().sum())

# 5. Date conversion
# Parse OrderDate into pandas datetimes so the .dt accessor can be used below.
order_dates = pd.to_datetime(df['OrderDate'])
df['OrderDate'] = order_dates

# 6. Feature engineering
# Add one column per calendar component of the order date.
for column, part in (('Year', 'year'), ('Month', 'month'),
                     ('Day', 'day'), ('Quarter', 'quarter')):
    df[column] = getattr(order_dates.dt, part)

# 7. Duplicates
# Keep the first occurrence of each fully identical row and discard the rest.
df = df.drop_duplicates()

# 8. Top-selling products
# Total revenue per product, keeping the ten largest totals in descending order.
top_products = df.groupby('Product')['Revenue'].sum().sort_values(ascending=False).head(10)
print(top_products)

plt.figure(figsize=(10,5))
top_products.plot(kind='bar')
plt.title("Top 10 Products by Revenue")
plt.ylabel("Revenue")
# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
top_products_path = "plots/top_products.png"
os.makedirs(os.path.dirname(top_products_path), exist_ok=True)
plt.savefig(top_products_path)
plt.show()

# 9. Revenue by category
# Total revenue per category, largest first.
cat_rev = df.groupby('Category')['Revenue'].sum().sort_values(ascending=False)

plt.figure(figsize=(10,5))
cat_rev.plot(kind='bar', color='steelblue')
plt.title("Revenue by Category")
plt.ylabel("Revenue")
# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
category_revenue_path = "plots/category_revenue.png"
os.makedirs(os.path.dirname(category_revenue_path), exist_ok=True)
plt.savefig(category_revenue_path)
plt.show()

# 10. Revenue over time
# Group by calendar month *including the year* (2023-01, 2023-02, ...) so the
# same month of different years is not summed into one point; grouping on the
# bare month number (1-12) would collapse multi-year data and hide the trend.
# Relies on OrderDate already being a datetime column (date conversion step
# earlier in this script).
monthly_revenue = df.groupby(df['OrderDate'].dt.to_period('M'))['Revenue'].sum()

plt.figure(figsize=(12,5))
monthly_revenue.plot(kind='line')
plt.title("Monthly Revenue Trend")
# The grouped index is named after OrderDate; label the axis explicitly.
plt.xlabel("Month")
plt.ylabel("Revenue")
# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
monthly_revenue_path = "plots/monthly_revenue.png"
os.makedirs(os.path.dirname(monthly_revenue_path), exist_ok=True)
plt.savefig(monthly_revenue_path)
plt.show()

# 11. Region-wise profit
# Total profit per region (groupby output is ordered by region label).
reg_profit = df.groupby('Region')['Profit'].sum()

plt.figure(figsize=(10,5))
reg_profit.plot(kind='bar', color='green')
plt.title("Profit by Region")
plt.ylabel("Profit")
# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
region_profit_path = "plots/region_profit.png"
os.makedirs(os.path.dirname(region_profit_path), exist_ok=True)
plt.savefig(region_profit_path)
plt.show()

# 12. Correlation heatmap
# Correlation is only defined for numeric data. Select the numeric columns
# explicitly: DataFrame.corr() in pandas >= 2.0 raises on text columns
# instead of silently skipping them, and select_dtypes works on older
# versions as well.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10,6))
sns.heatmap(numeric_corr, annot=True, cmap='Blues')
plt.title("Correlation Heatmap")
# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
correlation_heatmap_path = "plots/correlation_heatmap.png"
os.makedirs(os.path.dirname(correlation_heatmap_path), exist_ok=True)
plt.savefig(correlation_heatmap_path)
plt.show()