# Pandas + NumPy Assignment

## Goals
- Practice creating, cleaning, and analyzing data with NumPy arrays
- Move data into pandas DataFrames for analysis and reporting
- Use vectorized operations, groupby, and joins

## Instructions
- Work through each section in order.
- Show outputs for each step.
- Add short comments where the logic is not obvious.


In [None]:
import numpy as np
import pandas as pd


## Part A: Create data with NumPy

In [None]:
# 1) Create a random dataset of 100 rows with these columns:
# sales, units, region_id, product_id, day
N_ROWS = 100
rng = np.random.default_rng(42)  # fixed seed so every run draws the same data

# (low, high) bounds per column; rng.integers excludes the upper bound.
# Columns are drawn in this order, which also fixes their order in `raw`.
bounds = {
    "sales": (100, 1001),
    "units": (1, 21),
    "region_id": (1, 5),
    "product_id": (1, 7),
    "day": (1, 31),
}
drawn = {
    name: rng.integers(low, high, size=N_ROWS)
    for name, (low, high) in bounds.items()
}
sales, units, region_id, product_id, day = drawn.values()

# Stack the 1-D columns side by side into a (rows, columns) array
raw = np.column_stack(list(drawn.values()))
raw[:5]


In [None]:
# 2) Convert the dataset to a pandas DataFrame
cols = ["sales", "units", "region_id", "product_id", "day"]

# copy=True gives the DataFrame its own buffer. Depending on the pandas
# version, a 2-D ndarray may otherwise be wrapped without copying, and
# Part E reads `raw` again as the untouched original data.
df = pd.DataFrame(raw, columns=cols, copy=True)
df.head()


In [None]:
# 3) Add price = sales / units, round to 2 decimals

# Element-wise division of the two columns (vectorized, no Python loop)
unit_price = df["sales"].div(df["units"])
df["price"] = unit_price.round(2)
df.head()


In [None]:
# 4) Add high_value flag

# True where sales is strictly greater than 800
df["high_value"] = df["sales"].gt(800)
df["high_value"].value_counts()


## Part B: Cleaning and validation

In [None]:
# 5) Insert 5 missing values into sales at random positions
# replace=False guarantees 5 distinct row labels
missing_idx = rng.choice(df.index, size=5, replace=False)

# sales holds integers and NaN is a float, so convert the column explicitly
# instead of relying on pandas' implicit upcast during assignment (that
# upcast is deprecated and emits a FutureWarning on recent pandas versions).
df["sales"] = df["sales"].astype("float64")
df.loc[missing_idx, "sales"] = np.nan

# check missing
(df["sales"].isna().sum(), missing_idx)


In [None]:
# 6) Replace missing sales with median
# Series.median skips NaN by default, so this is the median of the known values
median_sales = df["sales"].median()

# Keep existing values; substitute the median only where sales is missing
has_sales = df["sales"].notna()
df["sales"] = df["sales"].where(has_sales, median_sales)

# NOTE(review): price and high_value were derived from sales in steps 3-4 and
# are not recomputed here - confirm that is intended for the imputed rows.
df["sales"].isna().sum()


In [None]:
# 7) Ensure dtypes

# Target type for each column that must have a fixed dtype
wanted_types = {"day": int, "price": float}
for col_name, target_type in wanted_types.items():
    df[col_name] = df[col_name].astype(target_type)

df.dtypes


## Part C: Analysis with pandas

In [None]:
# 8) Total sales by region_id

# Group the sales column by the region_id column and add up each group
df["sales"].groupby(df["region_id"]).sum()


In [None]:
# 9) Average price by product_id (descending)

# Mean price per product, then order from highest to lowest
avg_price = df.groupby("product_id")["price"].mean()
avg_price.sort_values(ascending=False)


In [None]:
# 10) Top 5 rows by sales

# Columns to show in the report, in display order
report_cols = ["region_id", "product_id", "sales", "units", "price"]

# Sort the full frame from highest to lowest sales, then keep the first 5 rows
ranked = df.sort_values(by="sales", ascending=False)
ranked.iloc[:5][report_cols]


In [None]:
# 11) Region summary: total sales, total units, average price

# One output column per named aggregation: (source column, reducer)
summary_spec = {
    "total_sales": pd.NamedAgg(column="sales", aggfunc="sum"),
    "total_units": pd.NamedAgg(column="units", aggfunc="sum"),
    "avg_price": pd.NamedAgg(column="price", aggfunc="mean"),
}
df.groupby("region_id").agg(**summary_spec)


## Part D: Combine datasets

In [None]:
# 12) Region lookup
# Names are listed in id order: the first name gets id 1, the next id 2, ...
region_names = ["North", "South", "East", "West"]
region_lookup = pd.DataFrame({
    "region_id": range(1, len(region_names) + 1),
    "region_name": region_names,
})
region_lookup


In [None]:
# 13) Product lookup
# Product ids 1..6; each name is "P" followed by its id
product_ids = list(range(1, 7))
product_lookup = pd.DataFrame({
    "product_id": product_ids,
    "product_name": [f"P{pid}" for pid in product_ids],
})
product_lookup


In [None]:
# 14) Merge with lookups

# how="left" keeps every sales row even if an id is missing from a lookup
# (the default inner join would drop such rows silently).
# validate="many_to_one" makes pandas raise if a lookup table contains a
# duplicated key, which would otherwise duplicate sales rows.
df_joined = (
    df.merge(region_lookup, on="region_id", how="left", validate="many_to_one")
    .merge(product_lookup, on="product_id", how="left", validate="many_to_one")
)
df_joined.head(10)


## Part E: NumPy practice on arrays

In [None]:
# 15) From the original NumPy array
# Column 0 of raw is sales and column 2 is region_id (see the step 1 stacking order)
sales_col, region_col = raw[:, 0], raw[:, 2]

# Overall mean of the sales column
sales_mean = np.mean(sales_col)

# Mean sales per region: select each region's rows with a boolean mask
region_ids = np.unique(region_col)
mean_by_region = dict(
    zip(region_ids, (np.mean(sales_col[region_col == r]) for r in region_ids))
)

# Position of the largest sale (first occurrence), then the value at that position
max_row_idx = np.argmax(sales_col)
max_sales = sales_col[max_row_idx]

sales_mean, mean_by_region, (max_sales, max_row_idx)


In [None]:
# 16) Boolean indexing

# Column 1 of raw is units and column 0 is sales (see the step 1 stacking order)
enough_units = raw[:, 1] >= 10
large_sale = raw[:, 0] >= 500

# Keep only the rows that satisfy both conditions
filtered = raw[enough_units & large_sale]
filtered[:5]


## Part F: Challenge (optional)

In [None]:
# 17) Pivot table of total sales by region and product

# Rows are region names, columns are product names, cells are summed sales
pivot = pd.pivot_table(
    df_joined,
    values="sales",
    index="region_name",
    columns="product_name",
    aggfunc="sum",
)
pivot


In [None]:
# 18) Bar chart of total sales by region
# If you are using a notebook, this will display the chart.

import matplotlib.pyplot as plt

# Sum sales per region name, then draw one bar per region
region_totals = df_joined.groupby("region_name")["sales"].sum()
ax = region_totals.plot.bar()

# Label the axes object returned by the plot call
ax.set_title("Total Sales by Region")
ax.set_xlabel("Region")
ax.set_ylabel("Sales")

plt.tight_layout()
plt.show()
