In [5]:
import os, json, time
from typing import Dict, Any, Optional
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
# ===== constants =====
# Location of the raw Iris CSV that the loader reads by default.
UCI_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
)

# Numeric measurement columns, in file order.
NUM_COLS = ["sepal_length", "sepal_width", "petal_length", "petal_width"]

# Full column list assigned when reading the CSV: the numeric columns plus the class name.
REQUIRED_COLS = [*NUM_COLS, "species"]

In [35]:
# ===== logging / utils =====
def log(msg: str, initials: str, student_id: str) -> None:
    """Print a timestamped message tagged with the student's initials and ID.

    Args:
        msg: Text to print.
        initials: Student initials shown in the tag.
        student_id: Student ID shown in the tag (main() validates it as 9 digits).

    Output format: ``[HH:MM:SS] [<initials>-<student_id>] <msg>``.
    """
    # Use the arguments rather than hard-coded values so the tag reflects the caller.
    print(f"[{time.strftime('%H:%M:%S')}] [{initials}-{student_id}] {msg}")

def ensure_dir(path: str) -> None:
    """Make sure the directory ``path`` exists.

    Missing parent directories are created as well; an already existing
    directory is left untouched.
    """
    os.makedirs(name=path, exist_ok=True)


TODO:
          - Read CSV from self.url with header=None and names=REQUIRED_COLS.
          - Drop empty rows.
          - Set self.df and return it.

In [39]:
# ===== data loader (implementations required) =====
class IrisLoaderUCI:
    """Loads the Iris CSV from a URL (or local path) into a pandas DataFrame."""

    def __init__(self, url: str = UCI_URL):
        """Remember the data location; nothing is read until load() is called.

        Args:
            url: Location passed to ``pandas.read_csv``. Defaults to ``UCI_URL``.
        """
        self.url = url
        # Populated by load(); stays None until then.
        self.df: Optional[pd.DataFrame] = None

    def load(self) -> pd.DataFrame:
        """Read the CSV at ``self.url``, drop empty rows, and cache the result.

        The file is read without a header row and the columns are named from
        ``REQUIRED_COLS``.

        Returns:
            The loaded DataFrame (also stored on ``self.df``).
        """
        raw = pd.read_csv(
            self.url,  # honour the URL given to the constructor, not the module default
            header=None,
            names=REQUIRED_COLS,
        )

        # dropna() returns a new frame, so the result must be kept.
        # how="all" removes only rows in which every column is missing ("empty rows").
        df = raw.dropna(how="all").reset_index(drop=True)
        self.df = df
        return df

TODO:
          - Compute per-class counts for 'species'.
          - total = number of rows.
          - proportions = counts / total, rounded to 6 decimals.
          - Return dict: {"total": int, "counts": {class: int}, "proportions": {class: float}}.

In [37]:
 def check_class_balance(self) -> Dict[str, Any]:
        """Summarise how many rows belong to each 'species' class.

        NOTE(review): main() calls this as ``loader.check_class_balance()``, so it
        needs to live inside (or be bound to) IrisLoaderUCI rather than in its own cell.

        Returns:
            ``{"total": int, "counts": {class: int}, "proportions": {class: float}}``
            where ``total`` is the number of rows in ``self.df`` and proportions
            are rounded to 6 decimals.

        Raises:
            RuntimeError: If ``self.df`` is None (data not loaded yet).
        """
        if self.df is None:
            raise RuntimeError("No data loaded; call load() before check_class_balance().")

        species = self.df["species"]
        counts = species.value_counts()
        props = species.value_counts(normalize=True).round(6)

        return {
            "total": int(len(self.df)),
            # Cast to builtin types so the dict can be passed straight to json.dump.
            "counts": {name: int(n) for name, n in counts.items()},
            "proportions": {name: float(p) for name, p in props.items()},
        }

TODO:
          - Use the last two digits of `student_id` as the random seed (e.g., student_id = 202312345 -> seed = 45).
          - Deterministically shuffle the DataFrame (use the provided seed).
          - Take the first k rows and save to out_csv without index.

In [41]:
def save_head(self, k: int = 10, out_csv: str = "./outputs/head.csv",
              seed: Optional[int] = None, student_id: Optional[str] = None) -> None:
    """Shuffle ``self.df`` deterministically and write its first ``k`` rows to CSV.

    NOTE(review): main() calls this as ``loader.save_head(...)``, so it needs to
    live inside (or be bound to) IrisLoaderUCI rather than in its own cell.

    Args:
        k: Number of rows to keep after shuffling.
        out_csv: Destination CSV path (written without the index).
        seed: Random seed for the shuffle. Ignored when ``student_id`` is given.
        student_id: Optional ID; when given, its last two digits become the seed
            (e.g. "202312345" -> 45).

    If neither ``seed`` nor ``student_id`` is supplied the seed falls back to 0
    so the shuffle is still reproducible.
    """
    if student_id is not None:
        # Seed is the last two digits of the student ID.
        seed = int(str(student_id)[-2:])
    elif seed is None:
        seed = 0

    shuffled = self.df.sample(frac=1, random_state=seed)
    shuffled.head(k).to_csv(out_csv, index=False)

TODO:
          - Call check_class_balance() and save the resulting dict as JSON to out_json.

In [48]:
def save_class_balance(self, out_json: str = "./outputs/class_balance.json") -> None:
    """Write the class-balance report to ``out_json`` as indented JSON.

    NOTE(review): main() calls this as ``loader.save_class_balance(...)``, so it
    needs to live inside (or be bound to) IrisLoaderUCI rather than in its own cell.

    Args:
        out_json: Destination path for the JSON file.
    """
    report = self.check_class_balance()
    # Write to the path that was passed in (the parameter is `out_json`).
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

In [52]:
# ===== processor (implementations required) =====
class Processor:
    """Holds a private copy of the loaded DataFrame and derives outputs from it."""

    def __init__(self, df: pd.DataFrame):
        """Store a copy of ``df`` so later column additions do not touch the caller's frame."""
        self.df = df.copy()

    def add_numeric_label(self, out_map: str = "./outputs/label_map.json") -> Dict[str, int]:
        """Add an integer ``label`` column derived from ``species`` and save the mapping.

        Class names are sorted alphabetically and numbered 0..K-1.

        Args:
            out_map: Path of the JSON file that receives the name -> integer mapping.

        Returns:
            The mapping dict (also stored on ``self.label_map``).
        """
        species_sorted = sorted(self.df["species"].unique())
        label_map = {name: idx for idx, name in enumerate(species_sorted)}
        self.df["label"] = self.df["species"].map(label_map).astype(int)

        # Persist the mapping at the path the caller asked for.
        with open(out_map, "w", encoding="utf-8") as f:
            json.dump(label_map, f, indent=2)

        self.label_map = label_map
        return label_map
       

TODO:
          - For each col in NUM_COLS, compute:
              * min
              * max
              * mean
              * median
              * std
          - Return dict in the form:
              {
                  "sepal_length": {"min":..., "max":..., "mean":..., "median":..., "std":...},
                  "sepal_width": {...},
                  ...
              }

In [54]:
def stats(self) -> Dict[str, Any]:
    """Compute summary statistics for every column listed in ``NUM_COLS``.

    Returns:
        A dict keyed by column name; each value holds ``min``, ``max``,
        ``mean``, ``median`` and ``std`` (sample standard deviation, ddof=1)
        as plain floats.
    """

    def summarize(values: pd.Series) -> Dict[str, float]:
        # One column's summary, converted to builtin floats.
        return {
            "min": float(values.min()),
            "max": float(values.max()),
            "mean": float(values.mean()),
            "median": float(values.median()),
            "std": float(values.std(ddof=1)),
        }

    return {name: summarize(self.df[name].astype(float)) for name in NUM_COLS}

In [None]:
TODO:
          - Split the DataFrame **in order** (no shuffle).
          - Validation size = round(n * val_ratio), clamp so both sets are >= 1 row.
          - First part = val set, remaining = train set.
          - Save to ./outputs/train.csv and ./outputs/val.csv (no index).
          - Return {"train_size": int, "val_size": int}

In [56]:
def train_val_split(self, val_ratio: float = 0.2, seed: Optional[int] = None,
                    train_csv: str = "./outputs/train.csv",
                    val_csv: str = "./outputs/val.csv") -> Dict[str, Any]:
    """Split ``self.df`` in its current order into validation and train parts and save both.

    The first ``round(n * val_ratio)`` rows (clamped to 1..n-1) form the
    validation set; the remaining rows form the train set. No shuffling is done.

    NOTE(review): main() calls this as ``proc.train_val_split(...)``, so it needs
    to live inside (or be bound to) Processor rather than in its own cell.

    Args:
        val_ratio: Fraction of rows to place in the validation set.
        seed: Accepted because main() passes it; unused since the split is ordered.
        train_csv: Destination CSV for the train rows (written without index).
        val_csv: Destination CSV for the validation rows (written without index).

    Returns:
        ``{"train_size": int, "val_size": int}``

    Raises:
        ValueError: If there are fewer than 2 rows, since both sets need >= 1 row.
    """
    n = len(self.df)
    if n < 2:
        raise ValueError("Need at least 2 rows so that train and val each get >= 1 row.")

    v = int(round(n * val_ratio))
    v = max(1, min(v, n - 1))  # clamp so 1..n-1

    val_df = self.df.iloc[:v].copy()
    train_df = self.df.iloc[v:].copy()
    self.train_df, self.val_df = train_df, val_df

    train_df.to_csv(train_csv, index=False)
    val_df.to_csv(val_csv, index=False)

    return {"train_size": len(train_df), "val_size": len(val_df)}
        

In [None]:
 def plot_hist(self, col: str = "petal_length", out: str = "./outputs/hist_petal_length.png") -> None:
        """Save a 20-bin histogram of column ``col`` to ``out``.

        Args:
            col: Column of ``self.df`` to plot.
            out: Destination image path (saved at dpi=150).
        """
        fig, ax = plt.subplots()
        try:
            ax.hist(self.df[col].dropna(), bins=20)
            ax.set_title(f"Histogram of {col}")
            ax.set_xlabel(col)
            ax.set_ylabel("Count")
            fig.tight_layout()
            fig.savefig(out, dpi=150)
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

    def plot_label_bar(self, out: str = "./outputs/label_bar.png") -> None:
        """Save a bar chart of per-species row counts to ``out``.

        Missing species values are counted too and shown as a "NaN" bar; bars
        are ordered by class name with the NaN bar last.

        Args:
            out: Destination image path (saved at dpi=150).
        """
        counts = self.df["species"].value_counts(dropna=False)
        # Sort by class name; the (is-missing, name) key keeps NaN comparable and last.
        ordered = sorted(counts.items(), key=lambda item: (pd.isna(item[0]), str(item[0])))
        labels = ["NaN" if pd.isna(name) else str(name) for name, _ in ordered]
        heights = [int(n) for _, n in ordered]

        fig, ax = plt.subplots()
        try:
            ax.bar(labels, heights)
            ax.set_title("Species counts")
            ax.set_xlabel("species")
            ax.set_ylabel("Count")
            fig.tight_layout()
            fig.savefig(out, dpi=150)
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

    def plot_scatter(self, x: str = "petal_length", y: str = "petal_width",
                     color_by: str = "species", out: str = "./outputs/scatter_petal.png") -> None:
        """Save a scatter plot of ``x`` vs ``y`` with one series per ``color_by`` group.

        Rows whose ``color_by`` value is missing form their own "NaN" group.

        Args:
            x: Column plotted on the horizontal axis.
            y: Column plotted on the vertical axis.
            color_by: Column whose values define the groups and legend entries.
            out: Destination image path (saved at dpi=150).

        Preconditions:
          - Columns x, y, color_by exist in self.df.
        """
        fig, ax = plt.subplots()
        try:
            # dropna=False keeps rows with a missing group key as a separate group.
            for name, group in self.df.groupby(color_by, dropna=False):
                label = "NaN" if pd.isna(name) else str(name)
                ax.scatter(group[x], group[y], label=label)
            ax.set_title(f"{y} vs {x} by {color_by}")
            ax.set_xlabel(x)
            ax.set_ylabel(y)
            ax.legend(title=color_by)
            fig.tight_layout()
            fig.savefig(out, dpi=150)
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

In [None]:
 def save_processed(self, out_csv: str = "./outputs/processed.csv") -> None:
        """Write the full ``self.df`` to ``out_csv`` without the index column.

        Args:
            out_csv: Destination CSV path.
        """
        self.df.to_csv(out_csv, index=False)


In [None]:
# ===== main driver (edit initials/ID; you may tweak seed/val_ratio) =====
def main(initials: str, student_id: str, seed: int = 0, val_ratio: float = 0.2, uci_url: str = UCI_URL):
    """Run the whole pipeline: load, report class balance, label, split, stats, plots, export.

    Args:
        initials: Student initials used in log tags.
        student_id: Student ID; must be a string of exactly 9 digits.
        seed: Forwarded to ``Processor.train_val_split``.
        val_ratio: Fraction of rows used for the validation set.
        uci_url: Location of the Iris CSV.

    Raises:
        ValueError: If ``student_id`` is not a 9-digit string.

    All artefacts are written under ``./outputs``.
    """
    # Validate ID format (exactly 9 digits)
    if not (isinstance(student_id, str) and student_id.isdigit() and len(student_id) == 9):
        raise ValueError("student_id must be a string of exactly 9 digits, e.g., '202312345'.")

    ensure_dir("./outputs")  # must be called exactly once
    log(f"Reading UCI CSV: {uci_url}", initials, student_id)

    loader = IrisLoaderUCI(url=uci_url)
    df = loader.load()
    log(f"Loaded shape: {df.shape}", initials, student_id)

    balance = loader.check_class_balance()
    loader.save_class_balance("./outputs/class_balance.json")
    log(f"Class balance saved: counts={balance.get('counts')}, proportions={balance.get('proportions')}",
        initials, student_id)

    # The shuffle seed for head.csv is the last two digits of the student ID
    # (e.g. "202312345" -> 45), not a fixed constant.
    head_seed = int(student_id[-2:])
    loader.save_head(10, "./outputs/head.csv", seed=head_seed)
    log("Saved head.csv", initials, student_id)

    proc = Processor(df)
    proc.add_numeric_label("./outputs/label_map.json")
    log("Added numeric label & saved label_map.json", initials, student_id)

    split_info = proc.train_val_split(val_ratio=val_ratio, seed=seed)
    log(f"Split: {split_info}", initials, student_id)

    stats = proc.stats()
    with open("./outputs/stats.json", "w") as f:
        json.dump(stats, f, indent=2)
    log("Saved stats.json", initials, student_id)

    proc.plot_hist("petal_length", "./outputs/hist_petal_length.png")
    proc.plot_label_bar("./outputs/label_bar.png")
    proc.plot_scatter("petal_length", "petal_width", out="./outputs/scatter_petal.png")
    log("Saved plots", initials, student_id)

    proc.save_processed("./outputs/processed.csv")
    log("Saved processed.csv", initials, student_id)


if __name__ == "__main__":
    # NOTE(review): `self` does not exist at module scope, so the train/val CSV
    # export cannot run here; per the split TODO it belongs inside
    # Processor.train_val_split, which owns train_df / val_df.
    main(initials="SPP", student_id="907394064")

