In [8]:
import pandas as pd
import numpy as np
import os

# --- Run configuration -------------------------------------------------------
# Raw string: a Windows path contains backslashes that would otherwise be read
# as (invalid) escape sequences and trigger a warning on compile.
INPUT_CSV = r"D:\MDA\Term III\Case Study III\Assignments\Part II\car-sales.csv"
COUNTRY = "Canada"            # value matched against the "Entity" column
START_DATE = "1995-01-01"     # first day of the generated daily series
END_DATE = "2025-08-13"       # last day of the generated daily series (inclusive)
USE_SEASONALITY = True        # False -> every day gets the same weight
RANDOM_SEED = 42              # fixed seed so repeated runs give the same output

# Single shared generator; every random draw in the pipeline goes through it.
rng = np.random.default_rng(RANDOM_SEED)

def load_canada_annual(path: str, country: str) -> pd.DataFrame:
    """Read the annual sales CSV and return the rows for one country.

    Rows are selected where the "Entity" column equals ``country``. Only the
    year and the two sales columns are kept; the sales columns are renamed to
    ``annual_electric`` and ``annual_nonelectric``. ``Year`` is cast to int and
    non-numeric sales values become NaN. The filtered rows keep their original
    index labels.
    """
    # Source column -> output column (insertion order is the output order).
    column_map = {
        "Year": "Year",
        "Electric cars sold": "annual_electric",
        "Non-electric car sales": "annual_nonelectric",
    }

    raw = pd.read_csv(path)
    is_country = raw["Entity"] == country
    annual = raw.loc[is_country, list(column_map)].rename(columns=column_map)

    annual["Year"] = annual["Year"].astype(int)
    for sales_col in ("annual_electric", "annual_nonelectric"):
        # Unparseable entries are turned into NaN instead of raising.
        annual[sales_col] = pd.to_numeric(annual[sales_col], errors="coerce")
    return annual

def extend_annual_1995_2025(annual_df: pd.DataFrame,
                            start_year: int = 1995,
                            end_year: int = 2025) -> pd.DataFrame:
    """Reindex annual sales to every year in ``start_year..end_year`` and fill gaps.

    Years missing *between* known years are filled by linear interpolation.
    Years before the first known value or after the last one are NOT
    extrapolated along a trend: ``limit_direction="both"`` fills them with the
    nearest known value, so those years are held flat.

    Args:
        annual_df: Frame with columns ``Year``, ``annual_electric`` and
            ``annual_nonelectric`` (one row per year).
        start_year: First year of the output (inclusive). Defaults to 1995.
        end_year: Last year of the output (inclusive). Defaults to 2025.

    Returns:
        Frame with one row per year and columns ``Year``, ``annual_electric``,
        ``annual_nonelectric`` and ``annual_total`` (sum of the two), with
        negative values clipped to 0.

    Raises:
        ValueError: If ``start_year`` is greater than ``end_year``.
    """
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not exceed end_year ({end_year})."
        )

    value_cols = ["annual_electric", "annual_nonelectric"]
    years_full = pd.Index(range(start_year, end_year + 1), name="Year")
    df_full = annual_df.set_index("Year").reindex(years_full)

    # Interior gaps: linear; leading/trailing gaps: nearest known value.
    for col in value_cols:
        df_full[col] = df_full[col].interpolate(method="linear", limit_direction="both")

    # Sales can never be negative.
    df_full[value_cols] = df_full[value_cols].clip(lower=0)

    df_full["annual_total"] = df_full["annual_electric"] + df_full["annual_nonelectric"]
    return df_full.reset_index()

def seasonality_weights(dates: pd.Series) -> np.ndarray:
    """Build one multiplicative weight per date from month and weekday factors.

    Each date's weight is its month factor times its weekday factor; the
    resulting vector is divided by its own mean so that it averages 1.0 and
    carries only the relative shape.
    """
    # Factor for calendar months 1..12, in order.
    monthly_values = [0.92, 0.94, 0.98, 1.03, 1.07, 1.10,
                      1.12, 1.10, 1.04, 1.00, 0.96, 1.02]
    by_month = dict(zip(range(1, 13), monthly_values))

    # pandas weekday numbering: Monday=0 ... Sunday=6. Weekends are lower.
    by_weekday = {day: 1.02 for day in range(5)}
    by_weekday[5] = 0.92
    by_weekday[6] = 0.90

    month_part = dates.dt.month.map(by_month).astype(float)
    weekday_part = dates.dt.weekday.map(by_weekday).astype(float)
    shape = month_part.to_numpy() * weekday_part.to_numpy()

    return shape / shape.mean()

def random_daily_integers_for_year(total: int, weights: np.ndarray, rng: np.random.Generator) -> np.ndarray:
    """Split ``total`` into random non-negative integers, one per weight.

    A single Dirichlet draw (concentration = ``weights``) gives each day's
    share; the shares are scaled by ``total``, floored, and the leftover units
    are handed to the days with the largest fractional parts, so the result
    always sums exactly to ``total``.

    Args:
        total: Non-negative integer to distribute.
        weights: One concentration value per day. Entries <= 0 are replaced
            by 1.0 because the Dirichlet distribution needs positive values.
            The input array is not modified.
        rng: NumPy random generator used for the Dirichlet draw (exactly one
            draw is made when ``total`` > 0, none when ``total`` == 0).

    Returns:
        Integer array of length ``len(weights)`` with non-negative entries
        summing to ``total``.

    Raises:
        ValueError: If ``total`` is negative, or positive with no days to
            allocate to.
        AssertionError: If the final allocation is inconsistent (e.g. the
            weights contained non-finite values).
    """
    days = len(weights)
    if total < 0:
        # A negative total cannot be expressed as non-negative daily counts.
        raise ValueError(f"total must be non-negative, got {total}.")
    if total == 0:
        return np.zeros(days, dtype=int)
    if days == 0:
        raise ValueError("Cannot distribute a positive total over zero days.")

    # np.array copies, so fixing up non-positive entries leaves `weights` intact.
    alpha = np.array(weights, dtype=float)
    alpha[alpha <= 0] = 1.0
    probs = rng.dirichlet(alpha)

    # Initial integer allocation: floor of each day's share.
    raw = probs * total
    daily = np.floor(raw).astype(int)
    residue = int(total - daily.sum())
    if residue > 0:
        # Hand the remaining units to the days with the largest fractional parts.
        frac_order = np.argsort(raw - daily)[-residue:]
        daily[frac_order] += 1
    else:
        # Floors can only overshoot through floating-point error; take single
        # units back from the current largest day, which keeps entries >= 0.
        while residue < 0:
            daily[np.argmax(daily)] -= 1
            residue += 1

    # Explicit check (not `assert`) so it still runs under `python -O`.
    if int(daily.sum()) != total or bool((daily < 0).any()):
        raise AssertionError("Daily allocation does not sum to annual total.")
    return daily

def build_daily_dataset(annual_extended: pd.DataFrame,
                        start_date: str,
                        end_date: str,
                        use_seasonality: bool,
                        rng: np.random.Generator) -> pd.DataFrame:
    """Turn annual electric / non-electric totals into a randomized daily table.

    For every calendar year touched by ``start_date..end_date`` the (rounded)
    annual totals are split into integer daily counts with
    ``random_daily_integers_for_year``, electric first and then non-electric,
    both using the same per-day weights.

    NOTE(review): each year's *full* annual total is spread over only those
    days of the year that fall inside the requested range, so a partial first
    or last year receives the whole annual figure. Confirm this is intended.

    Args:
        annual_extended: Frame with columns ``Year``, ``annual_electric`` and
            ``annual_nonelectric``.
        start_date: First day of the output (inclusive).
        end_date: Last day of the output (inclusive).
        use_seasonality: If True, weight days with ``seasonality_weights``;
            otherwise all days are weighted equally.
        rng: NumPy random generator used for all draws.

    Returns:
        Frame with columns ``Date``, ``Daily_electric``, ``Daily_nonelectric``
        and ``Daily_total``, one row per day. An empty date range yields an
        empty frame with these columns.

    Raises:
        ValueError: If any year in the date range has no (or NaN) annual
            figures in ``annual_extended``.
    """
    value_cols = ["annual_electric", "annual_nonelectric"]
    dates = pd.date_range(start=start_date, end=end_date, freq="D")

    if len(dates) == 0:
        # Nothing to allocate; pd.concat would fail on an empty list below.
        no_counts = np.zeros(0, dtype=int)
        return pd.DataFrame({
            "Date": dates,
            "Daily_electric": no_counts,
            "Daily_nonelectric": no_counts,
            "Daily_total": no_counts,
        })

    daily = pd.DataFrame({"Date": dates})
    daily["Year"] = daily["Date"].dt.year

    # Attach each day's annual figures via its year.
    ann = annual_extended.set_index("Year")[value_cols]
    daily = daily.join(ann, on="Year")

    # Fail early with a readable message instead of "cannot convert float NaN
    # to integer" from the rounding step further down.
    lacking = daily.loc[daily[value_cols].isna().any(axis=1), "Year"].unique()
    if len(lacking) > 0:
        raise ValueError(
            "Missing annual sales figures for year(s): "
            + ", ".join(str(int(y)) for y in sorted(lacking))
        )

    # One randomized integer split per year, for electric and non-electric.
    out_rows = []
    for _, grp in daily.groupby("Year", sort=True):
        if use_seasonality:
            w = seasonality_weights(grp["Date"])
        else:
            w = np.ones(len(grp), dtype=float)

        # The annual value is identical on every row of the group.
        elec_total = int(round(grp["annual_electric"].iloc[0]))
        nonelec_total = int(round(grp["annual_nonelectric"].iloc[0]))

        # Draw order (electric, then non-electric) determines the RNG stream.
        elec_daily = random_daily_integers_for_year(elec_total, w, rng)
        nonelec_daily = random_daily_integers_for_year(nonelec_total, w, rng)

        temp = grp[["Date"]].copy()
        temp["Daily_electric"] = elec_daily
        temp["Daily_nonelectric"] = nonelec_daily
        temp["Daily_total"] = temp["Daily_electric"] + temp["Daily_nonelectric"]
        out_rows.append(temp)

    return pd.concat(out_rows, ignore_index=True)

def main():
    """Run the whole pipeline and write the daily table to disk.

    Loads the annual figures for ``COUNTRY`` from ``INPUT_CSV``, extends them
    to the full year range, builds the randomized daily dataset for
    ``START_DATE..END_DATE`` and saves it as ``daily_car_sales.csv`` in the
    output folder, then prints the path plus the first and last rows.
    """
    # Annual figures for the configured country, then gap-filled per year.
    country_annual = load_canada_annual(INPUT_CSV, COUNTRY)
    extended_annual = extend_annual_1995_2025(country_annual)

    # Daily integer counts drawn with the shared module-level generator.
    daily_df = build_daily_dataset(
        annual_extended=extended_annual,
        start_date=START_DATE,
        end_date=END_DATE,
        use_seasonality=USE_SEASONALITY,
        rng=rng,
    )

    # Make sure the destination folder exists before writing.
    out_dir = r"D:\MDA\Term III\Case Study III\Assignments\Part II"
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "daily_car_sales.csv")

    daily_df.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")
    for preview in (daily_df.head(), daily_df.tail()):
        print(preview)


# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()


  INPUT_CSV = "D:\MDA\Term III\Case Study III\Assignments\Part II\car-sales.csv"


Saved: D:\MDA\Term III\Case Study III\Assignments\Part II\daily_car_sales.csv
        Date  Daily_electric  Daily_nonelectric  Daily_total
0 1995-01-01               2                227          229
1 1995-01-02               3                779          782
2 1995-01-03               0               2733         2733
3 1995-01-04               2                 10           12
4 1995-01-05               0               2975         2975
            Date  Daily_electric  Daily_nonelectric  Daily_total
11178 2025-08-09             126               2263         2389
11179 2025-08-10            1347               9463        10810
11180 2025-08-11             976              14815        15791
11181 2025-08-12            1209               1598         2807
11182 2025-08-13             678               1055         1733
