# Step 1: Fetch npm package download data  
We selected a mix of **widely used libraries** (like React, Lodash, Webpack) and **ecosystem-governance tools** (like `license-kit` and `npm-license-corrections`).  
- The widely used libraries help us understand how adoption shifts in response to ecosystem-level events (e.g., a license change).  
- The governance tools serve as proxies for when maintainers and organizations care more about licensing hygiene and compliance.  
We fetch daily download counts from the npm registry and then aggregate by **month** to smooth out daily noise.  
We also mark each month as **pre** or **post** a key “event date” (e.g., when license governance or policy changes happened) for later analysis.

In [None]:
# ---
# Step 0: Setup Environment (Enhanced)
# Fetches npm download data for selected packages
# Adds multi-month pre/post windows and derived metrics for richer storytelling
# ---

import os
import pandas as pd
import requests
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
# Global plotting defaults: seaborn "whitegrid" theme and a wide default figure.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Colab detection: COLAB is True only when the google.colab package imports.
try:
    from google.colab import files
except ImportError:
    # Only a missing/unimportable package means "not on Colab". A bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    COLAB = False
else:
    COLAB = True
print("COLAB:", COLAB)

# ---
# Step 1: Robust fetch function
# Retry logic ensures we can recover from transient API failures
# ---
def fetch_npm_bulk(package, start, end, max_attempts=3, timeout=90, chunk_days=365):
    """Fetch daily npm download counts for one package and sum them per month.

    npm's download-counts documentation limits a single range query to at
    most 18 months of data, so a multi-year ``start:end`` span is split into
    consecutive, non-overlapping windows of ``chunk_days`` days. The daily
    rows of all windows are pooled before the monthly aggregation, so a month
    that straddles two windows is still summed exactly once.

    Args:
        package: npm package name (scoped names such as ``@scope/name`` are
            interpolated into the URL path as-is).
        start: First day of the range, ``YYYY-MM-DD``.
        end: Last day of the range (inclusive), ``YYYY-MM-DD``.
        max_attempts: Number of tries per window before giving up.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.
        chunk_days: Maximum number of days requested per API call.

    Returns:
        DataFrame with columns ``month`` (``YYYY-MM`` string), ``downloads``
        (monthly sum) and ``month_dt`` (datetime parsed from ``month``),
        sorted by month. The same columns are present when the frame is empty.

    Raises:
        ValueError: If ``chunk_days`` is smaller than 1, or ``start``/``end``
            cannot be parsed as dates.
        RuntimeError: If any window still fails after ``max_attempts`` tries;
            the last underlying error is chained as ``__cause__``.
    """
    import time  # local import so this block does not depend on file-level changes

    if chunk_days < 1:
        raise ValueError("chunk_days must be >= 1")

    first_day = pd.Timestamp(start)
    last_day = pd.Timestamp(end)
    span = pd.Timedelta(days=chunk_days - 1)  # API ranges are inclusive on both ends
    one_day = pd.Timedelta(days=1)

    # Always emit at least one window, so an inverted range is still sent to
    # the API and reported through the normal failure path.
    windows = []
    cursor = first_day
    while True:
        window_end = min(cursor + span, last_day)
        windows.append((cursor.strftime('%Y-%m-%d'), window_end.strftime('%Y-%m-%d')))
        if window_end >= last_day:
            break
        cursor = window_end + one_day

    rows = []
    for win_start, win_end in windows:
        url = f"https://api.npmjs.org/downloads/range/{win_start}:{win_end}/{package}"
        last_error = None
        for attempt in range(max_attempts):
            try:
                r = requests.get(url, timeout=timeout)
                r.raise_for_status()
                data = r.json()
                window_rows = [
                    {'month': rec['day'][:7], 'downloads': rec['downloads']}
                    for rec in data.get('downloads', [])
                ]
            except (requests.RequestException, ValueError, KeyError,
                    TypeError, AttributeError) as e:
                # Network/HTTP errors, undecodable JSON, or a malformed payload.
                print(f"Attempt {attempt+1} failed for {package}: {e}")
                last_error = e
                if attempt + 1 < max_attempts:
                    # Short exponential backoff; an immediate retry rarely
                    # recovers from a transient failure.
                    time.sleep(min(2 ** attempt, 10))
            else:
                rows.extend(window_rows)
                break
        else:
            # Loop ended without a successful fetch (or max_attempts <= 0).
            raise RuntimeError(
                f"Failed to fetch npm data for {package} after {max_attempts} attempts"
            ) from last_error

    df = pd.DataFrame(rows)
    if df.empty:
        # Same schema as the non-empty result so callers can rely on it.
        return pd.DataFrame(columns=['month', 'downloads', 'month_dt'])
    df = df.groupby('month', as_index=False)['downloads'].sum()
    df['month_dt'] = pd.to_datetime(df['month'])
    return df

# ---
# Step 2: Packages and event dates (add more representative packages if desired)
# Each event month can represent license/security/governance change
# ---
# Maps package name -> (fetch start date, fetch end date, event month 'YYYY-MM').
# Every package shares the same fetch end date.
packages = {
    name: (first_day, '2025-10-01', event_month)
    for name, first_day, event_month in (
        ('mongodb', '2017-10-01', '2018-11'),
        ('@elastic/elasticsearch', '2020-01-01', '2021-02'),
        ('react', '2019-01-01', '2020-06'),
        ('webpack', '2018-01-01', '2020-03'),
        ('lodash', '2017-01-01', '2019-07'),
        ('axios', '2018-01-01', '2020-05'),
        ('license-kit', '2022-01-01', '2025-06'),
        ('npm-license-corrections', '2022-01-01', '2025-01'),
    )
}

# Multi-month pre/post window, in months before / after the event month
PRE_WINDOW, POST_WINDOW = 6, 6

# Build one long table: per package, monthly downloads plus pre/post event
# flags, a 3-month moving average and its month-over-month growth.
output_columns = ['project', 'month', 'month_dt', 'downloads', 'downloads_ma3',
                  'pct_change', 'pre', 'post', 'treated']
all_rows = []
for pkg, (start, end, event_month) in packages.items():
    df_pkg = fetch_npm_bulk(pkg, start, end)
    if df_pkg.empty:
        print(f"No data for {pkg}, skipping.")
        continue

    # Drop the trailing month when it is incomplete. If the requested end date
    # is not the last day of its month, that month is partial by construction
    # (e.g. end='2025-10-01' contributes a single day), however large its
    # download count is. Otherwise keep the original low-volume heuristic.
    end_ts = pd.Timestamp(end)
    end_month = end_ts.strftime('%Y-%m')
    last_month = df_pkg['month'].max()
    if not end_ts.is_month_end and last_month == end_month:
        df_pkg = df_pkg[df_pkg['month'] != end_month]
    elif df_pkg.loc[df_pkg['month'] == last_month, 'downloads'].sum() < 50:
        df_pkg = df_pkg[df_pkg['month'] != last_month]
    if df_pkg.empty:
        print(f"No data for {pkg}, skipping.")
        continue

    # Work on an explicit, month-ordered copy: the filters above return a
    # slice, and assigning columns onto a slice triggers pandas'
    # SettingWithCopyWarning. The rolling window also relies on month order.
    df_pkg = df_pkg.sort_values('month').copy()

    df_pkg['project'] = pkg
    df_pkg['month_dt'] = pd.to_datetime(df_pkg['month'])
    event_dt = pd.to_datetime(event_month)

    # Pre/post labels: `pre` covers the PRE_WINDOW months before the event
    # month; `post` covers the event month itself plus the POST_WINDOW months
    # after it (both bounds inclusive).
    df_pkg['pre'] = ((df_pkg['month_dt'] < event_dt) &
                     (df_pkg['month_dt'] >= event_dt - pd.DateOffset(months=PRE_WINDOW))).astype(int)
    df_pkg['post'] = ((df_pkg['month_dt'] >= event_dt) &
                      (df_pkg['month_dt'] <= event_dt + pd.DateOffset(months=POST_WINDOW))).astype(int)
    df_pkg['treated'] = 1

    # Smoothed downloads and growth.
    # NOTE: pct_change is inf when the previous smoothed value is 0; only NaN
    # values are replaced by 0 here.
    df_pkg['downloads_ma3'] = df_pkg['downloads'].rolling(3, min_periods=1).mean()
    df_pkg['pct_change'] = df_pkg['downloads_ma3'].pct_change().fillna(0)

    all_rows.append(df_pkg[output_columns])

# Concatenate all packages; pd.concat raises on an empty list, so fall back to
# an empty frame with the expected columns when every package was skipped.
if all_rows:
    prepost_df = pd.concat(all_rows, ignore_index=True)
else:
    prepost_df = pd.DataFrame(columns=output_columns)
prepost_df.to_csv('prepost_downloads_real_enriched.csv', index=False)
print("Saved prepost_downloads_real_enriched.csv")

# ---
# Step 3: Rich overview plot
# ---
# One line per project: 3-month moving average of monthly downloads over time.
fig, ax = plt.subplots(figsize=(16, 6))
sns.lineplot(data=prepost_df, x='month_dt', y='downloads_ma3', hue='project', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("Smoothed Monthly npm Downloads (3-month MA) by Project")
ax.set_xlabel("Month")
ax.set_ylabel("Downloads (3-month MA)")
ax.grid(True)
plt.show()
