
# Análisis de actividad de commits

Este cuaderno calcula métricas agregadas por semana y mes a partir de los datos procesados
(`data/processed/`), genera visualizaciones en `reports/figures/` y resalta semanas con
picos de actividad mediante anotaciones.


In [None]:

from __future__ import annotations

from pathlib import Path
from typing import Iterable, Tuple

import matplotlib.pyplot as plt
import pandas as pd


In [None]:

# Input directory holding the cleaned commit data and output directory for figures.
DATA_DIR = Path("data/processed")
OUTPUT_DIR = Path("reports/figures")

# Create the figure directory up front so later savefig calls cannot fail on a missing path.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Candidate input files in order of preference (Parquet first, CSV as fallback).
CANDIDATE_FILES = [
    DATA_DIR / filename
    for filename in ("commits_clean.parquet", "commits_clean.csv")
]


In [None]:

def pick_existing_file(candidates: Iterable[Path]) -> Path:
    """Return the first path in *candidates* that exists on disk.

    Candidates are checked in iteration order and the search stops at the
    first hit.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    first_hit = next((option for option in candidates if option.exists()), None)
    if first_hit is None:
        raise FileNotFoundError(
            "No se encontraron archivos procesados. Ejecuta scripts/clean_data.py antes de correr este análisis."
        )
    return first_hit


def _numeric_or_zero(df: pd.DataFrame, column: str) -> pd.Series:
    """Return ``df[column]`` coerced to numbers, or an all-zero Series if absent.

    Values that cannot be parsed as numbers are replaced by 0.
    """
    if column not in df.columns:
        # Build the zeros on df's own index so the result aligns row by row.
        return pd.Series(0, index=df.index)
    return pd.to_numeric(df[column], errors="coerce").fillna(0)


def ensure_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the columns the analysis relies on.

    The input frame is not modified. The returned copy is guaranteed to have:

    * ``authored_at_utc``: timezone-aware (UTC) timestamps parsed from the
      existing ``authored_at_utc`` column, or from ``authored_at`` when the
      former is absent. Unparseable values become ``NaT``.
    * ``canonical_author``: kept as-is when present; otherwise the lower-cased
      ``author_email``, with missing e-mails (or a missing column) replaced by
      ``"desconocido"``.
    * ``effective_loc``: kept as-is when present; otherwise
      ``insertions + deletions``, where a missing column or an unparseable
      value counts as 0.

    Raises:
        KeyError: If neither ``authored_at_utc`` nor ``authored_at`` exists.
    """
    df = df.copy()

    timestamp_source = "authored_at_utc" if "authored_at_utc" in df.columns else "authored_at"
    df["authored_at_utc"] = pd.to_datetime(df[timestamp_source], errors="coerce", utc=True)

    if "canonical_author" not in df.columns:
        if "author_email" in df.columns:
            df["canonical_author"] = df["author_email"].fillna("desconocido").str.lower()
        else:
            # A scalar broadcasts to every row; an empty Series default would
            # align on the index and leave every row as NaN instead.
            df["canonical_author"] = "desconocido"

    if "effective_loc" not in df.columns:
        df["effective_loc"] = _numeric_or_zero(df, "insertions") + _numeric_or_zero(df, "deletions")

    return df


def load_commits() -> pd.DataFrame:
    """Load the processed commits, normalise them and sort them by author date.

    The first existing entry of ``CANDIDATE_FILES`` is read (Parquet when the
    suffix is ``.parquet``, CSV otherwise), passed through ``ensure_columns``
    and returned sorted by ``authored_at_utc`` with a fresh 0..n-1 index.
    """
    source = pick_existing_file(CANDIDATE_FILES)
    reader = pd.read_parquet if source.suffix == ".parquet" else pd.read_csv
    frame = ensure_columns(reader(source))
    return frame.sort_values("authored_at_utc").reset_index(drop=True)


In [None]:

def _aggregate_by_period(indexed: pd.DataFrame, freq: str) -> pd.DataFrame:
    """Resample *indexed* (datetime-indexed commits) at *freq* and summarise each bin."""
    return (
        indexed.resample(freq)
        .agg(
            # "size" counts rows, so commits with a null subject are still
            # counted and no "subject" column is required.
            commits=("effective_loc", "size"),
            effective_loc=("effective_loc", "sum"),
            unique_authors=("canonical_author", pd.Series.nunique),
        )
        .reset_index()
    )


def compute_aggregations(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Aggregate commit activity per week and per month.

    Args:
        df: Commits with ``authored_at_utc`` (datetime), ``effective_loc`` and
            ``canonical_author`` columns.

    Returns:
        A ``(weekly, monthly)`` tuple of DataFrames, resampled with the ``"W"``
        and ``"MS"`` frequencies respectively. Each has the columns
        ``authored_at_utc`` (bin label), ``commits`` (number of rows in the
        bin), ``effective_loc`` (sum) and ``unique_authors`` (distinct
        ``canonical_author`` values).
    """
    # Rows whose timestamp could not be parsed cannot be assigned to any
    # period, so they are left out explicitly before resampling.
    indexed = df.dropna(subset=["authored_at_utc"]).set_index("authored_at_utc")

    weekly = _aggregate_by_period(indexed, "W")
    monthly = _aggregate_by_period(indexed, "MS")

    return weekly, monthly


def highlight_peaks(series: pd.Series, threshold: float | None = None, top_n: int = 3) -> pd.Series:
    """Return up to *top_n* peak values of *series*, largest first.

    Args:
        series: Numeric values to scan.
        threshold: Minimum value (inclusive) for a point to count as a peak.
            When ``None``, ``series.mean() + series.std()`` is used. An
            explicit ``0`` is honoured as a real threshold.
        top_n: Maximum number of values to return.

    Returns:
        The values at or above the threshold, sorted in descending order and
        truncated to *top_n*, keeping their original index labels. If no value
        reaches the threshold, the *top_n* largest values of *series* are
        returned instead.
    """
    # Compare against None explicitly: `threshold or ...` would silently
    # discard a caller-supplied threshold of 0.
    if threshold is None:
        threshold = series.mean() + series.std()
    peaks = series[series >= threshold]
    if peaks.empty:
        return series.nlargest(top_n)
    return peaks.sort_values(ascending=False).head(top_n)


In [None]:

# Load the processed commits and build the weekly/monthly summary tables that
# the plotting cells below read.
commits = load_commits()
weekly_metrics, monthly_metrics = compute_aggregations(commits)

# Quick sanity check on how many periods each table covers.
print("Semanas analizadas:", len(weekly_metrics))
print("Meses analizados:", len(monthly_metrics))

# Last expression of the cell: the notebook displays the most recent weeks.
weekly_metrics.tail()



## Visualizaciones

Se generan gráficos semanales y mensuales, guardados en `reports/figures/`. Los picos de
actividad se anotan directamente en la serie semanal de commits.


In [None]:

# Weekly commit counts as a line chart, with the strongest weeks annotated.
weekly_peaks = highlight_peaks(weekly_metrics["commits"], top_n=5)

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(weekly_metrics["authored_at_utc"], weekly_metrics["commits"], marker="o", label="Commits por semana")
ax.set_title("Actividad semanal de commits")
ax.set_xlabel("Semana")
ax.set_ylabel("Número de commits")
ax.grid(True, alpha=0.3)

# weekly_peaks keeps the row labels of weekly_metrics (a plain integer index
# after reset_index), not dates. Look up the week date for each peak so the
# annotation lands on the matching point of the date axis.
for row_label, value in weekly_peaks.items():
    date = weekly_metrics.at[row_label, "authored_at_utc"]
    ax.annotate(
        f"Pico: {int(value)}",
        xy=(date, value),
        xytext=(0, 12),
        textcoords="offset points",
        ha="center",
        arrowprops={"arrowstyle": "->", "color": "tab:red"},
        fontsize=9,
        color="tab:red",
    )

weekly_fig_path = OUTPUT_DIR / "weekly_commits.png"
fig.savefig(weekly_fig_path, dpi=300, bbox_inches="tight")
# Last expression of the cell: the notebook displays the saved path.
weekly_fig_path


In [None]:

# Monthly effective LOC as a bar chart, saved next to the weekly figure.
loc_fig_path = OUTPUT_DIR / "monthly_effective_loc.png"

fig, ax = plt.subplots(figsize=(12, 5))
month_starts = monthly_metrics["authored_at_utc"]
loc_totals = monthly_metrics["effective_loc"]
# On a date axis the bar width is expressed in days.
ax.bar(month_starts, loc_totals, width=20, color="#4C72B0")
ax.set(
    title="LOC efectiva por mes",
    xlabel="Mes",
    ylabel="Líneas de código (insert+delete)",
)
ax.grid(axis="y", alpha=0.3)

# Rotate the date tick labels before writing the file.
fig.autofmt_xdate()
fig.savefig(loc_fig_path, dpi=300, bbox_inches="tight")
# Last expression of the cell: the notebook displays the saved path.
loc_fig_path



## Próximos pasos

- Ajustar el umbral de picos (`highlight_peaks`) según la distribución real del repositorio.
- Extender las anotaciones para hitos conocidos (por ejemplo, lanzamientos o revisiones clave).
- Exportar las tablas agregadas a CSV para integrarlas en el manuscrito.
