# YouTube subscriber counts from Wayback Machine snapshots

#### Load Python tools and Jupyter config

In [13]:
import os
import json
import requests
import pandas as pd
import jupyter_black
import altair as alt
import csv
import time
from bs4 import BeautifulSoup
from datetime import datetime

In [3]:
# Pandas display limits: show wide frames, long frames and full cell text.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 100

# Enable the jupyter_black extension for this notebook session.
jupyter_black.load()

In [4]:
# Current local date as a compact YYYYMMDD string.
today = pd.Timestamp.today().strftime("%Y%m%d")

---

## Read data

#### Configuration

In [5]:
# Notebook settings -- replace the channel ID (and export YOUTUBE_KEY) with your own.
CHANNEL_ID = "UCX6OQ3DkcsbYNE6H8uQQuVA"
OUTPUT_CSV = "data/raw/test.csv"
# Read the API key from the environment; this is None when YOUTUBE_KEY is unset.
API_KEY = os.environ.get("YOUTUBE_KEY")

In [18]:
def get_wayback_snapshots(url, from_year, to_year, limit=30, timeout=30):
    """Query the Wayback Machine CDX API for captures of ``url``.

    Args:
        url: Page URL whose captures are looked up.
        from_year: Lower bound, sent as the CDX ``from`` parameter.
        to_year: Upper bound, sent as the CDX ``to`` parameter.
        limit: Maximum number of rows requested from the API.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The rows of the JSON response with the first (header) row removed;
        an empty list when the response holds no rows beyond the header.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(
        "https://web.archive.org/cdx/search/cdx",
        # Passing the query as ``params`` percent-encodes ``url``; pasted raw
        # into the query string, any "&", "?" or "#" in it would be read as
        # part of the CDX query itself.
        params={
            "url": url,
            "output": "json",
            "from": from_year,
            "to": to_year,
            "limit": limit,
        },
        # requests waits indefinitely unless a timeout is given.
        timeout=timeout,
    )
    # Fail loudly on an error status instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()
    return data[1:]  # Skip the header row


def get_subscriber_count_from_snapshot(snapshot, timeout=30):
    """Fetch one archived page and extract its subscriber-count text.

    Args:
        snapshot: One capture row; index 1 is used as the capture timestamp
            and index 2 as the original URL (presumably the default CDX
            column order -- confirm if custom fields are requested).
        timeout: Seconds to wait for the archived page before giving up.

    Returns:
        The stripped text of the ``yt-formatted-string`` element whose id is
        ``subscriber-count``, or ``None`` when the element is absent or the
        page could not be fetched.
    """
    timestamp = snapshot[1]
    original_url = snapshot[2]
    archive_url = f"https://web.archive.org/web/{timestamp}/{original_url}"

    try:
        # requests waits indefinitely unless a timeout is given.
        response = requests.get(archive_url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable capture should not abort a whole batch run; report
        # it and let the caller treat it like a missing count.
        print(f"Request failed for snapshot at {timestamp}: {exc}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    # NOTE(review): only the static HTML is parsed; if the page fills this
    # element in with JavaScript it will not be found here -- verify.
    subscriber_element = soup.find("yt-formatted-string", {"id": "subscriber-count"})

    if subscriber_element:
        return subscriber_element.text.strip()
    return None


def log_subscriber_counts_to_csv(snapshots, output_csv):
    """Write one CSV row per snapshot for which a subscriber count is found.

    The file at ``output_csv`` is overwritten and always starts with the
    header row ``Timestamp, Subscriber Count``. Progress is printed for every
    snapshot, whether or not a count was found.

    Args:
        snapshots: Iterable of capture rows; index 1 of each row must be a
            ``YYYYMMDDHHMMSS`` timestamp string.
        output_csv: Path of the CSV file to write.

    Raises:
        ValueError: If a timestamp does not match ``%Y%m%d%H%M%S``.
    """
    # Explicit UTF-8: the scraped text may contain non-ASCII characters, which
    # the platform default encoding (e.g. cp1252) cannot always represent.
    with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Timestamp", "Subscriber Count"])

        for snapshot in snapshots:
            timestamp = snapshot[1]
            subscriber_count = get_subscriber_count_from_snapshot(snapshot)
            if not subscriber_count:
                print(f"Could not find subscriber count for snapshot at {timestamp}")
                continue

            # Convert timestamp to readable date format
            date = datetime.strptime(timestamp, "%Y%m%d%H%M%S").strftime(
                "%Y-%m-%d %H:%M:%S"
            )
            writer.writerow([date, subscriber_count])
            print(f"Logged {subscriber_count} subscribers at {date}")

In [19]:
# Channel page to look up in the Wayback Machine. Built from CHANNEL_ID so the
# channel is configured in one place; replace with your own channel URL if it
# uses a different form.
channel_url = f"https://www.youtube.com/channel/{CHANNEL_ID}"

In [20]:
# Fetch the capture rows for the channel page, from 2022 through 2024
snapshots = get_wayback_snapshots(url=channel_url, from_year=2022, to_year=2024)

In [21]:
# Log to CSV: destination file, relative to the current working directory.
# NOTE(review): OUTPUT_CSV ("data/raw/test.csv") is defined in the configuration
# cell but never used -- confirm which of the two paths is intended.
output_csv = "youtube_subscriber_counts.csv"

In [None]:
# Scrape every listed snapshot and write the counts that were found.
log_subscriber_counts_to_csv(snapshots=snapshots, output_csv=output_csv)

---

## Exports

#### JSON

In [17]:
# df.to_json(
#     f"data/processed/NAME.json",
#     indent=4,
#     orient="records",
# )

#### CSV

In [18]:
# df.to_csv(
#     f"data/processed/NAME.csv", index=False
# )

#### GeoJSON

In [19]:
# gdf.to_file(
#     f"data/processed/NAME.geojson",
#     driver="GeoJSON",
# )