In [1]:
import os
import pandas as pd
import janitor
import json
from tqdm.notebook import tqdm
import datetime
from utilities import read_json

# Reference for working out how to get the keys out of the JSON payloads
# https://www.youtube.com/watch?v=rWcLDax-VmM&t=3s

# Google console for PyPI data
# https://console.cloud.google.com/marketplace/product/gcp-public-data-pypi/pypi?q=search&referrer=search
# https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=pypi&page=dataset

# Schema here
# https://packaging.python.org/en/latest/guides/analyzing-pypi-package-downloads/#data-schema

# Let pandas show up to 100 columns when a wide frame is displayed.
pd.options.display.max_columns = 100

# Directory the download-payload JSON files are read from.
output_path = "../output/cache_pypi_downloads_payload"

In [2]:
# Collect one [package, date, download_count] record per cached payload file.
# Payload names look like "<date>_<date>_<package>.json" (see the names printed
# for the failed files); the second date is the one kept for the record.
# NOTE(review): presumably the two dates are the query window's start/end — confirm.
JSON_SUFFIX = ".json"

data_list = []
for entry in tqdm(os.listdir(output_path)):
    if not entry.endswith(JSON_SUFFIX):
        continue

    # Drop only the trailing extension; str.replace would also remove a
    # ".json" occurring in the middle of a name.
    filename = entry[: -len(JSON_SUFFIX)]

    # maxsplit=2 yields exactly three parts, so underscores inside the package
    # name stay in the last part instead of breaking the 3-way unpack.
    _, date, _ = filename.split("_", maxsplit=2)

    # read_json is handed the extension-less path, exactly as before.
    # NOTE(review): presumably read_json appends ".json" itself — confirm in utilities.
    filepath = os.path.join(output_path, filename)
    try:
        data = read_json(filepath)
    except json.JSONDecodeError as e:
        # Report payloads that are not valid JSON and move on to the next file.
        print(f"Failed to decode JSON: {e} for {filename}")
        continue

    # The download count comes from the first row of the payload; the package
    # name is the last element of the payload's "args".
    downloads = data["rows"][0]["download_count"]
    package = data["args"][-1]

    data_list.append([package, date, downloads])

  0%|          | 0/82504 [00:00<?, ?it/s]

Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-11-03_2023-11-04_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-10-31_2023-11-01_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-10-28_2023-10-29_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-10-30_2023-10-31_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-11-08_2023-11-09_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-11-01_2023-11-02_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-11-06_2023-11-07_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-10-29_2023-10-30_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-11-05_2023-11-06_vallocal
Failed to decode JSON: Expecting value: line 1 column 1 (char 0) for 2023-10-27_2023-10-28_vallocal


In [3]:
# Read in the random sample of packages and derive GitHub slugs plus treatment flags.
df_random_sample = (
    pd.read_csv("../input/random_sample.csv")
    # regex=False: the URL prefix is a literal string, not a pattern (its "."
    # characters must not act as wildcards), and the result no longer depends
    # on the pandas-version-specific default of `regex`.
    .assign(slug=lambda df: df["github"].str.replace("https://github.com/", "", regex=False))
    # Drop a single trailing slash, if present.
    .assign(slug=lambda df: df["slug"].apply(lambda x: x[:-1] if x.endswith("/") else x))
    .assign(slug=lambda df: df["slug"].str.lower().str.strip())
    # Filesystem-friendly variant of the slug: "owner/repo" -> "owner_repo".
    .assign(fileslug=lambda df: df["slug"].str.replace("/", "_", regex=False))
    # Every package in this file is flagged as treated.
    .assign(treated=1)
    # Assign the two types: the first 25 rows (by index) get boughtstars=1.
    # NOTE(review): this relies on the row order of the CSV — confirm.
    .assign(boughtstars=lambda df: (df.index < 25).astype(int))
    # treated2 is 2 for boughtstars rows and 1 for the remaining rows.
    .assign(treated2=lambda df: df["boughtstars"] + df["treated"])
)
df_random_sample.head(3)

  .assign(slug=lambda df: df["github"].str.replace("https://github.com/", ""))


Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release,gh_url_check,github,pypi,label,slug,fileslug,treated,boughtstars,treated2
0,bird-ospf-link-db-parser,200.0,https://github.com/Andrew-Dickinson/bird-ospf-...,,2023-04-29T07:23:44,1.0,https://github.com/Andrew-Dickinson/bird-ospf-...,https://pypi.org/project/bird-ospf-link-db-par...,bird-ospf-link-db-parser\r\nhttps://github.com...,andrew-dickinson/bird-ospf-link-db-parser,andrew-dickinson_bird-ospf-link-db-parser,1,1,2
1,asciicli,200.0,https://github.com/mrq-andras/asciicli,https://github.com/mrq-andras/asciicli,2023-04-28T07:22:55,1.0,https://github.com/mrq-andras/asciicli,https://pypi.org/project/asciicli/#history,asciicli\r\nhttps://github.com/mrq-andras/asci...,mrq-andras/asciicli,mrq-andras_asciicli,1,1,2
2,bdpotentiometer,200.0,https://github.com/bond-anton/BDPotentiometer,https://github.com/bond-anton/BDPotentiometer,2023-04-27T06:35:18,1.0,https://github.com/bond-anton/BDPotentiometer,https://pypi.org/project/bdpotentiometer/#history,bdpotentiometer\r\nhttps://github.com/bond-ant...,bond-anton/bdpotentiometer,bond-anton_bdpotentiometer,1,1,2


In [6]:
# Build the package-by-day download panel and attach the treatment flags.

# Last calendar day (inclusive) kept in the panel.
sample_end = datetime.date(2023, 8, 31)
treatment_cols = ["treated", "boughtstars", "treated2"]

df = (
    pd.DataFrame(data_list, columns=["pkg", "date", "download_count"])
    .assign(date=lambda df: pd.to_datetime(df["date"]))
    .sort_values(["pkg", "date"], ignore_index=True)
    # Running total of downloads within each package, in date order.
    .assign(tt_downloads=lambda df: df.groupby("pkg")["download_count"].cumsum())
    # Merge to get treatment status; indicator=True adds the `_merge` column,
    # which is kept in the resulting frame.
    .merge(
        df_random_sample.select_columns("pkg", "treated", "boughtstars", "treated2"),
        how="left",
        on="pkg",
        validate="m:1",
        indicator=True,
    )
    # Packages absent from the random sample get 0 for every treatment flag.
    .fillna({col: 0 for col in treatment_cols})
    .astype({col: "int64" for col in treatment_cols})
    # Store plain datetime.date objects, then keep days up to the cutoff.
    .assign(date=lambda df: df["date"].dt.date)
    .query("date <= @sample_end")
)

# Assert that 100 repos are assigned as treated, 25 of them with treated2 == 2
n_treated = len(df.drop_duplicates("pkg").query("treated==1"))
n_treated2 = len(df.drop_duplicates("pkg").query("treated2==2"))
assert 100 == n_treated, f"expected 100 treated packages, found {n_treated}"
assert 25 == n_treated2, f"expected 25 packages with treated2 == 2, found {n_treated2}"
df

Unnamed: 0,pkg,date,download_count,tt_downloads,treated,boughtstars,treated2,_merge
0,a-pandas-ex-df2htmlstring,2023-04-25,0,0,0,0,0,left_only
1,a-pandas-ex-df2htmlstring,2023-04-26,0,0,0,0,0,left_only
2,a-pandas-ex-df2htmlstring,2023-04-27,0,0,0,0,0,left_only
3,a-pandas-ex-df2htmlstring,2023-04-28,7,7,0,0,0,left_only
4,a-pandas-ex-df2htmlstring,2023-04-29,7,14,0,0,0,left_only
...,...,...,...,...,...,...,...,...
82486,zoomaker,2023-08-27,0,193,0,0,0,left_only
82487,zoomaker,2023-08-28,2,195,0,0,0,left_only
82488,zoomaker,2023-08-29,2,197,0,0,0,left_only
82489,zoomaker,2023-08-30,0,197,0,0,0,left_only


In [7]:
# Write the panel to disk without the DataFrame index.
csv_destination = "../output/pkg_pypi_downloads.csv"
df.to_csv(csv_destination, index=False)