In [1]:
import json
import gzip
import requests
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
import time
import random

In [2]:
# Packages as at 24 April 2023
with open(f"pypi-packages2023-04-24.json", 'r') as fh:
    ppkg20230424 = json.load(fh) 
print(f"{len(ppkg20230424)=:,}")
ppkg20230424[:5]

len(ppkg20230424)=448,888


['0', '0-0', '000', '00000a', '0-0-1']

In [3]:
# Packages as at 30 April 2023
with open(f"pypi-packages2023-04-30.json", 'r') as fh:
    ppkg20230430 = json.load(fh) 
print(f"{len(ppkg20230430)=:,}")
ppkg20230430[:5]

len(ppkg20230430)=450,433


['0', '0-0', '000', '00000a', '0-0-1']

In [4]:
# New packages between 24 April and 30 April
new_pkg =  set(ppkg20230430).difference(ppkg20230424)
print(len(new_pkg))
list(new_pkg)[:10]

1636


['robocorp-log',
 'oreors',
 'refractio',
 'gpt-review',
 'circleci-stubs',
 'aegoslib2',
 'shekhar-bmi-calculator',
 'pixivspidercreatedbyhanxu',
 'nettle',
 'minirony']

In [5]:
df = (
    pd.DataFrame(new_pkg, columns=["pkg"])
    .assign(
        return_code=np.nan,
        github_url=np.nan,
        homepage=np.nan,
        earliest_release=np.nan,
    )
)
df

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release
0,robocorp-log,,,,
1,oreors,,,,
2,refractio,,,,
3,gpt-review,,,,
4,circleci-stubs,,,,
...,...,...,...,...,...
1631,vijsamplepackage,,,,
1632,wagtail-sb-codefield,,,,
1633,conplex-dti,,,,
1634,marqeta-client,,,,


In [6]:
for ix, row in tqdm(df.iterrows()):
    return_code = row["return_code"]
    pkg = row["pkg"]
    
#     if (return_code==200) and (~pd.isna(row["github_url"])):
#         continue
    
    # Check if package metadata has been cached before
    cache_filepath = Path(f"../data/cache/{pkg}.json.gz")
    if cache_filepath.is_file():
        with gzip.open(cache_filepath, "r") as f:
            metadata = json.loads(f.read())
            return_code = 200
            df.at[ix, "return_code"] = return_code
    else:  # Query
        response = requests.get(f"https://pypi.org/pypi/{pkg}/json")
        return_code = response.status_code
        if return_code == 200:
            compressed_payload = gzip.compress(response.text.encode("utf-8"))
            with open(cache_filepath, "wb") as f:
                f.write(compressed_payload)
            metadata = response.json()
            time.sleep(random.random())
        else:
            print(f"Return code != 200 - {return_code, pkg}") 
        
        df.at[ix, "return_code"] = return_code
    
    if return_code == 200:
        # Store earliest release
        releases_data = metadata["releases"]
        for data in releases_data.values():
            try:
                earliest_release_datetime = data[0]["upload_time"]
                break
            except IndexError:
                continue
        df.at[ix, "earliest_release"] = earliest_release_datetime

        # Store homepage, if it exists
        homepage = metadata["info"]["home_page"]
        if homepage=="":
            homepage = np.nan
        df.at[ix, "homepage"] = homepage

        # Store GitHub url, if it exists
        project_urls = metadata["info"]["project_urls"]
        if project_urls:
            for key, val in project_urls.items():
                if "github" in val:
                    github_url = val
                else:
                    github_url = np.nan
        else:
            github_url = np.nan
        df.at[ix, "github_url"] = github_url
df.head(3)

0it [00:00, ?it/s]

Return code != 200 - (404, 'aegoslib2')
Return code != 200 - (404, 'the-lord-of-the-rings-test-sdk-1')
Return code != 200 - (404, 'img-compress-jay')
Return code != 200 - (404, 'aegoscode')
Return code != 200 - (404, 'compress2')
Return code != 200 - (404, 'my-tralit-library')
Return code != 200 - (404, 'twyn')
Return code != 200 - (404, 'easy-webui-api')
Return code != 200 - (404, 'aegoslib')
Return code != 200 - (404, 'drf-multistep-form')
Return code != 200 - (404, 'whatsapp-interface')
Return code != 200 - (404, 'abdupy')
Return code != 200 - (404, 'l1periodogram')
Return code != 200 - (404, 'sequoia-base3-pydantic-preview')
Return code != 200 - (404, 'kozh-fig-pac')
Return code != 200 - (404, 'aegosch')
Return code != 200 - (404, 'reusabletestcases')
Return code != 200 - (404, 'numpiy')
Return code != 200 - (404, 'myfirstpkgkaleda4642')
Return code != 200 - (404, 'myfirstpkgkaleda46424')
Return code != 200 - (404, 'zxcvb')
Return code != 200 - (404, 'password-manager89')
Return co

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release
0,robocorp-log,200.0,,,2023-04-23T10:53:41
1,oreors,200.0,,,2023-04-27T08:57:17
2,refractio,200.0,,,2023-04-27T10:36:54


In [7]:
df.head(20)

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release
0,robocorp-log,200.0,,,2023-04-23T10:53:41
1,oreors,200.0,,,2023-04-27T08:57:17
2,refractio,200.0,,,2023-04-27T10:36:54
3,gpt-review,200.0,https://github.com/dciborow/action-gpt/issues,,2023-04-25T22:54:54
4,circleci-stubs,200.0,,,2023-04-23T18:03:25
5,aegoslib2,404.0,,,
6,shekhar-bmi-calculator,200.0,,,2023-04-26T23:28:07
7,pixivspidercreatedbyhanxu,200.0,,https://gitee.com/UnderTurrets/pixiv-spider,2023-04-23T13:32:51
8,nettle,200.0,,,2023-04-29T08:43:02
9,minirony,200.0,,,2023-04-27T19:17:05


In [8]:
df.to_csv("new-pkg-30April-24April.csv", index=False)