In [1]:
import os
import time

import pandas as pd
import numpy as np
import janitor
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from urllib.parse import urlparse

from utilities import read_jsongz

cache_output_path = "../input/cache_pypi_payload"

In [2]:
def is_github_repo_path(url: str) -> bool:
    """Check whether a URL is a link to a GitHub repository.
    
    GitHub repositories have a username and a repo name.
    Example: github.com/<user>/<repo>
    
    Parameters
    ----------
    url: str
        URL to check.
        
    Raises
    ------
    AssertionError
        Raises AssertionError when URL is not a github repo URL.
        
    Returns
    -------
    None
    """
    try:
        parsed_url = urlparse(url)
    except AttributeError:
        return False
    
    # Assert that domain is GitHub
    try:
        assert parsed_url.netloc == "github.com"
    except AssertionError:
        return False
    

    
    # Remove the trailing "/" at the start and end of the string
    path = parsed_url.path
    if path.startswith("/"):
        path = path[1:]
    if path.endswith("/"):
        path = path[:-1]
        
    # Assert that the path is of the form /<user>/<repo>
    path_components = path.split("/")
    try:
        assert len(path_components) == 2
    except AssertionError:
        return False
    
    # Assert that github url is not a sponsors page
    # E.g., https://github.com/sponsors/EinKara/ (real example)
    if "sponsors" in path_components[0]:
        return False
    
    return True

In [3]:
data_list = []
c = 0
for filename in tqdm(os.listdir(cache_output_path)):
    assert filename.endswith(".json.gz")

    package = filename.replace(".json.gz", "")

    data = read_jsongz(os.path.join(cache_output_path, filename))
    requirements = data["info"]["requires_dist"]
    if requirements:
        n_requirements = len(data["info"]["requires_dist"])
    else:
        n_requirements = 0
    raw_readme = data["info"]["description"]
    raw_readme_len = len(raw_readme)

    soup = BeautifulSoup(raw_readme, "html.parser")
    processed_readme = soup.get_text()
    processed_readme_len = len(processed_readme)

    ## Retrieve GitHub URL ---------------------------------------------------
    github_url = np.nan
    for key, url in data["info"]["project_urls"].items():
        if is_github_repo_path(url):
            github_url = url
    
    if pd.isna(github_url):
        print("fuck...")
        c+=1
    data_list.append(
        [
            package,
            github_url,
            n_requirements,
            raw_readme_len,
            processed_readme_len,
            raw_readme,
            processed_readme,
        ]
    )

  0%|          | 0/622 [00:00<?, ?it/s]

  soup = BeautifulSoup(raw_readme, "html.parser")


In [4]:
df = (pd.DataFrame(
    data_list,
    columns=[
        "pkg",
        "github_url",
        "n_requirements",
        "raw_readme_len",
        "processed_readme_len",
        "raw_readme",
        "processed_readme",
    ],)
            ## Get Slug (e.g., user/repo) ------------------------------------------
    .assign(slug=lambda df: df["github_url"].str.replace("https://github.com/", ""))
    .assign(slug=lambda df: df["slug"].apply(lambda x: x[:-1] if x.endswith("/") else x))
    .assign(slug=lambda df: df["slug"].str.lower().str.strip())
)
#       ## Merge to get GitHub slug from URL------------------------------------
#       .merge((
#           pd.read_csv("../input/check-github-url.csv")
#           .query("gh_url_check==1")
#           .select_columns(["pkg", "github_url", "github"])
#       ), how="left", on="pkg", validate="1:1", indicator=True
#       )

#      )
# assert 622 == len(df.query("_merge=='both'"))
# df = df.remove_columns("_merge")
df.to_csv("../output/pypi_readme.csv", index=False)

df

Unnamed: 0,pkg,github_url,n_requirements,raw_readme_len,processed_readme_len,raw_readme,processed_readme,slug
0,hexapterygon,https://github.com/GiorgosXou/hexapterygon,0,0,0,,,giorgosxou/hexapterygon
1,hakurei-sqlalchemy-graphqlapi,https://github.com/cancan101/graphql-db-api,4,2237,2237,# graphql-db-api [![PyPI version](https://badg...,# graphql-db-api [![PyPI version](https://badg...,cancan101/graphql-db-api
2,odoo12-addon-fieldservice-calendar,https://github.com/OCA/field-service,2,3120,2723,========================\nField Service - Cale...,========================\nField Service - Cale...,oca/field-service
3,shark-sac-korean-editor,https://github.com/sharkwodm/koreditor,0,148,148,1. modkr 폴더를 다운로드폴더 안에넣어주세요\r\n2. Termux어플에서 ...,1. modkr 폴더를 다운로드폴더 안에넣어주세요\r\n2. Termux어플에서 ...,sharkwodm/koreditor
4,passwordtools-yt,https://github.com/Yair-T/passwordtools,0,1397,1397,# passwordtools.\r\nThis Python package provid...,# passwordtools.\r\nThis Python package provid...,yair-t/passwordtools
...,...,...,...,...,...,...,...,...
617,winavsos,https://github.com/Tomzy2506/AVSOS,15,0,0,,,tomzy2506/avsos
618,hydro-tune,https://github.com/S-Lab-System-Group/Hydro,0,0,0,,,s-lab-system-group/hydro
619,sphericart-torch,https://github.com/lab-cosmo/sphericart,0,213,213,# TorchScript bindings to sphericart\n\nThis p...,# TorchScript bindings to sphericart\n\nThis p...,lab-cosmo/sphericart
620,pwnmodules,https://github.com/XKaguya/PwnModules,1,86,86,A open-source Pwntools Extern Functions.\r\nUs...,A open-source Pwntools Extern Functions.\r\nUs...,xkaguya/pwnmodules


### Check - GitHub URL discrepancies seem to originate from user's setup files

In [5]:
# df[df.duplicated("github_url", keep=False)].sort_values("github_url")

In [6]:
# One of the packages with URL as https://github.com/pypa/sampleproject
# https://pypi.org/project/msgraphlib/
data = read_jsongz(os.path.join(cache_output_path, "msgraphlib.json.gz"))
data["info"]["project_urls"]

{'Bug Tracker': 'https://github.com/pypa/sampleproject/issues',
 'Homepage': 'https://github.com/pypa/sampleproject'}

In [7]:
# Another one...
# https://pypi.org/project/colorfultxt/
data = read_jsongz(os.path.join(cache_output_path, "colorfultxt.json.gz"))
data["info"]["project_urls"]

{'Bug Tracker': 'https://github.com/pypa/sampleproject/issues',
 'Homepage': 'https://github.com/pypa/sampleproject'}

In [8]:
# Another one...
data = read_jsongz(os.path.join(cache_output_path, "local-age-detection-python-backend.json.gz"))
data["info"]["project_urls"]

{'Homepage': 'https://github.com/javatechy/dokr'}