In [1]:
import pandas as pd
import janitor
from urllib.parse import urlparse
from tqdm.notebook import tqdm
import requests
import time
import random

In [2]:
def is_github_repo_path(url: str) -> bool:
    """Check whether a URL points at the root of a GitHub repository.

    A repository URL has exactly two non-empty path segments after the
    host: ``github.com/<user>/<repo>``. Anything deeper (``.../issues``),
    shallower (``github.com/<user>``) or hosted elsewhere is rejected.

    Parameters
    ----------
    url: str
        URL to check. Non-string values (for example the float NaN that
        pandas uses for a missing cell) are tolerated and yield False.

    Returns
    -------
    bool
        True if the host is exactly ``github.com`` and the path is exactly
        ``/<user>/<repo>`` with both segments non-empty, otherwise False.
    """
    # Missing DataFrame cells arrive as float NaN, not str; treat any
    # non-string as "not a repo URL" instead of relying on urlparse failing.
    if not isinstance(url, str):
        return False

    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unclosed IPv6 host).
        return False

    # The domain must be GitHub itself.
    if parsed_url.netloc != "github.com":
        return False

    # "/<user>/<repo>" splits into ["", "<user>", "<repo>"].
    path_components = parsed_url.path.split("/")
    if len(path_components) != 3:
        return False

    # Reject empty segments such as "/<user>/" (a profile page) or "//",
    # which also split into three components.
    _, user, repo = path_components
    return bool(user) and bool(repo)

In [3]:
def url_exists(url: str, timeout: float = 10.0) -> bool:
    """Check whether a URL resolves to a successful (HTTP 200) response.

    Sends a HEAD request, following redirects, and reports whether the
    final response has status 200.

    Parameters
    ----------
    url: str
        URL to check.
    timeout: float
        Seconds to wait for the server before giving up. Defaults to 10.

    Returns
    -------
    bool
        True if the final response status is 200. False for any other
        status, or if the request fails (connection error, timeout,
        invalid URL, too many redirects, ...).
    """
    try:
        # requests.head() does not follow redirects unless asked to, so a
        # redirecting URL would otherwise come back as 3xx and be reported
        # as missing. The timeout keeps one stalled server from hanging
        # the caller indefinitely.
        response = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        # A network-level failure means we could not confirm the URL.
        return False
    return response.status_code == requests.codes.ok

In [4]:
# Load the package listing and keep only rows where at least one of the two
# candidate URL columns is populated; rows with both missing are dropped.
df = (
    pd.read_csv("new-pkg-30April-24April.csv")
    .dropna(subset=["github_url", "homepage"], how="all")
    .reset_index(drop=True)
)
df

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release
0,gpt-review,200.0,https://github.com/dciborow/action-gpt/issues,,2023-04-25T22:54:54
1,pixivspidercreatedbyhanxu,200.0,,https://gitee.com/UnderTurrets/pixiv-spider,2023-04-23T13:32:51
2,text-content-generator,200.0,https://github.com/mheshze/TextContentGenerati...,,2023-04-24T08:11:28
3,jawalang,200.0,https://github.com/Arsybai/jawa-language,https://github.com/Arsybai/jawa-language,2023-04-25T00:08:33
4,gvec-to-python,200.0,https://github.com/me/spam.git,,2023-04-26T07:46:17
...,...,...,...,...,...
961,skeleton-plot,200.0,https://github.com/AllenInstitute/skeleton_plot,https://github.com/AllenInstitute/skeleton_plot,2023-04-24T19:08:22
962,drb-topic-geojson,200.0,,https://gitlab.com/drb-python/topics/geojson,2023-04-25T09:26:05
963,wagtail-sb-codefield,200.0,,https://gitlab.com/softbutterfly/open-source/w...,2023-04-29T10:00:13
964,conplex-dti,200.0,https://github.com/samsledje/ConPLex,https://github.com/samsledje/ConPLex,2023-04-27T20:34:20


In [5]:
# Create the result columns up front so that re-running this cell starts from
# a clean slate instead of keeping values written by a previous run.
#   gh_url_check: 0   -> neither column holds a github.com/<user>/<repo> URL
#                 1   -> the repo URL was checked and exists
#                 NaN -> the URL has the right shape but the check did not succeed
#   github:       the verified repository URL (NaN unless gh_url_check == 1)
df["gh_url_check"] = float("nan")
df["github"] = pd.Series(float("nan"), index=df.index, dtype=object)

# iterrows() is a generator with no length; pass the total explicitly so the
# progress bar can show completion and an ETA.
for ix, row in tqdm(df.iterrows(), total=len(df)):
    github_url = row["github_url"]
    homepage = row["homepage"]

    # Prefer the dedicated github_url column, fall back to the homepage.
    if is_github_repo_path(github_url):
        url = github_url
    elif is_github_repo_path(homepage):
        url = homepage
    else:
        # No request is made for these rows, so no throttling sleep either.
        df.at[ix, "gh_url_check"] = 0
        continue

    if url_exists(url):
        df.at[ix, "gh_url_check"] = 1
        df.at[ix, "github"] = url
    # Random sub-second pause between requests.
    time.sleep(random.random())

0it [00:00, ?it/s]

In [6]:
# Rows whose GitHub repository URL was confirmed to exist.
df[df["gh_url_check"] == 1]

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release,gh_url_check,github
3,jawalang,200.0,https://github.com/Arsybai/jawa-language,https://github.com/Arsybai/jawa-language,2023-04-25T00:08:33,1.0,https://github.com/Arsybai/jawa-language
5,foccoerpy,200.0,https://github.com/GaNiziolek/FoccoERPy,,2023-04-25T13:26:24,1.0,https://github.com/GaNiziolek/FoccoERPy
6,evaluateqa,200.0,https://github.com/MihailSalnikov/EvaluateQA,https://github.com/MihailSalnikov/EvaluateQA,2023-04-26T12:10:16,1.0,https://github.com/MihailSalnikov/EvaluateQA
8,sa-node-architecture,200.0,https://github.com/Simply-Artificial/NodeArchi...,https://github.com/Simply-Artificial/NodeArchi...,2023-04-27T17:03:58,1.0,https://github.com/Simply-Artificial/NodeArchi...
11,micropython-icm20948,200.0,https://github.com/jposada202020/MicroPython_I...,,2023-04-26T13:51:25,1.0,https://github.com/jposada202020/MicroPython_I...
...,...,...,...,...,...,...,...
955,odoo-addon-purchase-order-qty-by-product-category,200.0,https://github.com/OCA/purchase-workflow,https://github.com/OCA/purchase-workflow,2023-04-26T10:04:52,1.0,https://github.com/OCA/purchase-workflow
957,print-pretty-tree,200.0,https://github.com/itsbrex/print-pretty-tree,https://github.com/itsbrex/print-pretty-tree,2023-04-26T17:30:53,1.0,https://github.com/itsbrex/print-pretty-tree
958,epcrawler,200.0,https://github.com/dgsmiley18/epcrawler,https://github.com/dgsmiley18/epcrawler,2023-04-25T22:16:00,1.0,https://github.com/dgsmiley18/epcrawler
960,andeplane-pyodide-kernel,200.0,https://github.com/jupyterlite/pyodide-kernel,,2023-04-28T18:04:56,1.0,https://github.com/jupyterlite/pyodide-kernel


In [7]:
# Preview the first 25 rows, including the two columns added by the check.
df.iloc[:25]

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release,gh_url_check,github
0,gpt-review,200.0,https://github.com/dciborow/action-gpt/issues,,2023-04-25T22:54:54,0.0,
1,pixivspidercreatedbyhanxu,200.0,,https://gitee.com/UnderTurrets/pixiv-spider,2023-04-23T13:32:51,0.0,
2,text-content-generator,200.0,https://github.com/mheshze/TextContentGenerati...,,2023-04-24T08:11:28,,
3,jawalang,200.0,https://github.com/Arsybai/jawa-language,https://github.com/Arsybai/jawa-language,2023-04-25T00:08:33,1.0,https://github.com/Arsybai/jawa-language
4,gvec-to-python,200.0,https://github.com/me/spam.git,,2023-04-26T07:46:17,,
5,foccoerpy,200.0,https://github.com/GaNiziolek/FoccoERPy,,2023-04-25T13:26:24,1.0,https://github.com/GaNiziolek/FoccoERPy
6,evaluateqa,200.0,https://github.com/MihailSalnikov/EvaluateQA,https://github.com/MihailSalnikov/EvaluateQA,2023-04-26T12:10:16,1.0,https://github.com/MihailSalnikov/EvaluateQA
7,certora-cli-alpha-shelly-certoraclibeta,200.0,,https://pypi.org/project/certora-cli-alpha-she...,2023-04-28T13:28:28,0.0,
8,sa-node-architecture,200.0,https://github.com/Simply-Artificial/NodeArchi...,https://github.com/Simply-Artificial/NodeArchi...,2023-04-27T17:03:58,1.0,https://github.com/Simply-Artificial/NodeArchi...
9,ebyte-lora-e22-rpi,200.0,,https://github.com/xreef/EByte_LoRa_E22_raspbe...,2023-04-25T17:31:17,,


In [8]:
# Persist the annotated table; the DataFrame index is not written out.
df.to_csv(path_or_buf="check-github-url.csv", index=False)