In [1]:
import pandas as pd
import janitor
from pigeon import annotate

# Let pandas render up to 100 rows before it truncates a displayed frame.
pd.options.display.max_rows = 100

In [2]:
# Load the URL-check results, keep only the rows that have a value in the
# "github" column, and derive two helper columns per package:
#   pypi  - link to the package's release-history page on PyPI
#   label - package name, GitHub URL and PyPI URL joined by newlines
df = (
    pd.read_csv("check-github-url.csv")
    # Explicit missing-value filter; replaces the NaN != NaN self-comparison
    # trick `query("github==github")`, which selected the same rows.
    .dropna(subset=["github"])
    .assign(
        # Vectorised string concatenation instead of a per-row "".join loop.
        pypi=lambda frame: "https://pypi.org/project/" + frame["pkg"] + "/#history",
        # Later keyword arguments of one assign call can reference columns
        # created earlier in that same call, so "pypi" is available here.
        label=lambda frame: frame["pkg"] + "\n" + frame["github"] + "\n" + frame["pypi"],
    )
)
df.head(3)

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release,gh_url_check,github,pypi,label
3,jawalang,200.0,https://github.com/Arsybai/jawa-language,https://github.com/Arsybai/jawa-language,2023-04-25T00:08:33,1.0,https://github.com/Arsybai/jawa-language,https://pypi.org/project/jawalang/#history,jawalang\nhttps://github.com/Arsybai/jawa-lang...
5,foccoerpy,200.0,https://github.com/GaNiziolek/FoccoERPy,,2023-04-25T13:26:24,1.0,https://github.com/GaNiziolek/FoccoERPy,https://pypi.org/project/foccoerpy/#history,foccoerpy\nhttps://github.com/GaNiziolek/Focco...
6,evaluateqa,200.0,https://github.com/MihailSalnikov/EvaluateQA,https://github.com/MihailSalnikov/EvaluateQA,2023-04-26T12:10:16,1.0,https://github.com/MihailSalnikov/EvaluateQA,https://pypi.org/project/evaluateqa/#history,evaluateqa\nhttps://github.com/MihailSalnikov/...


In [17]:
# Draw 100 rows with a fixed seed so the sample is reproducible, then
# renumber the rows from 0 instead of keeping the original index labels.
random_sample = df.sample(n=100, random_state=0).reset_index(drop=True)
random_sample.head(3)

Unnamed: 0,pkg,return_code,github_url,homepage,earliest_release,gh_url_check,github,pypi,label
0,bird-ospf-link-db-parser,200.0,https://github.com/Andrew-Dickinson/bird-ospf-...,,2023-04-29T07:23:44,1.0,https://github.com/Andrew-Dickinson/bird-ospf-...,https://pypi.org/project/bird-ospf-link-db-par...,bird-ospf-link-db-parser\nhttps://github.com/A...
1,asciicli,200.0,https://github.com/mrq-andras/asciicli,https://github.com/mrq-andras/asciicli,2023-04-28T07:22:55,1.0,https://github.com/mrq-andras/asciicli,https://pypi.org/project/asciicli/#history,asciicli\nhttps://github.com/mrq-andras/asciic...
2,bdpotentiometer,200.0,https://github.com/bond-anton/BDPotentiometer,https://github.com/bond-anton/BDPotentiometer,2023-04-27T06:35:18,1.0,https://github.com/bond-anton/BDPotentiometer,https://pypi.org/project/bdpotentiometer/#history,bdpotentiometer\nhttps://github.com/bond-anton...


In [4]:
# Write the sample to disk without the index column.
# NOTE: do not write this frame to "random_sample_annotated.csv" (a disabled
# line here used to do that). A later cell reads that file and expects
# `no_readme` / `not_new` columns, which this frame does not contain.
random_sample.to_csv(path_or_buf="random_sample.csv", index=False)

In [4]:
# NOTE(review): disabled cell - nothing below runs. It would label each entry
# of random_sample["label"] interactively through pigeon's `annotate`, offering
# the three options listed and printing the label text for display.
# annotations = annotate(
#     random_sample["label"],
#     options=["Pass", "No Readme", "Not New"],
#     display_fn=lambda lab: print(lab)
# )

In [18]:
# NOTE(review): disabled cell - nothing below runs. It would step through the
# sample one record at a time, printing `2 + ix` and the record's label and
# waiting for Enter before moving on.
# The `2 + ix` offset presumably matches the record's row number when the CSV
# is opened in a spreadsheet (header row + 1-based numbering) - TODO confirm.
# for ix, row in random_sample.iterrows():
#     print(2+ix)
#     print(row["label"])
#     input("Press Enter to continue...")

In [22]:
# Load the annotated copy of the sample, normalise the column names with
# pyjanitor, convert the two annotation columns to integers (a missing value
# counts as 0), and drop the columns that are not needed from here on.
df_annotated = (
    pd.read_csv("random_sample_annotated.csv")
    .clean_names()
    .assign(
        # Vectorised cast instead of calling int() on every element via apply.
        no_readme=lambda frame: frame["no_readme"].fillna(0).astype("int64"),
        not_new=lambda frame: frame["not_new"].fillna(0).astype("int64"),
    )
    .remove_columns(["return_code", "github_url", "homepage", "pypi", "gh_url_check", "label"])
)
df_annotated.head(3)

Unnamed: 0,pkg,earliest_release,github,no_readme,not_new
0,bird-ospf-link-db-parser,2023-04-29T07:23:44,https://github.com/Andrew-Dickinson/bird-ospf-...,0,0
1,asciicli,2023-04-28T07:22:55,https://github.com/mrq-andras/asciicli,0,0
2,bdpotentiometer,2023-04-27T06:35:18,https://github.com/bond-anton/BDPotentiometer,0,0


In [23]:
# Rows whose `no_readme` flag is set.
df_annotated.loc[df_annotated["no_readme"].eq(1)]

Unnamed: 0,pkg,earliest_release,github,no_readme,not_new
3,django-tsp,2023-04-28T14:11:17,https://github.com/TravelSalesmanProblem/djang...,1,0
12,py-sls-lambda-toolkit,2023-04-23T18:27:19,https://github.com/0riion/py-sls-lambda-toolkit,1,0
25,assignment-manager,2023-04-24T15:53:38,https://github.com/PraxTube/assignment-manager,1,0
43,digitalassistant,2023-04-28T08:43:33,https://github.com/shubhvjain/ant,1,0
54,foccoerpy,2023-04-25T13:26:24,https://github.com/GaNiziolek/FoccoERPy,1,0
55,aacommpy,2023-04-24T12:32:00,https://github.com/jamesbond90/aacommpyDownloader,1,0
63,solverpy,2023-04-24T20:20:54,https://github.com/cbboyan/solverpy,1,0
89,adversary-armor,2023-04-26T05:44:37,https://github.com/haim-fisher-s/Adversary-Armor,1,0


In [24]:
# Rows whose `not_new` flag is set.
df_annotated.loc[df_annotated["not_new"].eq(1)]

Unnamed: 0,pkg,earliest_release,github,no_readme,not_new
6,mujoco-dev,2023-04-25T17:25:35,https://github.com/deepmind/mujoco,0,1
11,odoo-addon-purchase-order-qty-by-product-category,2023-04-26T10:04:52,https://github.com/OCA/purchase-workflow,0,1
22,pyram-mogus,2023-04-23T15:04:08,https://github.com/pypa/sampleproject,0,1


In [26]:
# Keep the rows where neither flag is set, print how many remain, and
# preview the first few of them.
passed = df_annotated.loc[
    lambda frame: frame["no_readme"].eq(0) & frame["not_new"].eq(0)
]
print(passed.shape[0])
passed.head()

89


Unnamed: 0,pkg,earliest_release,github,no_readme,not_new
0,bird-ospf-link-db-parser,2023-04-29T07:23:44,https://github.com/Andrew-Dickinson/bird-ospf-...,0,0
1,asciicli,2023-04-28T07:22:55,https://github.com/mrq-andras/asciicli,0,0
2,bdpotentiometer,2023-04-27T06:35:18,https://github.com/bond-anton/BDPotentiometer,0,0
4,mindmate,2023-04-24T04:25:43,https://github.com/yalattas/mindmate,0,0
5,botocore-a-la-carte-osis,2023-04-27T01:20:48,https://github.com/thejcannon/botocore-a-la-carte,0,0


In [27]:
# Save the rows that passed both checks, without the index column.
passed.to_csv(path_or_buf="passed.csv", index=False)