In [3]:
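# `nest_asyncio` (imported and applied below) is needed because openai.text_completion calls
# asynchronous functions but Jupyter is already running its own event loop.
# NOTE(review): this notebook only calls the `gh` helpers; confirm the openai reference still applies.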
import os
from pathlib import Path
import pandas as pd

import nest_asyncio
nest_asyncio.apply()

import github as gh


# Read the GitHub access token from `~/.github_token`.
# Surrounding whitespace (typically the trailing newline that editors and `echo` append)
# is stripped so the value can be passed verbatim as a credential.
with open(Path.home() / '.github_token', 'r', encoding='utf-8') as handle:
    GITHUB_TOKEN = handle.read().strip()

In [17]:
# Query the current API quota for this token and report it.
rate_limits = gh.get_rate_limits(token=GITHUB_TOKEN)
print(
    f"We have a rate limit of {rate_limits['limit']:,} requests, "
    f"with {rate_limits['remaining']:,} available"
)
print(f"The rate limit resets at: {rate_limits['reset_time']}")

We have a rate limit of 5,000 requests, with 4,959 available
The rate limit resets at: 2023-04-10T19:06:21


In [2]:
# Repositories to query. Note that the first URL carries a trailing slash while the
# others do not; the URLs are passed through `gh.clean_github_url` in a later cell.
github_urls = [
    'https://github.com/scikit-learn/scikit-learn/',
    'https://github.com/scikit-learn-contrib/imbalanced-learn',
    'https://github.com/pandas-dev/pandas',
]

In [6]:
# Pass every URL through `gh.clean_github_url` and keep the result under the same name.
github_urls = [gh.clean_github_url(url) for url in github_urls]
github_urls

['https://github.com/scikit-learn/scikit-learn',
 'https://github.com/scikit-learn-contrib/imbalanced-learn',
 'https://github.com/pandas-dev/pandas']

In [8]:
# One `gh.extract_username_reponame` result per cleaned URL, in the same order.
user_repo_names = list(map(gh.extract_username_reponame, github_urls))
user_repo_names

[('scikit-learn', 'scikit-learn'),
 ('scikit-learn-contrib', 'imbalanced-learn'),
 ('pandas-dev', 'pandas')]

In [9]:
# Turn each (user, repo) pair into a keyword-argument dict for the `gh` request helpers.
user_repo_arguments = [
    {'user_name': user, 'repo_name': repo}
    for user, repo in user_repo_names
]
user_repo_arguments

[{'user_name': 'scikit-learn', 'repo_name': 'scikit-learn'},
 {'user_name': 'scikit-learn-contrib', 'repo_name': 'imbalanced-learn'},
 {'user_name': 'pandas-dev', 'repo_name': 'pandas'}]

In [13]:
from helpsk.utility import Timer
with Timer(f"Attempting to extract metadata repos via `gh.get_github_metadata`"):
    results = await gh.run_async(
        func=gh.get_github_metadata,
        param_kwargs=user_repo_arguments,
        token=GITHUB_TOKEN,
    )

results = pd.DataFrame(results)
results

Timer Started: Attempting to extract metadata repos via `gh.get_github_metadata`
Timer Finished (0.55 seconds)


Unnamed: 0,error,user_name,repo_name,url,description,owner_login,owner_type,homepage,topics,subscribers_count,stargazers_count,forks_count,default_branch,archived,response_header
0,,scikit-learn,scikit-learn,https://api.github.com/repos/scikit-learn/scik...,scikit-learn: machine learning in Python,scikit-learn,Organization,https://scikit-learn.org,"[data-analysis, data-science, machine-learning...",2158,53747,24166,main,False,"(Server, Date, Content-Type, Transfer-Encoding..."
1,,scikit-learn-contrib,imbalanced-learn,https://api.github.com/repos/scikit-learn-cont...,A Python Package to Tackle the Curse of Imbal...,scikit-learn-contrib,Organization,https://imbalanced-learn.org,"[data-analysis, data-science, machine-learning...",144,6301,1231,master,False,"(Server, Date, Content-Type, Transfer-Encoding..."
2,,pandas-dev,pandas,https://api.github.com/repos/pandas-dev/pandas,Flexible and powerful data analysis / manipula...,pandas-dev,Organization,https://pandas.pydata.org,"[alignment, data-analysis, data-science, flexi...",1109,37889,16052,main,False,"(Server, Date, Content-Type, Transfer-Encoding..."


In [14]:
# Re-check the API quota after the metadata requests.
rate_limits = gh.get_rate_limits(token=GITHUB_TOKEN)
print(
    f"We have a rate limit of {rate_limits['limit']:,} requests, "
    f"with {rate_limits['remaining']:,} available"
)
print(f"The rate limit resets at: {rate_limits['reset_time']}")

{'limit': 5000, 'remaining': 4959, 'reset_time': '2023-04-10T19:06:21'}

In [18]:
from helpsk.utility import Timer
with Timer(f"Attempting to extract github readme URL via `gh.get_github_readme_url`"):
    results = await gh.run_async(
        func=gh.get_github_readme_url,
        param_kwargs=user_repo_arguments,
        token=GITHUB_TOKEN,
    )

results = pd.DataFrame(results)
results

Timer Started: Attempting to extract github readme URL via `gh.get_github_readme_url`
Timer Finished (0.74 seconds)


Unnamed: 0,error,user_name,repo_name,url,readme_url,response_header
0,,scikit-learn,scikit-learn,https://api.github.com/repos/scikit-learn/scik...,https://raw.githubusercontent.com/scikit-learn...,"(Server, Date, Content-Type, Transfer-Encoding..."
1,,scikit-learn-contrib,imbalanced-learn,https://api.github.com/repos/scikit-learn-cont...,https://raw.githubusercontent.com/scikit-learn...,"(Server, Date, Content-Type, Transfer-Encoding..."
2,,pandas-dev,pandas,https://api.github.com/repos/pandas-dev/pandas...,https://raw.githubusercontent.com/pandas-dev/p...,"(Server, Date, Content-Type, Transfer-Encoding..."


In [19]:
# Re-check the API quota after the README-URL requests.
rate_limits = gh.get_rate_limits(token=GITHUB_TOKEN)
print(
    f"We have a rate limit of {rate_limits['limit']:,} requests, "
    f"with {rate_limits['remaining']:,} available"
)
print(f"The rate limit resets at: {rate_limits['reset_time']}")

We have a rate limit of 5,000 requests, with 4,956 available
The rate limit resets at: 2023-04-10T19:06:21


In [20]:
# Pull the `readme_url` column out of the previous result table as a plain list.
readme_urls = results.loc[:, 'readme_url'].tolist()
readme_urls

['https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/README.rst',
 'https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/master/README.rst',
 'https://raw.githubusercontent.com/pandas-dev/pandas/main/README.md']

In [23]:
# Wrap each README URL in a keyword-argument dict for `gh.get_github_readme_contents`.
readme_arguments = [{'url': readme_url} for readme_url in readme_urls]
readme_arguments

[{'url': 'https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/README.rst'},
 {'url': 'https://raw.githubusercontent.com/scikit-learn-contrib/imbalanced-learn/master/README.rst'},
 {'url': 'https://raw.githubusercontent.com/pandas-dev/pandas/main/README.md'}]

In [24]:
from helpsk.utility import Timer
with Timer(f"Attempting to extract github readme via `gh.get_github_readme_contents`"):
    results = await gh.run_async(
        func=gh.get_github_readme_contents,
        param_kwargs=readme_arguments,
        token=GITHUB_TOKEN,
    )

results = pd.DataFrame(results)
results

Timer Started: Attempting to extract github readme via `gh.get_github_readme_contents`
Timer Finished (0.48 seconds)


Unnamed: 0,error,url,readme,response_header
0,,https://raw.githubusercontent.com/scikit-learn...,.. -*- mode: rst -*-\n\n|Azure|_ |CirrusCI|_ |...,"(Connection, Content-Length, Cache-Control, Co..."
1,,https://raw.githubusercontent.com/scikit-learn...,.. -*- mode: rst -*-\n\n.. _scikit-learn: http...,"(Connection, Content-Length, Cache-Control, Co..."
2,,https://raw.githubusercontent.com/pandas-dev/p...,"<div align=""center"">\n <img src=""https://pand...","(Connection, Content-Length, Cache-Control, Co..."


In [28]:
# Show the raw README text of the third row (pandas-dev/pandas in the table above).
print(results['readme'].iloc[2])

<div align="center">
  <img src="https://pandas.pydata.org/static/img/pandas.svg"><br>
</div>

-----------------

# pandas: powerful Python data analysis toolkit
[![PyPI Latest Release](https://img.shields.io/pypi/v/pandas.svg)](https://pypi.org/project/pandas/)
[![Conda Latest Release](https://anaconda.org/conda-forge/pandas/badges/version.svg)](https://anaconda.org/anaconda/pandas/)
[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.3509134.svg)](https://doi.org/10.5281/zenodo.3509134)
[![Package Status](https://img.shields.io/pypi/status/pandas.svg)](https://pypi.org/project/pandas/)
[![License](https://img.shields.io/pypi/l/pandas.svg)](https://github.com/pandas-dev/pandas/blob/main/LICENSE)
[![Coverage](https://codecov.io/github/pandas-dev/pandas/coverage.svg?branch=main)](https://codecov.io/gh/pandas-dev/pandas)
[![Downloads](https://static.pepy.tech/personalized-badge/pandas?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads%2