In [1]:
# status code 200 -> The request has succeeded.

In [16]:
import requests

response = requests.get("https://api.github.com/repositories",
                       headers = {"Accept": "application/vnd.github+json"})
print(response.status_code)

200


In [17]:
# response에 대한 요소 확인

print(f"인코딩: {response.encoding}")
print(f"콘텐츠 타입: {response.headers ['Content-Type']}")
print(f"서버: {response.headers ['server']}")

인코딩: utf-8
콘텐츠 타입: application/json; charset=utf-8
서버: GitHub.com


In [18]:
# 콘텐츠 타입이 application/json이기 때문에 json으로 변환 가능한 객체
# json 라이브러리를 활용하여 쉽게 구조화할 수 있음

import json
print(json.dumps(response.json()[0], indent=2)[:200])

# json이 dictionary와 다른 점: 문자열로 해서 통신하기에 좋음
# indent: 들여쓰기

{
  "id": 1,
  "node_id": "MDEwOlJlcG9zaXRvcnkx",
  "name": "grit",
  "full_name": "mojombo/grit",
  "private": false,
  "owner": {
    "login": "mojombo",
    "id": 1,
    "node_id": "MDQ6VXNlcjE=",



In [19]:
# 요청 시 Parameter 활용하기
# 422 -> 앞선 요청에 대해서 처리하지 못했다는 의미
# 요청은 문법에 맞게 잘 왔으나 서버가 해당 요청에 대해 처리할 수 없음

response = requests.get("https://api.github.com/search/repositories")
print(response.status_code)

422


In [20]:
# get 메서드에 Document의 규격대로 params를 추가해줌
# 원하는 API가 존재한다면 그 API에 해당하는 문서를 읽고 그에 맞게 요청해야 함

response = requests.get("https://api.github.com/search/repositories",
                        params = {"q": "data_science+language:python"},
                       headers = {"Accept": "application/vnd.github.v3.test-match+json"})
print(response.status_code)

200


In [21]:
response.headers["Content-Type"]

'application/json; charset=utf-8'

In [None]:
# json 콘텐츠 딕셔너리로 변환하기

import json
response.json()

In [23]:
response.json().keys()

dict_keys(['total_count', 'incomplete_results', 'items'])

In [24]:
len(response.json()["items"])

30

In [25]:
response.json()['items'][0]['text_matches']

KeyError: 'text_matches'

In [33]:
import sys
sys.setrecursionlimit(10**5)

def get_all_pages(url, params=None, headers=None):
    output_json=[]
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        output_json = response.json()
        if 'next' in response.links:
            next_url = response.links['next']['url']
            if next_url is not None:
                output_json += get_all_pages(next_url, params, headers)
    return output_json

In [35]:
import pandas as pd

out = get_all_pages(
    "https://api.github.com/repos/pytorch/pytorch/issues/comments",
    params={
        'since': '2022-01-01T10:00:01Z',
        'sorted': 'created',
        'direction': 'desc'
    },
    headers={'Accept' : 'application/vnd.github.v3+json'}
)

df = pd.DataFrame(out)
print(df['body'].count())
df[['id', 'created_at','body']].sample(1)

270


Unnamed: 0,id,created_at,body
55,1447241912,2023-02-27T22:59:16Z,Rebase failed due to Command `git -C /home/run...


In [29]:
#네트워크 오류를 감안한 코드 작성
# API 호출에는 여러가지 변수가 존재 : 연결중단, dns 조회실패, 연결시간 초과 등

#호출 실패에 대한 재시도를 할 수 있도록 HTTPAdapter를 사용
    #Retry 객체를 통해 초기화
    #total은 재시도횟수 / #status_forcelist는 재시도할 상태 코드 목록 / backoff_factor는 각 시도마다 간격을 늘려주는 수치

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

retry_strategy = Retry(
    total=5,
    status_forcelist=[500, 503, 504],
    backoff_factor=1
)

retry_adapter = HTTPAdapter(max_retries=retry_strategy)

http = requests.Session()
http.mount("https://", retry_adapter)
http.mount("https://", retry_adapter)

response = http.get('https://api.github.com/search/repositories',
                    params={'q': 'data_science+language:python'})

for item in response.json()['items'][:5]:
    print(item['name'])

data-science-from-scratch
PySyft
data-science-blogs
galaxy
DataCamp


In [30]:
# 호출속도 조절과 네트워크 오류까지 포함한 올인원코드

from datetime import datetime as dt
import time
import sys
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

sys.setrecursionlimit(10**5)

def handle_rate_limits(response):
    now = dt.now()
    reset_time = dt.fromtimestamp(int(response.headers['X-RateLimit-Reset']))
    remaining_requests = response.headers['X-Ratelimit-Remaining']
    remaining_time =(reset_time-now).total_seconds()
    intervals = remaining_time / (1.0 +int(remaining_requests))
    
    print('Sleeping for', int(intervals))
    time.sleep(intervals)
    return True

def get_all_pages(url, params=None, headers=None):
    output_json=[]
    retry_strategy = Retry(
        total = 5,
        status_forcelist=[500, 503, 504],
        backoff_factor=1
    )

    retry_adapter = HTTPAdapter(max_retries=retry_strategy)

    http = requests.Session()
    http.mount("https://", retry_adapter)
    
    response = http.get(url, params=params, headers=headers)
    if response.status_code == 200:
        output_json = response.json()
        if 'next' in response.links:
            next_url = response.links['next']['url']
            if next_url is not None and handle_rate_limits(response):
                output_json += get_all_pages(next_url, params, headers)
    return output_json

out = get_all_pages(
    "https://api.github.com/repos/pytorch/pytorch/issues/comments",
    params={
        'since': '2022-01-01T10:00:01Z',
        'sorted': 'created',
        'direction': 'desc'
    },
    headers={'Accept' : 'application/vnd.github.v3+json'}
)

df = pd.DataFrame(out)
print(df['body'].count())
df[['id', 'created_at','body']].sample(1)

Sleeping for 65
Sleeping for 196
60


Unnamed: 0,id,created_at,body
32,1447265684,2023-02-27T23:15:49Z,### Merge started\nYour change will be merged ...
