
Run over multiple threads
Most of the time is spent waiting on network requests, so we might as well run lots of them at once.

Before: 43.59s
After: 4.65s
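
The speedup is consistent with the work being I/O-bound: a thread blocked on a network read releases the GIL, so many downloads can be in flight at once. A minimal sketch of the effect, using a hypothetical fetch() stand-in rather than anything from this repo:

    import concurrent.futures
    import time

    def fetch(url):
        # Stand-in for a blocking network call (hypothetical helper).
        time.sleep(1)
        return url

    urls = [f"https://example.com/{i}" for i in range(20)]

    start = time.monotonic()
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        results = list(executor.map(fetch, urls))
    print(f"{time.monotonic() - start:.2f}s")  # ~1s; a sequential loop takes ~20s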
RealOrangeOne committed Jul 18, 2021 · 1 parent b06e82c · commit 2b5196e
Showing 1 changed file with 25 additions and 16 deletions.

scrape.py: 41 changes (25 additions & 16 deletions)
@@ -1,3 +1,4 @@
+import concurrent.futures
 import os
 import shutil
 
@@ -27,25 +28,10 @@ def get_duration(seconds):
     return f"{minutes} mins {seconds} secs"
 
 
-api_data = requests.get(BASE_URL + "/json").json()
-
-episodes = []
-
-# Remove any outputs from a previous run
-try:
-    shutil.rmtree(OUTPUT_DIR)
-except:
-    pass
-
-os.mkdir(OUTPUT_DIR)
-
-# Do the stuff
-for api_episode in api_data["items"]:
+def create_episode(api_episode):
     # RANT: What kind of API doesn't give the episode number?!
     episode_number = int(api_episode["url"].split("/")[-1])
 
-    print(episode_number, end="\r")
-
     api_soup = BeautifulSoup(api_episode["content_html"], "html.parser")
 
     blurb = api_episode["summary"]
@@ -96,3 +82,26 @@ def get_duration(seconds):
 
     with open(f"{OUTPUT_DIR}/episode-{episode_number}.md", "w") as f:
         f.write(output)
+
+
+def main():
+    api_data = requests.get(BASE_URL + "/json").json()
+
+    episodes = []
+
+    # Remove any outputs from a previous run
+    try:
+        shutil.rmtree(OUTPUT_DIR)
+    except:
+        pass
+
+    os.mkdir(OUTPUT_DIR)
+
+    # Run over multiple threads
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        for api_episode in api_data["items"]:
+            executor.submit(create_episode, api_episode)
+
+
+if __name__ == "__main__":
+    main()
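
One property of the submit loop worth noting: executor.submit() returns a Future, and because the futures are discarded here, an exception raised inside create_episode() passes silently. A variant that surfaces worker errors (a sketch, not part of this commit):

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(create_episode, ep) for ep in api_data["items"]]
        for future in concurrent.futures.as_completed(futures):
            future.result()  # re-raises any exception from the worker thread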
