forked from mwichary/medium-export-image-fill
/
medium-export-image-fill.py
executable file
·169 lines (132 loc) · 5.58 KB
/
medium-export-image-fill.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env python
'''
Medium export image fill 1.01
by Marcin Wichary (aresluna.org)
Site: https://github.com/mwichary/medium-export-image-fill
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
For more information, please refer to <http://unlicense.org/>
'''
# Imports
# ---------------------------------
import argparse
import json
import os
import re
import string
import sys
import time
import urllib
from shutil import copyfile
from urllib import request
# Main entry point
# ---------------------------------

# Introduce yourself
print("Medium export image fill 1.01")
print("by Marcin Wichary (aresluna.org) and others")
print("use --help to see options")
print("")

# Running total of images downloaded across all articles.
image_count_global = 0

# Ensure a place to keep pristine copies of the exported articles.
if not os.path.isdir("original_articles"):
    os.mkdir("original_articles")

# Collect every Medium-exported article (.html) in the current directory.
articles = [filename for filename in os.listdir(".") if filename.endswith(".html")]

# Nothing to do: tell the user and bail out.
if not articles:
    print("No Medium articles found here.")
    print("Are you sure you're running this in the right directory?")
    print("")
    sys.exit()

print("To process: %i article(s)..." % (len(articles)))
print("(You can cancel any time. Next time you run, the script should resume at the last point.)")
print("")
# Loop 1: Go through all the articles
# -----------------------------------
for article_count, article_filename in enumerate(articles):
    try:
        # Display progress
        count_string = '[%i/%i] ' % (article_count + 1, len(articles))
        print('%s%s...' % (count_string, article_filename))

        # Make a copy of the original file, just in case (only if it doesn't exist before)
        backup_filename = "original_articles/" + article_filename
        if not os.path.isfile(backup_filename):
            copyfile(article_filename, backup_filename)

        # Open file, find all images
        with open(article_filename, 'r') as file:
            article_contents = file.read()

        # Each match is (full <img> tag, attributes between class and src, src URL).
        images = re.findall(r'(<img class="graf-image"(.*?)src="(.*?)">)', article_contents)

        # Loop 2: Go through all the images in an article
        # -----------------------------------------------
        for image_count, image in enumerate(images):
            # Create an article-specific directory to download images into
            directory_name = os.path.splitext(article_filename)[0]
            if (image_count == 0) and not os.path.isdir(directory_name):
                os.mkdir(directory_name)

            image_server_url = image[2]

            # Check the URL. If it starts with https://, it means we have to download the image.
            # If it doesn't, it means we already downloaded the image and rewrote the URL to point
            # to a local file
            skip_download = image_server_url[:8] != 'https://'

            # Get the image id (final path component), used to determine local filename
            image_id = re.findall(r'\/(([^\/]*?)\.([a-z]+))$', image_server_url)[0][0]

            # Update the user
            progress_string = "[%i/%i] %s %s..." % \
                (image_count + 1, len(images), "Skipping" if skip_download else "Downloading", image_id)
            progress_string = progress_string.rjust(len(progress_string) + len(count_string))
            sys.stdout.write("\r" + progress_string)
            sys.stdout.write("\033[K")  # Clear the end of the line
            sys.stdout.flush()

            if not skip_download:
                # Rewrite the URL to get the maximum quality image from the server
                image_server_high_quality_url = re.sub(r'\/max\/[0-9]+\/', '/', image_server_url, 1)
                image_local_filename = directory_name + '/' + image_id

                # Download the file (try a few times if need be)
                downloaded = False
                download_tries = 3
                while not downloaded:
                    try:
                        # We need to impersonate a browser for downloads from CloudFlare to succeed
                        user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36 SE 2.X MetaSr 1.0'
                        headers = {'User-Agent': user_agent}
                        # Named "req" (not "request") so we don't shadow urllib.request,
                        # imported at the top of the file.
                        req = urllib.request.Request(image_server_high_quality_url, headers=headers)
                        with urllib.request.urlopen(req) as response:
                            with open(image_local_filename, 'wb') as out:
                                out.write(response.read())
                    except Exception:
                        # Narrowed from a bare "except:", which also swallowed
                        # KeyboardInterrupt and prevented the Ctrl+C handler below
                        # from ever firing during a download.
                        download_tries = download_tries - 1
                        if download_tries == 0:
                            print("")
                            # Fixed: this previously referenced the undefined name
                            # "better_url", raising a NameError instead of reporting
                            # the failed URL.
                            print("Failed to download %s after 3 tries." % image_server_high_quality_url)
                            print("Please try again later?")
                            sys.exit()
                        time.sleep(3)  # Wait 3 seconds before retrying
                    else:
                        downloaded = True
                        image_count_global = image_count_global + 1

                # Rewrite the URL to point to a local file, and re-save the article
                article_contents = re.sub(r'(<img class="graf-image"(.*?)src="{0}">)'.format(re.escape(image_server_url)),
                                          r'<img class="graf-image"\2src="{0}">'.format(image_local_filename),
                                          article_contents, 1)
                with open(article_filename, 'w') as file:
                    file.write(article_contents)

        # Remove the last downloaded file from the screen
        sys.stdout.write("\r")
        sys.stdout.write("\033[K")  # Clear the end of the line
        sys.stdout.flush()
    except KeyboardInterrupt:
        print("")
        print("Interrupted! Come back any time.")
        sys.exit()
# End loop 1 (all the months)

# Print a final summary of everything that was downloaded.
summary = [
    "",
    "Done!",
    "%i images downloaded in total." % image_count_global,
    "",
]
for summary_line in summary:
    print(summary_line)