Use f-strings instead of .format()
rivermont authored Nov 1, 2021
1 parent 134b95d commit 3a62f5e
Showing 1 changed file with 37 additions and 40 deletions.
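The bulk of the diff applies one mechanical pattern: a string built with positional .format() placeholders is rewritten as an f-string that embeds the same expression in place. A minimal sketch of the equivalence, using a hypothetical timestamp value rather than anything taken from the diff:

start_time = 1635787200  # hypothetical stand-in for the crawler's START_TIME

# Before: positional placeholder filled in by .format()
old_name = 'spidy_log_{0}.txt'.format(start_time)

# After: the f-string evaluates the expression directly
new_name = f'spidy_log_{start_time}.txt'

assert old_name == new_name == 'spidy_log_1635787200.txt'

Arbitrary expressions such as len(links) or TODO.qsize() / DONE.qsize() can be embedded the same way, which is why several multi-argument .format() calls below collapse into single-line f-strings.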
77 changes: 37 additions & 40 deletions spidy/crawler.py
@@ -50,13 +50,13 @@ def get_full_time():
 except OSError:
     pass  # Assumes only OSError will complain if /logs already exists

-LOG_FILE = open(path.join(WORKING_DIR, 'logs', 'spidy_log_{0}.txt'.format(START_TIME)),
+LOG_FILE = open(path.join(WORKING_DIR, 'logs', f'spidy_log_{START_TIME}.txt'),
                 'w+', encoding='utf-8', errors='ignore')
-LOG_FILE_NAME = path.join('logs', 'spidy_log_{0}'.format(START_TIME))
+LOG_FILE_NAME = path.join('logs', f'spidy_log_{START_TIME}')

 # Error log location
-ERR_LOG_FILE = path.join(WORKING_DIR, 'logs', 'spidy_error_log_{0}.txt'.format(START_TIME))
-ERR_LOG_FILE_NAME = path.join('logs', 'spidy_error_log_{0}.txt'.format(START_TIME))
+ERR_LOG_FILE = path.join(WORKING_DIR, 'logs', f'spidy_error_log_{START_TIME}.txt')
+ERR_LOG_FILE_NAME = path.join('logs', f'spidy_error_log_{START_TIME}.txt')

 LOGGER = logging.getLogger('SPIDY')
 LOGGER.setLevel(logging.DEBUG)
@@ -101,15 +101,14 @@ def write_log(operation, message, package='spidy', status='INFO', worker=0):
     """
     global LOG_FILE, log_mutex
     with log_mutex:
-        message = '[{0}] [{1}] [WORKER #{2}] [{3}] [{4}]: {5}'\
-            .format(get_time(), package, str(worker), operation, status, message)
+        message = f'[{get_time()}] [{package}] [WORKER #{str(worker)}] [{operation}] [{status}]: {message}'
         print(message)
         if not LOG_FILE.closed:
             LOG_FILE.write('\n' + message)


-write_log('INIT', 'Starting spidy Web Crawler version {0}'.format(VERSION))
-write_log('INIT', 'Report any problems to GitHub at https://github.com/rivermont/spidy')
+write_log('INIT', f'Starting spidy Web Crawler version {VERSION}')
+write_log('INIT', 'Report any problems on GitHub at https://github.com/rivermont/spidy/issues')


 ###########
@@ -214,8 +213,7 @@ def _lookup(self, url):
     def _remember(self, url):
         urlparsed = urllib.parse.urlparse(url)
         robots_url = urlparsed.scheme + '://' + urlparsed.netloc + '/robots.txt'
-        write_log('ROBOTS',
-                  'Reading robots.txt file at: {0}'.format(robots_url),
+        write_log('ROBOTS', f'Reading robots.txt file at: {robots_url}',
                   package='reppy')
         robots = Robots.fetch(robots_url)
         checker = robots.agent(self.user_agent)
@@ -262,12 +260,11 @@ def crawl(url, thread_id=0):
         save_page(url, page)
     if SAVE_WORDS:
         # Announce which link was crawled
-        write_log('CRAWL', 'Found {0} links and {1} words on {2}'.format(len(links), len(word_list), url),
+        write_log('CRAWL', f'Found {len(links)} links and {len(word_list)} words on {url}',
                   worker=thread_id)
     else:
         # Announce which link was crawled
-        write_log('CRAWL', 'Found {0} links on {1}'.format(len(links), url),
-                  worker=thread_id)
+        write_log('CRAWL', f'Found {len(links)} links on {url}', worker=thread_id)
     return links


@@ -319,7 +316,7 @@ def crawl_worker(thread_id, robots_index):
             with save_mutex:
                 if COUNTER.val > 0:
                     try:
-                        write_log('CRAWL', 'Queried {0} links.'.format(str(COUNTER.val)), worker=thread_id)
+                        write_log('CRAWL', f'Queried {str(COUNTER.val)} links.', worker=thread_id)
                         info_log()
                         write_log('SAVE', 'Saving files...')
                         save_files()
@@ -356,8 +353,8 @@ def crawl_worker(thread_id, robots_index):

         except Exception as e:
             link = url
-            write_log('CRAWL', 'An error was raised trying to process {0}'
-                      .format(link), status='ERROR', worker=thread_id)
+            write_log('CRAWL', f'An error was raised trying to process {link}',
+                      status='ERROR', worker=thread_id)
             err_mro = type(e).mro()

             if SizeError in err_mro:
@@ -406,7 +403,7 @@ def crawl_worker(thread_id, robots_index):

             elif 'Unknown MIME type' in str(e):
                 NEW_MIME_COUNT.increment()
-                write_log('ERROR', 'Unknown MIME type: {0}'.format(str(e)[18:]), worker=thread_id)
+                write_log('ERROR', f'Unknown MIME type: {str(e)[18:]}', worker=thread_id)
                 err_log(link, 'Unknown MIME', e)

             else:  # Any other error
@@ -498,15 +495,15 @@ def save_files():
                 todoList.write(site + '\n')  # Save TODO list
             except UnicodeError:
                 continue
-    write_log('SAVE', 'Saved TODO list to {0}'.format(TODO_FILE))
+    write_log('SAVE', f'Saved TODO list to {TODO_FILE}')

     with open(DONE_FILE, 'w', encoding='utf-8', errors='ignore') as done_list:
         for site in copy(DONE.queue):
             try:
                 done_list.write(site + '\n')  # Save done list
             except UnicodeError:
                 continue
-    write_log('SAVE', 'Saved DONE list to {0}'.format(TODO_FILE))
+    write_log('SAVE', f'Saved DONE list to {TODO_FILE}')

     if SAVE_WORDS:
         update_file(WORD_FILE, WORDS.get_all(), 'words')
@@ -549,7 +546,7 @@ def mime_lookup(value):
     elif value == '':
         return '.html'
     else:
-        raise HeaderError('Unknown MIME type: {0}'.format(value))
+        raise HeaderError(f'Unknown MIME type: {value}')


 def save_page(url, page):
@@ -559,15 +556,15 @@ def save_page(url, page):
     # Make file path
     ext = mime_lookup(get_mime_type(page))
     cropped_url = make_file_path(url, ext)
-    file_path = path.join(WORKING_DIR, 'saved', '{0}'.format(cropped_url))
+    file_path = path.join(WORKING_DIR, 'saved', cropped_url)

     # Save file
     with open(file_path, 'w', encoding='utf-8', errors='ignore') as file:
         if ext == '.html':
-            file.write('''<!-- "{0}" -->
+            file.write(f'''<!-- "{url}" -->
 <!-- Downloaded with the spidy Web Crawler -->
 <!-- https://github.com/rivermont/spidy -->
-'''.format(url))
+''')
         file.write(page.text)


@@ -583,24 +580,24 @@ def update_file(file, content, file_type):
         for item in content:
             open_file.write('\n' + str(item))  # Write all words to file
         open_file.truncate()  # Delete everything in file beyond what has been written (old stuff)
-    write_log('SAVE', 'Saved {0} {1} to {2}'.format(len(content), file_type, file))
+    write_log('SAVE', f'Saved {len(content)} {file_type} to {file}')


 def info_log():
     """
     Logs important information to the console and log file.
     """
     # Print to console
-    write_log('LOG', 'Started at {0}'.format(START_TIME_LONG))
-    write_log('LOG', 'Log location: {0}'.format(LOG_FILE_NAME))
-    write_log('LOG', 'Error log location: {0}'.format(ERR_LOG_FILE_NAME))
-    write_log('LOG', '{0} links in TODO'.format(TODO.qsize()))
-    write_log('LOG', '{0} links in DONE'.format(DONE.qsize()))
-    write_log('LOG', 'TODO/DONE: {0}'.format(TODO.qsize() / DONE.qsize()))
-    write_log('LOG', '{0}/{1} new errors caught.'.format(NEW_ERROR_COUNT.val, MAX_NEW_ERRORS))
-    write_log('LOG', '{0}/{1} HTTP errors encountered.'.format(HTTP_ERROR_COUNT.val, MAX_HTTP_ERRORS))
-    write_log('LOG', '{0}/{1} new MIMEs found.'.format(NEW_MIME_COUNT.val, MAX_NEW_MIMES))
-    write_log('LOG', '{0}/{1} known errors caught.'.format(KNOWN_ERROR_COUNT.val, MAX_KNOWN_ERRORS))
+    write_log('LOG', f'Started at {START_TIME_LONG}')
+    write_log('LOG', f'Log location: {LOG_FILE_NAME}')
+    write_log('LOG', f'Error log location: {ERR_LOG_FILE_NAME}')
+    write_log('LOG', f'{TODO.qsize()} links in TODO')
+    write_log('LOG', f'{DONE.qsize()} links in DONE')
+    write_log('LOG', f'TODO/DONE: {TODO.qsize() / DONE.qsize()}')
+    write_log('LOG', f'{NEW_ERROR_COUNT.val}/{MAX_NEW_ERRORS} new errors caught.')
+    write_log('LOG', f'{HTTP_ERROR_COUNT.val}/{MAX_HTTP_ERRORS} HTTP errors encountered.')
+    write_log('LOG', f'{NEW_MIME_COUNT.val}/{MAX_NEW_MIMES} new MIMEs found.')
+    write_log('LOG', f'{KNOWN_ERROR_COUNT.val}/{MAX_KNOWN_ERRORS} known errors caught.')


 def log(message, level=logging.DEBUG):
@@ -622,7 +619,7 @@ def handle_invalid_input(type_='input. (yes/no)'):
     """
     Handles an invalid user input, usually from the input() function.
     """
-    write_log('INIT', 'Please enter a valid {0}'.format(type_), status='ERROR')
+    write_log('INIT', f'Please enter a valid {type_}', status='ERROR')
     # could raise InputError but this means the user must go through the whole init process again


@@ -632,7 +629,7 @@ def err_log(url, error1, error2):
     error1 is the trimmed error source.
     error2 is the extended text of the error.
     """
-    LOGGER.error("\nURL: {0}\nERROR: {1}\nEXT: {2}\n\n".format(url, error1, str(error2)))
+    LOGGER.error(f"\nURL: {url}\nERROR: {error1}\nEXT: {str(error2)}\n\n")


 def zip_saved_files(out_file_name, directory):
@@ -642,7 +639,7 @@ def zip_saved_files(out_file_name, directory):
     shutil.make_archive(str(out_file_name), 'zip', directory)  # Zips files
     shutil.rmtree(directory)  # Deletes folder
     makedirs(directory)  # Creates empty folder of same name
-    write_log('SAVE', 'Zipped documents to {0}.zip'.format(out_file_name))
+    write_log('SAVE', f'Zipped documents to {out_file_name}.zip')


 ########
@@ -1260,10 +1257,10 @@ def main():
     with open(WORD_FILE, 'w', encoding='utf-8', errors='ignore'):
         pass

-    write_log('INIT', 'Successfully started spidy Web Crawler version {0}...'.format(VERSION))
+    write_log('INIT', f'Successfully started spidy Web Crawler version {VERSION}...')
     LOGGER.log(logging.INFO, 'Successfully started crawler.')

-    write_log('INIT', 'Using headers: {0}'.format(HEADER))
+    write_log('INIT', f'Using headers: {HEADER}')

     robots_index = RobotsIndex(RESPECT_ROBOTS, HEADER['User-Agent'])

@@ -1274,6 +1271,6 @@
 if __name__ == '__main__':
     main()
 else:
-    write_log('INIT', 'Successfully imported spidy Web Crawler version {0}.'.format(VERSION))
+    write_log('INIT', f'Successfully imported spidy Web Crawler version {VERSION}.')
     write_log('INIT',
               'Call `crawler.main()` to start crawling, or refer to DOCS.md to see use of specific functions.')
