In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import json
import time
import os
import re
# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to override the default on another machine.
# The default is a raw string so the Windows backslashes are never parsed
# as (invalid) escape sequences.
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    r'C:\Program Files\Google\Chrome\Application\chromedriver',
)

# Headless Chrome session shared by every request made in this notebook.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=chrome_options)
def cprint(content,module='DEBUG',*args):
    '''
    Print a colourised, timestamped log line.

    Args:
        content: message text (concatenated as-is, so it must be a str)
        module: label shown in square brackets before the message
        *args: optional extra values; when present, the tuple is printed
               after the message in a second colour
    Return: None
    '''
    stamp = time.strftime(" |%Y-%m-%d %H:%M:%S|", time.localtime())
    label = '\033[1;32;43m [' + module + '] \033[0m '
    # The extras segment only exists when at least one extra value was passed.
    extras = ('\033[1;35m' + str(args) + ' \033[0m') if args else ''
    print(label + content + extras + stamp)
def file_write_log(path,info):
    '''
    Append one timestamped line to a log file.

    Args:
        path: log file path; the file is created if it does not exist
        info: value to record (converted with str())
    Return: None
    '''
    stamp = time.strftime(" |%Y-%m-%d %H:%M:%S|", time.localtime())
    # The with-block closes the file; append mode is enough because the
    # file is never read back here.
    with open(path, mode='a', encoding="utf-8") as log_file:
        log_file.write(str(info) + stamp + '\n')
# Folder that receives the generated markdown posts.
FOLDER_PATH = "../web/_posts/"
# Single article URL kept for manual, one-off test runs.
test_url = "https://dev.to/cruip/50-free-tools-and-resources-to-create-awesome-user-interfaces-1c1b"
# CSV file whose 'url' column lists the articles to fetch.
DATA_FILE_PATH = "dev-to-articles.csv"

In [2]:

def get_article_content(driver,url):
    '''
    Open a url in the browser and parse the resulting page source.

    Args:
        driver: webdriver used to load the page
        url: target url
    Return: BeautifulSoup soup built from driver.page_source
    '''
    cprint('Sending request to ','Selenium',url)
    driver.get(url)
    # Fixed two-second pause before the page source is read
    # (presumably to let the page finish rendering -- TODO confirm).
    time.sleep(2)

    page_html = driver.page_source
    return BeautifulSoup(page_html, "html.parser")

def remove_invalid_text(title):
    '''
    Strip the characters / \\ : * ? " < > | from a string.

    Args: text to clean (used here for file names)
    Return: the text with every listed character removed
    '''
    forbidden_chars = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    return forbidden_chars.sub("", title)

def generate_post_filename(soup):
    '''
    Build the markdown file name "<date>-<title>.md" for a post.

    Args: soup of source post
    Return: valid file name for riinosite3
    '''
    time_tag = soup.find("time", {"class":"date"})
    # Only the first 10 characters of the datetime attribute are kept.
    date_prefix = time_tag['datetime'][:10]
    dashed_title = soup.head.title.string.replace(' ','-')
    return remove_invalid_text(date_prefix + '-' + dashed_title + '.md')

def _yaml_escape(text):
    '''Escape backslashes and double quotes for use inside a double-quoted YAML scalar.'''
    return str(text).replace('\\', '\\\\').replace('"', '\\"')

def init_yaml_header(soup):
    '''
    Build the YAML front matter for a post.

    Args: soup of source post
    Return: list of strings, one YAML line per element, that together form
            the front matter block (opening and closing '---' included)
    '''
    title = _yaml_escape(soup.head.title.string)
    # Only the first 10 characters of the datetime attribute are kept.
    date = soup.find("time", {"class":"date"})['datetime'][:10]
    author_link = soup.find("a",{"class":"flex items-center mr-4 mb-4 s:mb-0 fw-medium crayons-link"})
    # The author name is taken from the last child node of the link.
    author = _yaml_escape(author_link.contents[-1].replace('\n','').strip())
    yaml = ['---\n',
            'layout: post\n',
            f'title: "{title}"\n',
            f'author: "{author}"\n',
            f'date: {date}\n',
            'toc: false\n',
            'tags:\n',
           ]
    for tag in soup.find_all("a",{"class":"crayons-tag"}):
        # Drop the first character of the tag text (presumably a leading '#').
        yaml.append('    - '+tag.text[1:]+'\n')
    yaml.append('---\n')
    return yaml
def save_markdown_file(soup,folder_path):
    '''
    Write a post (YAML header followed by the article body HTML) as a
    markdown file inside folder_path.

    Args:
        soup: soup of source post
        folder_path: target folder for the generated file
    Return: None
    Raises: ValueError when the page has no div with id "article-body"
    '''
    # Resolve everything that can fail before the file is opened, so a bad
    # page never leaves an empty or half-written post behind.
    article_body = soup.find("div", {"id":"article-body"})
    if article_body is None:
        raise ValueError('article body (div#article-body) not found in page')
    header_lines = init_yaml_header(soup)
    target_path = os.path.join(folder_path, generate_post_filename(soup))

    with open(target_path, mode='w',encoding="utf-8") as file_w:
        #write yaml
        file_w.writelines(header_lines)
        #write body
        for node in article_body.contents:
            file_w.write(str(node))
    cprint('Write file ssuccessfully ','FILE',target_path)
def get_all_url(DATA_FILE_PATH):
    '''
    read csv file and collect all urls of posts

    A missing file, an empty file, or a file without a 'url' column is
    logged and yields an empty list.

    Args: csv file path
    Return: list of url
    '''
    url_list = []
    if not os.path.exists(DATA_FILE_PATH):
        cprint(DATA_FILE_PATH +' does not exist')
        return url_list
    if not os.path.getsize(DATA_FILE_PATH):
        cprint(DATA_FILE_PATH +' is empty')
        return url_list

    # newline='' is what the csv module expects so that quoted fields
    # containing line breaks are parsed correctly.
    with open(DATA_FILE_PATH, mode='r',encoding="utf-8",newline='') as data_file_r:
        csv_reader = csv.DictReader(data_file_r)
        header = csv_reader.fieldnames or []
        cprint(f'Processing CSV header {", ".join(header)}','CSV')
        if 'url' not in header:
            cprint(DATA_FILE_PATH +' has no url column','CSV')
            return url_list
        url_list = [row['url'] for row in csv_reader]

    # Count the collected rows directly so a header-only file reports 0.
    cprint(f'File processed successfully with {len(url_list)} ids.','CSV')
    return url_list

In [3]:
#save_markdown_file(get_article_content(driver,test_url),FOLDER_PATH)

In [6]:
url_list = get_all_url(DATA_FILE_PATH)

# Slice of url_list handled by this run; adjust to process another batch.
BATCH_START = 100
BATCH_END = 400

# counter is the 1-based position of the current url within url_list.
for counter, url in enumerate(url_list[BATCH_START:BATCH_END], start=BATCH_START + 1):
    try:
        save_markdown_file(get_article_content(driver,url),FOLDER_PATH)
    except Exception as exc:
        # Catch Exception rather than everything so the run can still be
        # interrupted from the keyboard; the failing url is recorded and
        # the loop carries on with the next one.
        time.sleep(1)
        cprint('Save file failed, filename might be invalid','FILE',repr(exc))
        file_write_log('markdown-error-log.txt',url)
    cprint(f'{counter} / {len(url_list)} done')

[1;32;43m [CSV] [0m Processing CSV header id, title, url, main_image_url, reading_time, author_name, author_username, author_id, published_at, tag_names, keywords_for_search, comments_count, public_reactions_count, highlight |2021-04-27 14:33:11|
[1;32;43m [CSV] [0m File processed successfully with 2380 ids. |2021-04-27 14:33:11|
[1;32;43m [Selenium] [0m Sending request to [1;35m('https://dev.to//tracycss/free-react-resources-you-should-have-in-your-pocket-4gl3',) [0m |2021-04-27 14:33:11|
[1;32;43m [FILE] [0m Write file ssuccessfully [1;35m('../web/_posts/2020-09-27-Free-React-resources-you-should-have-in-your-pocket.---DEV-Community.md',) [0m |2021-04-27 14:33:16|
[1;32;43m [DEBUG] [0m 101 / 2380 done |2021-04-27 14:33:16|
[1;32;43m [Selenium] [0m Sending request to [1;35m('https://dev.to//nas5w/10-javascript-quiz-questions-and-answers-to-sharpen-your-skills-255m',) [0m |2021-04-27 14:33:16|
[1;32;43m [FILE] [0m Write file ssuccessfully [1;35m('../web/_posts/2020