In [34]:
# This notebook collects entity info from a given URL and saves it to a CSV file, while maintaining a separate file of unique keys.

# import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import json
import time
import os
# Headless Chrome session shared by every request made in this notebook.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
# Raw string: the Windows path contains backslashes (\P, \G, \C, \A) that are
# invalid escape sequences in a normal string literal and trigger a warning.
# NOTE(review): the driver is never closed in this cell; call driver.quit()
# once scraping is finished to release the browser process.
driver = webdriver.Chrome(r'C:\Program Files\Google\Chrome\Application\chromedriver',options=chrome_options)
def cprint(content,module='DEBUG',*args):
    '''
    Print a colored, timestamped log line to stdout.
    Args:
        content: message text (concatenated as-is, so it must be a str)
        module: label shown inside the highlighted square brackets
        *args: optional extra values; when present they are printed as a
               tuple in a second color right after the message
    '''
    stamp = time.strftime(" |%Y-%m-%d %H:%M:%S|", time.localtime())
    # The extras fragment is only emitted when extra positional values exist.
    extras = ('\033[1;35m' + str(args) + ' \033[0m') if args else ''
    print('\033[1;32;43m [' + module + '] \033[0m ' + content + extras + stamp)
        

# CSV file holding one already-saved article id per row (header: "id").
ID_FILE_PATH = "unique_id.csv"
# CSV file the article rows are appended to.
DATA_FILE_PATH = "dev-to-articles.csv"
# Column names of the article CSV, in the order the row values are written.
DATA_FILE_HEADER = [
    "id", "title", "url", "main_image_url", "reading_time",
    "author_name", "author_username", "author_id",
    "published_at",
    "tag_names", "keywords_for_search",
    "comments_count", "public_reactions_count", "highlight",
]
#Custom Function
def prase_data(data):
    '''
    Flatten one article dict from the search response into a CSV row.
    Args:
        data: response data (one article dict)
    Return:
        A row for csv: a list whose values follow the column order of
        DATA_FILE_HEADER (id, title, url, main_image_url, reading_time,
        author_name, author_username, author_id, published_at, tag_names,
        keywords_for_search, comments_count, public_reactions_count,
        highlight). Tag names and tag keywords are each joined with '+'.
    Raises:
        KeyError: if a required field (e.g. 'id', 'title', 'path', 'user')
        is missing from data.
    '''
    user = data['user']
    # 'tags' may be absent or None; treat both as "no tags".
    tags = data.get('tags') or []
    tag_names = [tag['name'] for tag in tags]
    # Only tags that carry a non-empty keyword string contribute here.
    keywords_for_search = [
        tag['keywords_for_search']
        for tag in tags
        if tag.get('keywords_for_search')
    ]
    return [
        data['id'],#id
        data['title'],#title
        # 'path' already starts with '/', so strip it to avoid 'dev.to//...'
        'https://dev.to/' + data['path'].lstrip('/'),#url
        data['main_image'],#main_image_url
        data['reading_time'],#reading_time
        user['name'],#author_name
        user['username'],#author_username
        user['id'],#author_id
        data['published_at'],#published_at
        '+'.join(tag_names),#tag_names
        '+'.join(keywords_for_search),#keywords_for_search
        data['comments_count'],#comments_count
        data['public_reactions_count'],#public_reactions_count
        data['highlight'],#highlight
    ]
            
#Custom Function
def get_unique_id(data):
    '''
    Look up the value stored under the "id" key of an article.
    Args:
        data: response data (one article dict)
    Return:
        data['id'] when the key exists; otherwise logs an error through
        cprint and returns -1
    '''
    if 'id' not in data:
        cprint('Error: "id" is not founded')
        return -1
    return data['id']
    
#Custom Function
def get_dev_to_url(per_page,page):
    '''
    Build the dev.to search-feed URL for one page of articles.
    Args:
        per_page : number of articles
        page : page number
    Return:
        The URL string. Apart from per_page and page, every query parameter
        is fixed: sorted by public_reactions_count descending, class_name
        Article, and a URL-encoded published_at[gte] value of
        2020-03-29T13:57:39Z.
    '''
    query_parts = (
        f'per_page={per_page}',
        f'page={page}',
        'sort_by=public_reactions_count',
        'sort_direction=desc',
        'approved=',
        'class_name=Article',
        'published_at%5Bgte%5D=2020-03-29T13%3A57%3A39Z',
    )
    return 'https://dev.to/search/feed_content?' + '&'.join(query_parts)


#Custom Function
def get_articles(per_page,page):
    '''
    Load one page of the dev.to search feed in the shared Selenium driver
    and return the list stored under the "result" key of the JSON payload.

    WARNING: THIS FUNCTION WILL SEND GET REQUEST
    Args:
        per_page : number of articles
        page : page number
    Returns:
        [{'reading_time': 8,
          'main_image': 'https://res.cloudinary.com/practicaldev/image/fetch/s--6dRCSMFR--/c_imagga_scale,f_auto,fl_progressive,h_420,q_auto,w_1000/https://dev-to-uploads.s3.amazonaws.com/i/m3gr1kxfgnpsjtbvt5qj.png',
          'readable_publish_date_string': "May 12 '20",
          'cloudinary_video_url': None,
          'video_duration_in_minutes': 0,
          'title': '50+ free tools and resources to create awesome user interfaces',
          'video_duration_string': '00:00',
          'tags': [{'name': 'design', 'keywords_for_search': None},
           {'name': 'webdev', 'keywords_for_search': 'web development'},
           {'name': 'css', 'keywords_for_search': None},
           {'name': 'html', 'keywords_for_search': None}],
          'path': '/davidepacilio/50-free-tools-and-resources-to-create-awesome-user-interfaces-1c1b',
          'comments_count': 78,
          'public_reactions_count': 5287,
          'id': 332325,
          'published_at': '2020-05-12T11:11:52.146Z',
          'class_name': 'Article',
          'user': {'name': 'Davide Pacilio',
           'id': 327338,
           'pro': None,
           'profile_image_90': 'https://res.cloudinary.com/practicaldev/image/fetch/s--eOvQz5aX--/c_fill,f_auto,fl_progressive,h_90,q_auto,w_90/https://dev-to-uploads.s3.amazonaws.com/uploads/user/profile_image/327338/960ca0da-3b56-4ff6-a6c2-096e820f97dd.jpeg',
           'username': 'davidepacilio'},
          'tag_list': ['design', 'webdev', 'css', 'html'],
          'flare_tag': None,
          'user_id': 327338,
          'highlight': None,
          'readable_publish_date': "May 12 '20",
          'podcast': {'slug': None,
           'image_url': 'https://res.cloudinary.com/practicaldev/image/fetch/s--6dRCSMFR--/c_imagga_scale,f_auto,fl_progressive,h_420,q_auto,w_1000/https://dev-to-uploads.s3.amazonaws.com/i/m3gr1kxfgnpsjtbvt5qj.png',
           'title': '50+ free tools and resources to create awesome user interfaces'},
          '_score': None,
          'published_at_int': 1589281912,
          'published_timestamp': '2020-05-12T11:11:52.146Z'}]
    Return False if failed (no page body, body is not valid JSON, empty
    payload, or the payload has no "result" key)
    '''
    driver.get(get_dev_to_url(per_page,page))
    # Fixed pause after navigation - presumably gives the browser time to
    # finish rendering the response before page_source is read.
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    # soup.body is None when the page has no <body>; .string is None when the
    # body does not resolve to a single text node. Both mean "no usable data".
    payload = soup.body.string if soup.body is not None else None
    if payload is None:
        cprint('Request Failed : No response data','[Selenium]')
        return False
    try:
        res = json.loads(str(payload))
    except json.JSONDecodeError as err:
        # e.g. an HTML error page instead of the expected JSON document
        cprint('Request Failed : Response is not valid JSON','[Selenium]',str(err))
        return False
    if not res:
        cprint('Request Failed : No response data','[Selenium]')
        return False
    if 'result' not in res:
        cprint('Request Failed : Key "result" do not exist','[Selenium]')
        return False
    cprint('Send Request successfully with para:','[Selenium]',per_page,page)
    return res['result']
    

    
#test API
# get_articles(1,1)

# Ids already saved by earlier runs. Values are kept as the strings read
# from the id CSV file.
unique_ids=set()
if not os.path.exists(ID_FILE_PATH) or not os.path.getsize(ID_FILE_PATH):
    # Missing or blank file: create it with the "id" header so that a later
    # csv.DictReader does not mistake the first stored id for the header row.
    # newline='' is what the csv module requires for files it reads/writes.
    with open(ID_FILE_PATH, mode='w',encoding="utf-8",newline='') as id_file_w:
        csv.writer(id_file_w).writerow(['id'])
    cprint('Empty unique id file, write header','[Unique ID]')
else:
    with open(ID_FILE_PATH, mode='r',encoding="utf-8",newline='') as id_file_r:
        csv_reader = csv.DictReader(id_file_r)
        # fieldnames is taken from the header row of the file.
        cprint(f'Processing unique id file with index: {", ".join(csv_reader.fieldnames or [])}','[Unique ID]')
        id_row_count = 0
        for row in csv_reader:
            unique_ids.add(row["id"])
            id_row_count += 1
    # Count the data rows directly so a header-only file reports 0, not -1.
    cprint(f'Unique id file processed successfully with {id_row_count} ids.','[Unique ID]')

#start add new data
# Counters live outside the with-block so the summary below can always read them.
row_counter=0  # rows appended to the data file during this run
id_counter=0   # articles skipped because their id was already known
# newline='' is what the csv module requires; without it every row is followed
# by a blank line on Windows.
with open(DATA_FILE_PATH,"a+",encoding="utf-8",newline='') as csvfile_a, open(ID_FILE_PATH, mode='a+',encoding="utf-8",newline='') as id_file_a:
    writer = csv.writer(csvfile_a)
    id_writer = csv.writer(id_file_a)
    # Freshly created (or blank) files get their header row first.
    if not os.path.getsize(DATA_FILE_PATH):
        writer.writerow(DATA_FILE_HEADER)
    if not os.path.getsize(ID_FILE_PATH):
        id_writer.writerow(['id'])
    #request
    for page in range (100,120):
        articles = get_articles(20,page)
        if not articles:
            # get_articles returns False on failure; iterating it would crash.
            cprint('No articles returned, skip page:','[CSV]',page)
            continue
        for article in articles:
            idx = get_unique_id(article)
            if idx == -1:
                # get_unique_id already logged the missing "id"; nothing to save.
                continue
            # Ids loaded from the id file are strings while the response
            # carries ints, so compare on the string form.
            id_key = str(idx)
            if id_key in unique_ids:
                id_counter+=1
                continue
            # Build the row first so a malformed article cannot leave an id
            # recorded without its matching data row.
            row = prase_data(article)
            id_writer.writerow([idx])
            writer.writerow(row)
            # Remember the id so a repeat later in this same run is skipped too.
            unique_ids.add(id_key)
            row_counter+=1
cprint('Done')
cprint('Appended row number:','[CSV]',row_counter)
cprint('Skipped data number: ','[CSV]',id_counter)
# resp = requests.get(get_dev_to_url(1,1))
# if resp.status_code != 200:
#     # This means something went wrong.
#     raise ApiError('GET Status Code:{}'.format(resp.status_code))
# res = resp.json():



[1;32;43m [[Unique ID]] [0m Processing unique id file with index: id |2021-03-31 15:33:30|
[1;32;43m [[Unique ID]] [0m Unique id file processed successfully with 1980 ids. |2021-03-31 15:33:30|
[1;32;43m [[Selenium]] [0m Send Request successfully with para:[1;35m(20, 100) [0m |2021-03-31 15:33:33|
[1;32;43m [[Selenium]] [0m Send Request successfully with para:[1;35m(20, 101) [0m |2021-03-31 15:33:36|
[1;32;43m [[Selenium]] [0m Send Request successfully with para:[1;35m(20, 102) [0m |2021-03-31 15:33:39|
[1;32;43m [[Selenium]] [0m Send Request successfully with para:[1;35m(20, 103) [0m |2021-03-31 15:33:42|
[1;32;43m [[Selenium]] [0m Send Request successfully with para:[1;35m(20, 104) [0m |2021-03-31 15:33:45|
[1;32;43m [[Selenium]] [0m Send Request successfully with para:[1;35m(20, 105) [0m |2021-03-31 15:33:48|
[1;32;43m [[Selenium]] [0m Send Request successfully with para:[1;35m(20, 106) [0m |2021-03-31 15:33:50|
[1;32;43m [[Selenium]] [0m Send Reque