# Crawling Main

In [None]:
import datetime
import requests
from bs4 import BeautifulSoup

class Baekjoon():
    """Crawl acmicpc.net for the accepted submissions of every group member."""

    # get today's date
    def get_date(self):
        """Return today's local date formatted as ``YYYY-MM-DD``."""
        return datetime.datetime.now().strftime('%Y-%m-%d')

    # get info from web
    def get_info(self, subject, URL, selector):
        """Download ``URL`` and extract the entries for ``subject``.

        Args:
            subject: ``'users'`` to collect member names, or ``'problems'``
                to collect accepted submission rows.
            URL: Address of the page to download.
            selector: CSS selector of the container element to search in.

        Returns:
            For ``'users'``: a list with the link text found under the
            ``h4`` of every ``div.member`` inside the container.
            For ``'problems'``: a list of the ``tr`` tags inside the
            container that hold a ``span.result-ac``.
            An empty list when ``selector`` matches nothing on the page.

        Raises:
            ValueError: If ``subject`` is not ``'users'`` or ``'problems'``.
        """
        # Reject unsupported subjects before spending a network request.
        if subject not in ('users', 'problems'):
            raise ValueError(
                f"subject must be 'users' or 'problems', got {subject!r}"
            )

        res = requests.get(URL)
        soup = BeautifulSoup(res.text, 'html.parser')
        container = soup.select_one(selector)
        # select_one() returns None when nothing matches; report "no entries"
        # instead of failing with an AttributeError on None.
        if container is None:
            return []

        # users info
        if subject == 'users':
            members = container.find_all('div', {'class': 'member'})
            return [member.h4.a.text for member in members]

        # problems info: keep the whole row when it carries an accepted mark
        return [
            row for row in container.find_all('tr')
            if row.find('span', {'class': 'result-ac'}) is not None
        ]

    # get user ids -> returns (list of user ids, dict of key: user_id, val: [])
    def get_user_ids(self, group_id=10060):
        """Fetch the member list of a group.

        Args:
            group_id: Identifier placed at the end of the group member URL.

        Returns:
            A tuple ``(user_ids, dic)`` where ``user_ids`` is the list of
            member names and ``dic`` maps each name to a new empty list
            that will hold that user's solved problems.
        """
        user_URL = f'https://www.acmicpc.net/group/member/{group_id}'
        user_selector = 'body > div.wrapper > div.container.content > div.row > div:nth-child(5)'

        user_ids = self.get_info('users', user_URL, user_selector)

        # initialize dictionary to record today's solves
        dic = {user_id: [] for user_id in user_ids}
        return user_ids, dic

    # get results for each user
    def get_all_results(self, max_per_user=6):
        """Collect the accepted submissions of every group member.

        Args:
            max_per_user: Upper bound on the accepted rows processed per
                user (the rows are taken in page order). ``None`` removes
                the bound.

        Returns:
            A dict mapping each user id to a list of dicts with the keys
            ``question_title``, ``question_number``, ``question_tier``,
            ``question_site`` and ``solved_time``.
        """
        user_ids, user_dict = self.get_user_ids()
        grading_selector = '#status-table > tbody'

        for user_id in user_ids:
            grading_URL = f'https://www.acmicpc.net/status?problem_id=&user_id={user_id}&language_id=-1&result_id=-1'
            prob_infos = self.get_info('problems', grading_URL, grading_selector)

            # For testing, look at only the first few accepted rows per user.
            for prob_info in prob_infos[:max_per_user]:
                # get prob infos
                prob = prob_info.find('a', {'class': 'problem_title'})
                prob_title = prob.get('title')
                prob_num = prob.text
                solved_time = prob_info.find('a', {'class': 'real-time-update'}).get('title')

                # get tier info (one extra request to solved.ac per row)
                prob_tier = Solved(prob_num).get_tier().get('alt')

                # make dict and add to this user's list
                # TODO: decide how solved_time should be processed
                prob_info_dict = {
                    'question_title': prob_title,
                    'question_number': prob_num,
                    'question_tier': prob_tier,
                    # presumably 'B' stands for Baekjoon; confirm with the consumer
                    'question_site': 'B',
                    'solved_time': solved_time,
                }
                user_dict[user_id].append(prob_info_dict)
        return user_dict
    
class Solved():
    """Look up the tier badge of one problem in the solved.ac search results."""

    # solved.ac search address; the problem number is appended as the query
    URL = 'https://solved.ac/search?query='
    # container of the search result page that is searched for the problem link
    selector = '#__next > div.contents > div:nth-child(3) > div:nth-child(2) > div > div.StickyTable__Wrapper-akg1ak-3.tcQcH.sticky-table > div > div:nth-child(2)'
    # prefix of the acmicpc.net problem page; used to recognise the problem link
    baek_URL = 'https://www.acmicpc.net/problem/'

    def __init__(self, prob_num):
        """Store the problem number as a string."""
        self.prob_num = str(prob_num)

    @property
    def search_url(self):
        """Search URL for this problem, derived without mutating ``URL``."""
        return self.URL + self.prob_num

    @property
    def problem_url(self):
        """acmicpc.net problem URL that the result link is matched against."""
        return self.baek_URL + self.prob_num

    def get_tier(self):
        """Return the ``img`` tag inside the search-result link to this problem.

        The URLs are derived on every call rather than appended to
        ``self.URL`` / ``self.baek_URL``, so calling this method more than
        once on the same instance keeps requesting the same address.

        Raises:
            AttributeError: If the result container or the link to the
                problem is not present in the downloaded page.
        """
        res = requests.get(self.search_url)
        soup = BeautifulSoup(res.text, 'html.parser')
        results = soup.select_one(self.selector)
        problem_link = results.find('a', {'href': self.problem_url})
        return problem_link.img

In [None]:
# Run the full crawl; as the last expression of the cell, the resulting
# per-user dictionary is displayed as the cell output.
baek = Baekjoon()

baek.get_all_results()

# Selenium Test

In [None]:
import datetime
import requests

from pprint import pprint
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# NOTE(review): recent Selenium releases no longer ship webdriver.PhantomJS;
# confirm the installed version or switch to a headless browser driver.
driver = webdriver.PhantomJS()
try:
    driver.get("https://www.acmicpc.net/status?problem_id=&user_id=rhyuys115&language_id=-1&result_id=-1")

    wait = WebDriverWait(driver, 10)

    # click proceed
    # NOTE(review): confirm this page really shows a "Proceed" link and an
    # element with id "workskin"; each wait below gives up after 10 seconds.
    proceed = wait.until(EC.presence_of_element_located((By.LINK_TEXT, "Proceed")))
    proceed.click()

    # wait for the content to be present
    wait.until(EC.presence_of_element_located((By.ID, "workskin")))

    # Copy the markup out before the browser is shut down.
    page_source = driver.page_source
finally:
    # Always end the browser session, even when a wait times out.
    driver.quit()

soup = BeautifulSoup(page_source, "html.parser")
soup.prettify()

In [None]:
# Fetch one user's status page with plain requests and look for the <img>
# in the third cell of a specific submission row.
selector = '#solution-25830320 > td:nth-child(3) > img'
url = 'https://www.acmicpc.net/status?problem_id=&user_id=rhyuys115&language_id=-1&result_id=-1'
res = requests.get(url)
soup = BeautifulSoup(res.text, 'html.parser')
# select_one() yields None when the selector matches nothing, so this prints
# either the matched tag or "None".
soup = soup.select_one(selector)
print(soup)

In [None]:
# Scratch dictionary for the slicing experiment in the next cell.
a = {'a': 'asdf', 'b': 'qwer', 'c': 'wret'}

In [None]:
# NOTE: if `a` is still the dict from the previous cell this raises an error,
# because dictionaries do not support slicing.
a[:-1]

In [None]:
import datetime

In [None]:
# A datetime built from year/month/day only; its hour defaults to 0.
a = datetime.datetime(1998, 8, 9)
a.hour

In [None]:
# Check that int() accepts a zero-padded decimal string (evaluates to 8).
int('08')

In [None]:
import datetime
# Turn a "YYYY-MM-DD HH:MM:SS" timestamp into a datetime with minute
# precision; the seconds field is deliberately left out.
solved_time = '2021-01-28 20:56:53'
date_part, clock_part = solved_time.split(' ')
year, month, day = (int(piece) for piece in date_part.split('-'))
hour, minute = (int(piece) for piece in clock_part.split(':')[:2])
a = datetime.datetime(year, month, day, hour, minute)
a

In [None]:
# Hour of whatever datetime is currently bound to `a`.
a.hour

# datetime test

In [None]:
import datetime
# First day of the current month: step back (day-of-month - 1) days.
# today() is read once so `d` and `nday` are derived from the same date even
# if the calendar day changes while the cell runs.
today = datetime.date.today()
d = today.day - 1
nday = today - datetime.timedelta(days=d)
nday

# Multithreading Test

In [2]:
import requests
from bs4 import BeautifulSoup
import time

BASE_URL = "https://news.ycombinator.com/"
STORY_LINKS = []

# Visit ten listing pages and gather every story link whose href contains
# "http", pausing a quarter of a second between requests.
# NOTE(review): the page counter starts at 0; confirm how the site numbers
# its first page.
for i in range(10):
    resp = requests.get(f"{BASE_URL}news?p={i}")
    soup = BeautifulSoup(resp.content, "html.parser")
    stories = soup.find_all("a", attrs={"class": "storylink"})
    STORY_LINKS.extend(
        story["href"] for story in stories if "http" in story["href"]
    )
    time.sleep(0.25)

# Report how many links were collected, then show the first three.
print(len(STORY_LINKS))

for url in STORY_LINKS[:3]:
    print(url)

285
https://www.aboutamazon.com/news/company-news/email-from-jeff-bezos-to-employees
https://carltheperson.com/posts/10-things-linux
https://koreajoongangdaily.joins.com/2020/12/07/business/industry/SK-hynix-NAND-Flash/20201207153100497.html


In [3]:
import time

def download_url(url):
    """Download ``url``, print how long the request took, and save the body.

    The file is written to the current directory; its name is made of the
    alphabetic characters of ``url`` followed by ``html``.
    """
    started = time.time()
    response = requests.get(url)
    elapsed = time.time() - started
    print(f"Request took {round(elapsed, 2)} seconds.")

    file_name = "".join(filter(str.isalpha, url)) + "html"

    with open(file_name, "wb") as out_file:
        out_file.write(response.content)

# Try the downloader on a single page.
download_url("https://beckernick.github.io/what-blogging-taught-me-about-software/")

Request took 0.36 seconds.


In [7]:
def download_url(url):
    """Download ``url`` into the ``urls/`` directory, then pause briefly.

    The URL is printed first. The file name is made of the alphabetic
    characters of ``url`` followed by ``html``. The ``urls/`` directory is
    created when it does not exist yet, so the write cannot fail merely
    because the folder is missing.
    """
    import os

    print(url)
    resp = requests.get(url)
    title = "".join(x for x in url if x.isalpha()) + "html"

    # open() does not create missing parent directories.
    os.makedirs('urls', exist_ok=True)
    with open('urls/'+title, "wb") as fh:
        fh.write(resp.content)

    # Short pause between consecutive downloads.
    time.sleep(0.25)
        
def download_stories(story_urls):
    """Download every URL in ``story_urls`` one after another."""
    for story_url in story_urls:
        download_url(story_url)

def main(story_urls):
    """Download ``story_urls`` sequentially and print the elapsed time."""
    started = time.time()
    download_stories(story_urls)
    elapsed = time.time() - started
    print(f"{elapsed} seconds to download {len(story_urls)} stories.")

In [8]:
# Time the sequential download of the first five collected links.
main(STORY_LINKS[:5])

https://www.aboutamazon.com/news/company-news/email-from-jeff-bezos-to-employees
https://carltheperson.com/posts/10-things-linux
https://koreajoongangdaily.joins.com/2020/12/07/business/industry/SK-hynix-NAND-Flash/20201207153100497.html
https://documentation.divio.com/
https://daveceddia.com/react-confirmation-modal-state-machine/
4.088822364807129 seconds to download 5 stories.
