In [11]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import os
import time

start_url = "https://atcddd.fhi.no/atc_ddd_index/"

def crawler_base():
    data = pd.DataFrame({"atc_code": [],
                    "name": [],
                    "url": []})
    
    res = requests.get(start_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    tag0 = soup.select("""#content > div:nth-child(5) > div:nth-child(2) > p""")
    temp = int(len(tag0[0].contents)/3)
    
    for i in range(temp):
        temp_data = [None] * 3
        char_index = i * 3
        name_index = char_index + 1
    
        char = tag0[0].contents[char_index].strip()
        name =  tag0[0].contents[name_index].text
        part_url = tag0[0].contents[name_index].a.get("href")
        full_url = start_url + part_url[2:]
        
        temp_data[0] = char
        temp_data[1] = name
        temp_data[2] = full_url
    
        data.loc[i] = temp_data

    data.to_csv(f"dataset/whocc_lvl{1}.csv", header=True, index=False)

def crawler_core(lvl, url):
    data_handler = pd.DataFrame({"atc_code": [], "name": [], "url": []})
    
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    if lvl == 2 or lvl == 3 or lvl == 4:
        if lvl == 2:
            tag0 = soup.select("""#content > p:nth-child(4)""")
    
        if lvl == 3:
            tag0 = soup.select("""#content > p:nth-child(6)""")
    
        if lvl == 4:
            tag0 = soup.select("""#content > p:nth-child(8)""")
    
        temp = int((len(tag0[0].contents) - 3 ) / 3) 
    
        for i in range(temp):
            temp_data = [None] * 3
            char_index = i * 3
            name_index = char_index + 1
        
            char = tag0[0].contents[char_index].strip().replace('\n', '')
            name =  tag0[0].contents[name_index].text
            part_url = tag0[0].contents[name_index].a.get("href")
            full_url = start_url + part_url[2:]
            
            temp_data[0] = char
            temp_data[1] = name
            temp_data[2] = full_url
        
            data_handler.loc[i] = temp_data

    if lvl == 5:
        tag0 = soup.select("""#content""")
        try:
            temp = tag0[0].ul.table.contents
            for i in range(3, len(temp)):
                temp_data = [None] * 3
                
                char = temp[i].text.replace("\xa0", " ").strip()[:7]
                name = temp[i].text.replace("\xa0", " ").strip()[8:]
                full_url = None
                
                temp_data[0] = char
                temp_data[1] = name
                temp_data[2] = full_url
    
                data_handler.loc[i] = temp_data
        except AttributeError:
            pass
                
    return data_handler

def whoccCrawler(lvl=5):
    start_url = "https://atcddd.fhi.no/atc_ddd_index/"
    
    print(time.ctime(time.time()))
    if not os.path.exists("dataset"):
        os.mkdir("dataset")
        
    if lvl == 1:
        crawler_base()
        return
        
    for i in range(lvl, 0, -1):
        if i == 1:
            crawler_base()
            whoccCrawler()
        try:
            upper_lvl = pd.read_csv(f"dataset/whocc_lvl{i-1}.csv")
            print(f"lvl found: {i-1}")
            
            df_list = []
            for n in tqdm(range(len(upper_lvl["url"]))):
                temp = crawler_core(lvl=i, url=upper_lvl["url"][n])
                df_list.append(temp)

            data = pd.concat(df_list, ignore_index=True)
            
            data.to_csv(f"dataset/whocc_lvl{i}.csv",  header=True, index=False)
            if i == lvl:
                break
            else:
                whoccCrawler()

        except FileNotFoundError:
            print(f"lvl {i-1} not found")
            continue

    print(time.ctime(time.time()))

In [12]:
whoccCrawler()

Thu Mar 21 19:16:53 2024
lvl 4 not found
lvl 3 not found
lvl 2 not found
lvl 1 not found
Thu Mar 21 19:16:55 2024
lvl 4 not found
lvl 3 not found
lvl 2 not found
lvl found: 1


100%|████████████████████████████████████████████████████| 14/14 [00:13<00:00,  1.06it/s]


Thu Mar 21 19:17:08 2024
lvl 4 not found
lvl 3 not found
lvl found: 2


 34%|█████████████████▋                                  | 32/94 [00:31<01:00,  1.02it/s]


KeyboardInterrupt: 