In [1]:
import time
import logging
import pandas as pd
import requests
from tqdm import tqdm

In [2]:
logging.basicConfig(
    level=logging.ERROR,
    filename='app.log',
    filemode='a',
    format='%(message)s'
)

def create_url(
        country_code="2060",
        station_code="00000",
        day_start="01",
        day_end="31",
        month="01",
        year="2000",
        page="1"
):
    url = f"http://www.meteomanz.com/sy2?cou={country_code}&ty=hp&ind={station_code}&d1={day_start}&m1={month}&y1={year}&d2={day_end}&m2={month}&y2={year}&so=001&np={page}"    
    return url


def create_header(
        userAgent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        referer="http://www.meteomanz.com/",
        host="www.meteomanz.com",
        acceptLanguage="en-US,en;q=0.9",
        connection="keep-alive",
        accept="text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
):
    headers = {
        "User-Agent": userAgent,
        "Referer": referer,
        "Host": host,
        "Accept-Language": acceptLanguage,
        "Connection": connection,
        "Accept": accept
    }
    return headers

YEARS = [*map(str, range(2000, 2025, 1))]
MONTH = [*map(lambda x: str(x).zfill(2), range(1, 13, 1))]
DAY = [*map(lambda x: str(x).zfill(2), range(1, 32, 1))]
PAGE = [*map(str, range(1, 11, 1))]

In [3]:
HEADERS = create_header()

for y in tqdm(YEARS, desc="Year"):
    data = pd.DataFrame()
    for m in tqdm(MONTH, desc="Month"):
        for p in tqdm(PAGE, desc="Page"):
            URL = create_url(
                country_code="2060",
                station_code="00000",
                day_start="01",
                day_end="31",
                month=m,
                year=y,
                page=p
            )
            r = requests.get(url = URL, headers = HEADERS)
            while r.status_code != 200:
                time.sleep(3)
                logging.error(f"Status Code: {r.status_code} - {y}-{m}:{p}")
                r = requests.get(url = URL, headers = HEADERS)
            get_data = True
            while get_data:
                try:
                    df = pd.read_html(URL, storage_options=HEADERS)[0]
                    if len(df) != 0:
                        data = pd.concat([data, df])
                        data.reset_index(drop=True, inplace=True)
                    get_data = False
                except:
                    time.sleep(3)
                    r = requests.get(url = URL, headers = HEADERS)
                    logging.error(f"Status Code: {r.status_code} - {y}-{m}:{p}")
                    print(f"Status Code: {r.status_code} - {y}-{m}:{p}")
                    get_data = True
                    
                    
                    
                    
                # try:
                #     df = pd.read_html(URL, storage_options=HEADERS)[0]
                #     if len(df) != 0:
                #         data = pd.concat([data, df])
                #         data.reset_index(drop=True, inplace=True)
                # except:
                #     time.sleep(3)
                #     try:
                #         df = pd.read_html(URL, storage_options=HEADERS)[0]
                #         if len(df) != 0:
                #             data = pd.concat([data, df])
                #             data.reset_index(drop=True, inplace=True)
                #     except:
                #         logging.error(f"Error in {y}-{m}:{p}")
                #         print(f"Error in {y}-{m}:{p}")
                #         continue
    data.to_csv(f"output/{y}.csv", index=False)


Year:   0%|          | 0/25 [00:00<?, ?it/s]
Month:   0%|          | 0/12 [00:00<?, ?it/s][A

Page:   0%|          | 0/10 [00:00<?, ?it/s][A[A

Page:  10%|█         | 1/10 [00:07<01:11,  7.98s/it][A[A

Page:  20%|██        | 2/10 [00:17<01:10,  8.85s/it][A[A

Page:  30%|███       | 3/10 [00:25<01:00,  8.62s/it][A[A

Page:  40%|████      | 4/10 [00:32<00:46,  7.78s/it][A[A

Page:  50%|█████     | 5/10 [00:36<00:33,  6.61s/it][A[A

Page:  60%|██████    | 6/10 [00:39<00:21,  5.27s/it][A[A

Page:  70%|███████   | 7/10 [00:42<00:13,  4.48s/it][A[A

Page:  80%|████████  | 8/10 [00:44<00:07,  3.90s/it][A[A

Page:  90%|█████████ | 9/10 [00:47<00:03,  3.53s/it][A[A

Page: 100%|██████████| 10/10 [00:50<00:00,  5.04s/it][A[A

Month:   8%|▊         | 1/12 [00:50<09:14, 50.40s/it][A

Page:   0%|          | 0/10 [00:00<?, ?it/s][A[A

Page:  10%|█         | 1/10 [00:07<01:09,  7.70s/it][A[A

Page:  20%|██        | 2/10 [00:16<01:05,  8.21s/it][A[A

Page:  30%|███       |

Status Code: 200 - 2000-02:7




Page:  70%|███████   | 7/10 [01:23<00:49, 16.46s/it][A[A

Page:  80%|████████  | 8/10 [01:26<00:24, 12.04s/it][A[A

Page:  90%|█████████ | 9/10 [01:28<00:09,  9.04s/it][A[A

Page: 100%|██████████| 10/10 [01:31<00:00,  9.14s/it][A[A

Month:  17%|█▋        | 2/12 [02:21<12:25, 74.51s/it][A

Page:   0%|          | 0/10 [00:00<?, ?it/s][A[A

Page:  10%|█         | 1/10 [00:08<01:18,  8.67s/it][A[A

Page:  20%|██        | 2/10 [00:16<01:04,  8.12s/it][A[A

Page:  30%|███       | 3/10 [00:24<00:57,  8.17s/it][A[A

Page:  40%|████      | 4/10 [00:32<00:47,  7.87s/it][A[A

Page:  50%|█████     | 5/10 [00:41<00:41,  8.37s/it][A[A

Page:  60%|██████    | 6/10 [00:49<00:33,  8.32s/it][A[A

Page:  70%|███████   | 7/10 [00:55<00:22,  7.46s/it][A[A

Page:  80%|████████  | 8/10 [00:58<00:11,  5.99s/it][A[A

Page:  90%|█████████ | 9/10 [01:00<00:05,  5.01s/it][A[A

Page: 100%|██████████| 10/10 [01:03<00:00,  6.37s/it][A[A

Month:  25%|██▌       | 3/12 [03:25<10:26, 69.

ConnectionError: HTTPConnectionPool(host='www.meteomanz.com', port=80): Max retries exceeded with url: /sy2?cou=2060&ty=hp&ind=00000&d1=01&m1=06&y1=2000&d2=31&m2=06&y2=2000&so=001&np=8 (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x0000020651E231D0>: Failed to resolve 'www.meteomanz.com' ([Errno 11001] getaddrinfo failed)"))