In [1]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Bootstrap the handles the rest of the notebook uses. Order matters:
# each object below is built from the one created just before it.

# Reuse the SparkContext already attached to this kernel, or create one.
sc = SparkContext.getOrCreate()
# Wrap the Spark context in a GlueContext.
glueContext = GlueContext(sc)
# Spark session exposed by the Glue context.
spark = glueContext.spark_session
# Glue Job handle constructed from the same Glue context.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.7 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 5.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: e1369e33-9643-4405-aaa9-2ad89ad0c854
Applying the following default arguments:
--glue_kernel_version 1.0.7
--enable-glue-datacatalog true
Waiting for session e1369e33-9643-4405-aaa9-2ad89ad0c854 to get into ready status...
Session e1369e33-9643-4405-aaa9-2ad89ad0c854 ha

In [11]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime, timedelta
import time
import boto3
from io import StringIO




In [4]:
# Collection window: yesterday through today, inclusive.
# The clock is read once and truncated to midnight so that both bounds are
# exactly one day apart and carry no time-of-day component. The bounds are
# still datetime objects, so strftime() and timedelta arithmetic keep working.
end_date = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)  # today at 00:00
start_date = end_date - timedelta(days=1)  # yesterday at 00:00




In [5]:
# Fetch one XML document per day in [start_date, end_date] from the Jeju
# air-quality REST endpoint and flatten the <list> elements into `df`.

# Base URL; the date in YYYYMMDD form is appended to it.
base_url = "http://air.jeju.go.kr/rest/JejuAirService/getJejuAirList/?date="
# One dict per parsed <list> element, or a DATE-only placeholder for an empty day.
all_data = []
no_data_count = 0  # consecutive days whose response contained no <list> elements
max_retries = 5  # attempts per date before giving up on that date
retry_delay = 3  # seconds to wait before the next attempt
max_empty_days = 30  # abort the whole collection after this many consecutive empty days
stop_collection = False  # set when the empty-day limit is reached

# Walk the date range one day at a time.
current_date = start_date
while current_date <= end_date:
    date_str = current_date.strftime("%Y%m%d")
    url = f"{base_url}{date_str}"
    print(f"Fetching data for date: {date_str}")

    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
            if response.status_code != 200:
                print(f"Failed to fetch data for date: {date_str}, status code: {response.status_code}")
                # Back off before retrying instead of immediately re-hitting the server.
                time.sleep(retry_delay)
                continue
            root = ET.fromstring(response.text)
        except (requests.exceptions.RequestException, ET.ParseError) as e:
            # Network errors and malformed XML are both treated as a failed attempt.
            print(f"Attempt {attempt + 1} failed for date {date_str}: {e}")
            time.sleep(retry_delay)
            continue

        items = root.findall(".//list")
        if not items:  # the day has no measurements
            print(f"No data found for date: {date_str}")
            no_data_count += 1
            all_data.append({"DATE": date_str})
            if no_data_count >= max_empty_days:
                print(f"No data found for {max_empty_days} consecutive days. Stopping data collection.")
                # A bare `break` here would only leave the retry loop, so the
                # decision is recorded in a flag and acted on after the loop.
                stop_collection = True
        else:
            no_data_count = 0  # data found, so the empty-day streak is over
            for item in items:
                all_data.append({
                    "DATE": date_str,
                    # findtext() yields "" when the child element is missing or empty.
                    "SITE": item.findtext("SITE", default=""),
                    "PM10": item.findtext("PM10", default=""),
                    "TEMP": item.findtext("TEMP", default=""),
                    # add further fields here as needed
                })
        break  # this date was handled; no more attempts needed
    else:
        # Reached only when every attempt failed (the loop never hit `break`).
        print(f"Max retries exceeded for date: {date_str}")

    if stop_collection:
        break

    # Advance to the next day, pausing briefly between requests.
    current_date += timedelta(days=1)
    time.sleep(1)

# Build the DataFrame once from the accumulated records.
df = pd.DataFrame(all_data)

print(df)

Fetching data for date: 20240101
Fetching data for date: 20240102
Fetching data for date: 20240103
Fetching data for date: 20240104
Fetching data for date: 20240105
Fetching data for date: 20240106
Fetching data for date: 20240107
Fetching data for date: 20240108
Fetching data for date: 20240109
Fetching data for date: 20240110
Fetching data for date: 20240111
Fetching data for date: 20240112
Fetching data for date: 20240113
Fetching data for date: 20240114
Fetching data for date: 20240115
Fetching data for date: 20240116
Fetching data for date: 20240117
Fetching data for date: 20240118
Fetching data for date: 20240119
Fetching data for date: 20240120
Fetching data for date: 20240121
Fetching data for date: 20240122
Fetching data for date: 20240123
Fetching data for date: 20240124
Fetching data for date: 20240125
Fetching data for date: 20240126
Fetching data for date: 20240127
Fetching data for date: 20240128
Fetching data for date: 20240129
Fetching data for date: 20240130
Fetching d

In [6]:
# Destination for the S3 upload.
bucket_name = 'airflow-test001'
# Embed the start date as YYYYMMDD. Interpolating the datetime directly would
# use its str() form, which contains a space, colons and (possibly) microseconds.
file_name = f'data/jeju-air-info/jeju-air-info{start_date:%Y%m%d}.csv'




In [12]:
# Serialize `df` to CSV in memory and upload it to s3://{bucket_name}/{file_name}.

# Create the boto3 S3 client.
s3_client = boto3.client('s3')

# Write the DataFrame as CSV (without the index column) into an in-memory text buffer.
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)

# Rewind so that any later reader of csv_buffer starts at the beginning;
# getvalue() below returns the full contents regardless of the position.
csv_buffer.seek(0)

# Upload as explicit UTF-8 bytes and declare the charset, so the encoding of
# the stored object does not depend on an implicit str-to-bytes conversion.
s3_client.put_object(
    Bucket=bucket_name,
    Key=file_name,
    Body=csv_buffer.getvalue().encode("utf-8"),
    ContentType="text/csv; charset=utf-8",
)

print(f"파일이 {bucket_name} 버킷에 {file_name}로 업로드되었습니다.")

파일이 airflow-test001 버킷에 data/jeju-air-info/jeju-air-info.csv로 업로드되었습니다.
