In [10]:
%help


# Available Magic Commands

## Sessions Magic

----
    %help                             Return a list of descriptions and input types for all magic commands. 
    %profile            String        Specify a profile in your aws configuration to use as the credentials provider.
    %region             String        Specify the AWS region in which to initialize a session. 
                                      Default from ~/.aws/config on Linux or macOS, 
                                      or C:\Users\ USERNAME \.aws\config" on Windows.
    %idle_timeout       Int           The number of minutes of inactivity after which a session will timeout. 
                                      Default: 2880 minutes (48 hours).
    %timeout            Int           The number of minutes after which a session will timeout. 
                                      Default: 2880 minutes (48 hours).
    %session_id_prefix  String        Define a String that will precede all session IDs in the format 
                                      [session_id_prefix]-[session_id]. If a session ID is not provided,
                                      a random UUID will be generated.
    %status                           Returns the status of the current Glue session including its duration, 
                                      configuration and executing user / role.
    %session_id                       Returns the session ID for the running session.
    %list_sessions                    Lists all currently running sessions by ID.
    %stop_session                     Stops the current session.
    %glue_version       String        The version of Glue to be used by this session. 
                                      Currently, the only valid options are 2.0, 3.0 and 4.0. 
                                      Default: 2.0.
    %reconnect          String        Specify a live session ID to switch/reconnect to the sessions.
----

## Selecting Session Types

----
    %streaming          String        Sets the session type to Glue Streaming.
    %etl                String        Sets the session type to Glue ETL.
    %glue_ray           String        Sets the session type to Glue Ray.
    %session_type       String        Specify a session_type to be used. Supported values: streaming, etl and glue_ray. 
----

## Glue Config Magic 
*(common across all session types)*

----

    %%configure         Dictionary    A json-formatted dictionary consisting of all configuration parameters for 
                                      a session. Each parameter can be specified here or through individual magics.
    %iam_role           String        Specify an IAM role ARN to execute your session with.
                                      Default from ~/.aws/config on Linux or macOS, 
                                      or C:\Users\%USERNAME%\.aws\config` on Windows.
    %number_of_workers  int           The number of workers of a defined worker_type that are allocated 
                                      when a session runs.
                                      Default: 5.
    %additional_python_modules  List  Comma separated list of additional Python modules to include in your cluster 
                                      (can be from Pypi or S3).
    %%tags        Dictionary          Specify a json-formatted dictionary consisting of tags to use in the session.
    
    %%assume_role Dictionary, String  Specify a json-formatted dictionary or an IAM role ARN string to create a session 
                                      for cross account access.
                                      E.g. {valid arn}
                                      %%assume_role 
                                      'arn:aws:iam::XXXXXXXXXXXX:role/AWSGlueServiceRole' 
                                      E.g. {credentials}
                                      %%assume_role
                                      {
                                            "aws_access_key_id" : "XXXXXXXXXXXX",
                                            "aws_secret_access_key" : "XXXXXXXXXXXX",
                                            "aws_session_token" : "XXXXXXXXXXXX"
                                       }
----

                                      
## Magic for Spark Sessions (ETL & Streaming)

----
    %worker_type        String        Set the type of instances the session will use as workers. 
    %connections        List          Specify a comma separated list of connections to use in the session.
    %extra_py_files     List          Comma separated list of additional Python files From S3.
    %extra_jars         List          Comma separated list of additional Jars to include in the cluster.
    %spark_conf         String        Specify custom spark configurations for your session. 
                                      E.g. %spark_conf spark.serializer=org.apache.spark.serializer.KryoSerializer
----
                                      
## Magic for Ray Session

----
    %min_workers        Int           The minimum number of workers that are allocated to a Ray session. 
                                      Default: 1.
    %object_memory_head Int           The percentage of free memory on the instance head node after a warm start. 
                                      Minimum: 0. Maximum: 100.
    %object_memory_worker Int         The percentage of free memory on the instance worker nodes after a warm start. 
                                      Minimum: 0. Maximum: 100.
----

## Action Magic

----

    %%sql               String        Run SQL code. All lines after the initial %%sql magic will be passed
                                      as part of the SQL code.  
    %matplot      Matplotlib figure   Visualize your data using the matplotlib library.
                                      E.g. 
                                      import matplotlib.pyplot as plt
                                      # Set X-axis and Y-axis values
                                      x = [5, 2, 8, 4, 9]
                                      y = [10, 4, 8, 5, 2]
                                      # Create a bar chart 
                                      plt.bar(x, y) 
                                      # Show the plot
                                      %matplot plt    
    %plotly            Plotly figure  Visualize your data using the plotly library.
                                      E.g.
                                      import plotly.express as px
                                      #Create a graphical figure
                                      fig = px.line(x=["a","b","c"], y=[1,3,2], title="sample figure")
                                      #Show the figure
                                      %plotly fig

  
                
----



In [2]:
# yfinance 모듈 추가
%additional_python_modules yfinance==0.2.48

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Additional python modules to be included:
yfinance==0.2.48


In [1]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
import pandas as pd
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

import yfinance as yf
  
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 273ee12e-e078-4f41-90d5-a6949c022813
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
--additional-python-modules yfinance==0.2.48
Waiting for session 273ee12e-e078-4f41-90d5-a6949c022813 to get into ready status...
Session 273ee12e-e078-4f41-90d5-a6949c022813 has been created.



In [5]:
# 수집할 종목 리스트 (예시: SK하이닉스)
symbol = '000660.KS'
ticker = yf.Ticker(symbol)
# 오늘부터 5년전 데이터 조회
from datetime import datetime

startMd = '01-01'
endMd = '12-31'
dfs = []

# 현재 연도를 가져와서 5년 전부터 현재 연도까지 반복
current_year = datetime.now().year

for year in range(current_year - 5, current_year + 1):
    # 연도별 데이터 조회
    df_year = ticker.history(interval='1d', period='1y', start=f"{year}-{startMd}", end=f"{year}-{endMd}", auto_adjust=False)
    dfs.append(df_year)

df = pd.concat(dfs)
df.reset_index(inplace=True)

# pandas Dataframe을 Spark DataFrame으로 변환
spark_df = spark.createDataFrame(df)
spark_df.show()
spark_df.printSchema()

+-------------------+-------+-------+-------+-------+--------------+-------+---------+------------+
|               Date|   Open|   High|    Low|  Close|     Adj Close| Volume|Dividends|Stock Splits|
+-------------------+-------+-------+-------+-------+--------------+-------+---------+------------+
|2019-01-01 15:00:00|61300.0|61400.0|60000.0|60600.0| 56981.5234375|1934295|      0.0|         0.0|
|2019-01-02 15:00:00|60200.0|60300.0|57500.0|57700.0|54254.68359375|3337621|      0.0|         0.0|
|2019-01-03 15:00:00|57500.0|58500.0|56700.0|58300.0|  54818.859375|3388087|      0.0|         0.0|
|2019-01-06 15:00:00|59000.0|59700.0|58400.0|58700.0|55194.96484375|2273750|      0.0|         0.0|
|2019-01-07 15:00:00|57900.0|60500.0|57600.0|59200.0| 55665.1171875|3062192|      0.0|         0.0|
|2019-01-08 15:00:00|59600.0|63800.0|59400.0|63600.0|59802.38671875|4928656|      0.0|         0.0|
|2019-01-09 15:00:00|64100.0|65600.0|64000.0|65300.0|61400.87890625|6034432|      0.0|         0.0|


In [6]:
# S3에 parquet 형식으로 저장
s3_output_path = 's3://jackie-python-lib/stock_data/'
final_path = f"{s3_output_path}{symbol}"




In [7]:
# parquet 형식으로 저장
spark_df.write.mode('overwrite').parquet(final_path)

print(f"데이터 저장 완료: {final_path}")

데이터 저장 완료: s3://jackie-python-lib/stock_data/000660.KS


In [8]:
# 종료
job.commit()


