In [None]:
# Department: ESTSOFT
# Class: AI Modelling
# Category: Machine learning
# Title: Data extraction
# Contributors: Jeong Gukho, Jeong Woogun, Kim Hyungeun, Kim Juneon, Kimm Soo Min
# Last modified date: 07/04/25

### **Library**

In [4]:
# Library
import os
import chardet
import pandas as pd
import numpy as np

### **Data Extraction**

**Data source**

1. 한국전력공사(KEPCO) - https://bigdata.kepco.co.kr/cmsmain.do?scode=S01&pcode=000171&redirect=Y
2. 기상청 - https://data.kma.go.kr/data/grnd/selectAsosRltmList.do?pgmNo=36
3. 공공데이터포털 - https://www.data.go.kr/data/15049904/fileData.do?recommendDataYn=Y

In [5]:
# Detect encoding
def detect_encoding(file: str, sample_size: int = 10000) -> dict:
	"""Guess the text encoding of a file from its leading bytes.

	Args:
		file: Path of the file to inspect.
		sample_size: Maximum number of bytes read from the start of the
			file and passed to chardet. Defaults to 10000.

	Returns:
		The value returned by ``chardet.detect`` for the sampled bytes.
		The same value is still printed, so calls that ignore the return
		value behave as before.
	"""
	with open(file, 'rb') as f:
		raw_data = f.read(sample_size)
	# Detection only needs the sampled bytes, so the file is closed first
	result = chardet.detect(raw_data)
	print(result)
	return result

**Extreme weather**
- 840 samples in total
- Jan 2015 to Dec 2024
- 7 different regions
 	- Seoul
	- Incheon
	- Daejeon 
	- Daegu
	- Ulsan
	- Gwangju
	- Busan
- 6 Features
	- Year
	- Month
	- Number of Tropical Nights
	- Number of Heat Wave Days
	- Number of Cold Wave Days
	- Region 

In [None]:
# Lookup from the Korean administrative names in the source file to English
# city names (values missing from the lookup come out of Series.map as NaN)
kor_to_eng = {
	"서울특별시": "Seoul",
	"부산광역시": "Busan",
	"대구광역시": "Daegu",
	"인천광역시": "Incheon",
	"광주광역시": "Gwangju",
	"대전광역시": "Daejeon",
	"울산광역시": "Ulsan",
}

# Read the EUC-KR file, relabel its six columns by position, then translate Region
df_extreme_weather = (
	pd.read_csv('data/cleaned/weather/extreme_weather.csv', encoding='euc-kr')
	.set_axis(
		[
			"Year",
			"Month",
			"Number of Tropical Nights",
			"Number of Heat Wave Days",
			"Number of Cold Wave Days",
			"Region",
		],
		axis=1,
	)
	.assign(Region=lambda frame: frame['Region'].map(kor_to_eng))
)

# Write the cleaned table as UTF-8, leaving out the row index
df_extreme_weather.to_csv('data/cleaned/weather/extreme_weather_cleaned.csv', index=False, encoding='utf-8')

**Weather forecast**
- 861 samples in total
- Jan 2015 to Mar 2025
- 7 different regions
 	- Seoul
	- Incheon
	- Daejeon 
	- Daegu
	- Ulsan
	- Gwangju
	- Busan
- 19 Features
	- Region
	- Year
	- Month
	- Avg Temperature (Celsius)
	- Avg Max Temperature (Celsius)
	- Avg Min Temperature (Celsius)
	- Avg Local Pressure (hPa)
	- Avg Sea Level Pressure (hPa)
	- Avg Vapor Pressure (hPa)
	- Avg Dew Point Temp (Celsius)
	- Avg Relative Humidity (%)
	- Monthly Precipitation (mm)
	- Small Pan Evaporation (mm)
	- Avg Wind Speed (m/s)
	- Avg Cloud Cover (1/10)
	- Sunshine Rate (%)
	- Total Solar Radiation (MJ/m^2)
	- Avg Min Supercool Temp (Celsius)
	- Avg Ground Temp (Celsius)


In [7]:
# Weather forecast
# Station names in the source file -> English city names
# (values missing from the lookup come out of Series.map as NaN)
kor_to_eng = {
	"서울": "Seoul",
	"부산": "Busan",
	"대구": "Daegu",
	"인천": "Incheon",
	"광주": "Gwangju",
	"대전": "Daejeon",
	"울산": "Ulsan",
}

df_weather_forecast = (
	pd.read_csv('data/cleaned/weather/weather_forecast.csv', encoding='euc-kr')
	# Discard the station id plus every extreme-value / date-of-occurrence column
	.drop(columns=[
		# station id, temperature extremes
		'지점', '최고기온(°C)', '최저기온(°C)', '최고기온 나타난날(yyyymmdd)', '최저기온 나타난날(yyyymmdd)',
		# sea-level pressure extremes
		'최고해면기압(hPa)', '최저해면기압(hPa)', '최고해면기압 나타난날(yyyymmdd)', '최저해면기압 나타난날(yyyymmdd)',
		# vapor pressure extremes
		'최고수증기압(hPa)', '최저수증기압(hPa)', '최고수증기압 나타난날(yyyymmdd)', '최저수증기압 나타난날(yyyymmdd)',
		# minimum relative humidity
		'최소상대습도(%)', '최소상대습도 나타난날(yyyymmdd)',
		# precipitation maxima
		'일최다강수량(mm)', '1시간최다강수량(mm)', '10분최다강수량(mm)',
		'일최다강수량 나타난날(yyyymmdd)', '1시간최다강수량 나타난날(yyyymmdd)', '10분최다강수량 나타난날(yyyymmdd)',
		# evaporation
		'소형일최대증발량(mm)', '대형총증발량(mm)', '대형일최대증발량(mm)',
		'소형일최대증발량 나타난날(yyyymmdd)', '대형일최대증발량 나타난날(yyyymmdd)',
		# wind maxima and directions
		'최대풍속(m/s)', '최대순간풍속(m/s)', '최대풍속 풍향(16방위)', '최대순간풍속 풍향(16방위)',
		'최대풍속 나타난날(yyyymmdd)', '최대순간풍속 나타난날(yyyymmdd)', '최다풍향(16방위)',
		# mid/low cloud cover, sunshine hours
		'평균중하층운량(1/10)', '합계 일조시간(hr)',
		# snow depth
		'최심적설(cm)', '최심신적설(cm)', '3시간신적설합(cm)', '최심적설 나타난날(yyyymmdd)', '최심신적설 나타난날(yyyymmdd)',
		# minimum grass temperature
		'최저초상온도(°C)', '최저초상온도 나타난날(yyyymmdd)',
		# soil temperature by depth
		'0.05m평균지중온도(°C)', '0.1m평균지중온도(°C)', '0.2m평균지중온도(°C)', '0.3m평균지중온도(°C)', '0.5m평균지중온도(°C)',
		'1.0m평균지중온도(°C)', '1.5m평균지중온도(°C)', '3.0m평균지중온도(°C)', '5.0m평균지중온도(°C)',
	])
	# Relabel the 18 remaining columns by position
	.set_axis(
		[
			"Region",
			"Date",
			"Avg Temperature (Celsius)",
			"Avg Max Temperature (Celsius)",
			"Avg Min Temperature (Celsius)",
			"Avg Local Pressure (hPa)",
			"Avg Sea Level Pressure (hPa)",
			"Avg Vapor Pressure (hPa)",
			"Avg Dew Point Temp (Celsius)",
			"Avg Relative Humidity (%)",
			"Monthly Precipitation (mm)",
			"Small Pan Evaporation (mm)",
			"Avg Wind Speed (m/s)",
			"Avg Cloud Cover (1/10)",
			"Sunshine Rate (%)",
			"Total Solar Radiation (MJ/m^2)",
			"Avg Min Supercool Temp (Celsius)",
			"Avg Ground Temp (Celsius)",
		],
		axis=1,
	)
	# Translate Region, parse Date, and split it into Year / Month
	# (assign evaluates its keywords in order, so Year and Month see the parsed Date)
	.assign(
		Region=lambda frame: frame['Region'].map(kor_to_eng),
		Date=lambda frame: pd.to_datetime(frame['Date']),
		Year=lambda frame: frame['Date'].dt.year,
		Month=lambda frame: frame['Date'].dt.month,
	)
	.drop(columns=['Date'])
)

# Write the cleaned table as UTF-8 without the row index, then summarise it
df_weather_forecast.to_csv('data/cleaned/weather/weather_forecast_cleaned.csv', index=False, encoding='utf-8')
df_weather_forecast.info()

**Electricity consumption**
- 840 samples in total
- Jan 2015 to Dec 2024
- 7 different regions
 	- Seoul
	- Incheon
	- Daejeon 
	- Daegu
	- Ulsan
	- Gwangju
	- Busan
- 5 Features
	- Year
	- Month
	- Region
	- Number of Households
	- Avg Electricity Consumption per Household (kWh)

In [8]:
# Lookup from the Korean administrative names in the source file to English
# city names (values missing from the lookup come out of Series.map as NaN)
kor_to_eng = {
	"서울특별시": "Seoul",
	"부산광역시": "Busan",
	"대구광역시": "Daegu",
	"인천광역시": "Incheon",
	"광주광역시": "Gwangju",
	"대전광역시": "Daejeon",
	"울산광역시": "Ulsan",
}

df_electricity = (
	pd.read_csv('data/cleaned/electricity/electricity_consumption.csv', encoding='euc-kr')
	# Remove the two source columns that are not carried forward
	.drop(columns=['시군구', '가구당 평균 전기요금(원)'])
	# Relabel the five remaining columns by position
	.set_axis(
		[
			"Year",
			"Month",
			"Region",
			"Number of Households",
			"Avg Electricity Consumption per Household (kWh)",
		],
		axis=1,
	)
	.assign(Region=lambda frame: frame['Region'].map(kor_to_eng))
)

# Write the cleaned table as UTF-8 without the row index, then summarise it
df_electricity.to_csv('data/cleaned/electricity/electricity_consumption_cleaned.csv', index=False, encoding='utf-8')
df_electricity.info()

**Gas supply**
- 756 samples in total
- Jan 2016 to Dec 2024
- 7 different regions
 	- Seoul
	- Incheon
	- Daejeon 
	- Daegu
	- Ulsan 
	- Gwangju
	- Busan
- 4 Features
	- Year
	- Month
	- Region
	- Total Gas Supply (ton)

### **Library**

### **Data Extraction**

In [5]:
# Detect encoding
def detect_encoding(file: str, sample_size: int = 10000) -> dict:
	"""Guess the text encoding of a file from its leading bytes.

	Args:
		file: Path of the file to inspect.
		sample_size: Maximum number of bytes read from the start of the
			file and passed to chardet. Defaults to 10000.

	Returns:
		The value returned by ``chardet.detect`` for the sampled bytes.
		The same value is still printed, so calls that ignore the return
		value behave as before.
	"""
	with open(file, 'rb') as f:
		raw_data = f.read(sample_size)
	# Detection only needs the sampled bytes, so the file is closed first
	result = chardet.detect(raw_data)
	print(result)
	return result

In [None]:
# Lookup from the Korean administrative names in the source file to English
# city names (values missing from the lookup come out of Series.map as NaN)
kor_to_eng = {
	"서울특별시": "Seoul",
	"부산광역시": "Busan",
	"대구광역시": "Daegu",
	"인천광역시": "Incheon",
	"광주광역시": "Gwangju",
	"대전광역시": "Daejeon",
	"울산광역시": "Ulsan",
}

# Read the EUC-KR file, relabel its six columns by position, then translate Region
df_extreme_weather = (
	pd.read_csv('data/cleaned/weather/extreme_weather.csv', encoding='euc-kr')
	.set_axis(
		[
			"Year",
			"Month",
			"Number of Tropical Nights",
			"Number of Heat Wave Days",
			"Number of Cold Wave Days",
			"Region",
		],
		axis=1,
	)
	.assign(Region=lambda frame: frame['Region'].map(kor_to_eng))
)

# Write the cleaned table as UTF-8, leaving out the row index
df_extreme_weather.to_csv('data/cleaned/weather/extreme_weather_cleaned.csv', index=False, encoding='utf-8')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year                       840 non-null    int64  
 1   Month                      840 non-null    int64  
 2   Number of Tropical Nights  840 non-null    float64
 3   Number of Heat Wave Days   840 non-null    float64
 4   Number of Cold Wave Days   840 non-null    float64
 5   Region                     840 non-null    object 
dtypes: float64(3), int64(2), object(1)
memory usage: 39.5+ KB


In [7]:
# Weather forecast
# Station names in the source file -> English city names
# (values missing from the lookup come out of Series.map as NaN)
kor_to_eng = {
	"서울": "Seoul",
	"부산": "Busan",
	"대구": "Daegu",
	"인천": "Incheon",
	"광주": "Gwangju",
	"대전": "Daejeon",
	"울산": "Ulsan",
}

df_weather_forecast = (
	pd.read_csv('data/cleaned/weather/weather_forecast.csv', encoding='euc-kr')
	# Discard the station id plus every extreme-value / date-of-occurrence column
	.drop(columns=[
		# station id, temperature extremes
		'지점', '최고기온(°C)', '최저기온(°C)', '최고기온 나타난날(yyyymmdd)', '최저기온 나타난날(yyyymmdd)',
		# sea-level pressure extremes
		'최고해면기압(hPa)', '최저해면기압(hPa)', '최고해면기압 나타난날(yyyymmdd)', '최저해면기압 나타난날(yyyymmdd)',
		# vapor pressure extremes
		'최고수증기압(hPa)', '최저수증기압(hPa)', '최고수증기압 나타난날(yyyymmdd)', '최저수증기압 나타난날(yyyymmdd)',
		# minimum relative humidity
		'최소상대습도(%)', '최소상대습도 나타난날(yyyymmdd)',
		# precipitation maxima
		'일최다강수량(mm)', '1시간최다강수량(mm)', '10분최다강수량(mm)',
		'일최다강수량 나타난날(yyyymmdd)', '1시간최다강수량 나타난날(yyyymmdd)', '10분최다강수량 나타난날(yyyymmdd)',
		# evaporation
		'소형일최대증발량(mm)', '대형총증발량(mm)', '대형일최대증발량(mm)',
		'소형일최대증발량 나타난날(yyyymmdd)', '대형일최대증발량 나타난날(yyyymmdd)',
		# wind maxima and directions
		'최대풍속(m/s)', '최대순간풍속(m/s)', '최대풍속 풍향(16방위)', '최대순간풍속 풍향(16방위)',
		'최대풍속 나타난날(yyyymmdd)', '최대순간풍속 나타난날(yyyymmdd)', '최다풍향(16방위)',
		# mid/low cloud cover, sunshine hours
		'평균중하층운량(1/10)', '합계 일조시간(hr)',
		# snow depth
		'최심적설(cm)', '최심신적설(cm)', '3시간신적설합(cm)', '최심적설 나타난날(yyyymmdd)', '최심신적설 나타난날(yyyymmdd)',
		# minimum grass temperature
		'최저초상온도(°C)', '최저초상온도 나타난날(yyyymmdd)',
		# soil temperature by depth
		'0.05m평균지중온도(°C)', '0.1m평균지중온도(°C)', '0.2m평균지중온도(°C)', '0.3m평균지중온도(°C)', '0.5m평균지중온도(°C)',
		'1.0m평균지중온도(°C)', '1.5m평균지중온도(°C)', '3.0m평균지중온도(°C)', '5.0m평균지중온도(°C)',
	])
	# Relabel the 18 remaining columns by position
	.set_axis(
		[
			"Region",
			"Date",
			"Avg Temperature (Celsius)",
			"Avg Max Temperature (Celsius)",
			"Avg Min Temperature (Celsius)",
			"Avg Local Pressure (hPa)",
			"Avg Sea Level Pressure (hPa)",
			"Avg Vapor Pressure (hPa)",
			"Avg Dew Point Temp (Celsius)",
			"Avg Relative Humidity (%)",
			"Monthly Precipitation (mm)",
			"Small Pan Evaporation (mm)",
			"Avg Wind Speed (m/s)",
			"Avg Cloud Cover (1/10)",
			"Sunshine Rate (%)",
			"Total Solar Radiation (MJ/m^2)",
			"Avg Min Supercool Temp (Celsius)",
			"Avg Ground Temp (Celsius)",
		],
		axis=1,
	)
	# Translate Region, parse Date, and split it into Year / Month
	# (assign evaluates its keywords in order, so Year and Month see the parsed Date)
	.assign(
		Region=lambda frame: frame['Region'].map(kor_to_eng),
		Date=lambda frame: pd.to_datetime(frame['Date']),
		Year=lambda frame: frame['Date'].dt.year,
		Month=lambda frame: frame['Date'].dt.month,
	)
	.drop(columns=['Date'])
)

# Write the cleaned table as UTF-8 without the row index, then summarise it
df_weather_forecast.to_csv('data/cleaned/weather/weather_forecast_cleaned.csv', index=False, encoding='utf-8')
df_weather_forecast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 19 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Region                            861 non-null    object 
 1   Avg Temperature (Celsius)         861 non-null    float64
 2   Avg Max Temperature (Celsius)     861 non-null    float64
 3   Avg Min Temperature (Celsius)     861 non-null    float64
 4   Avg Local Pressure (hPa)          861 non-null    float64
 5   Avg Sea Level Pressure (hPa)      861 non-null    float64
 6   Avg Vapor Pressure (hPa)          861 non-null    float64
 7   Avg Dew Point Temp (Celsius)      861 non-null    float64
 8   Avg Relative Humidity (%)         861 non-null    int64  
 9   Monthly Precipitation (mm)        861 non-null    float64
 10  Small Pan Evaporation (mm)        728 non-null    float64
 11  Avg Wind Speed (m/s)              861 non-null    float64
 12  Avg Clou

In [8]:
# Lookup from the Korean administrative names in the source file to English
# city names (values missing from the lookup come out of Series.map as NaN)
kor_to_eng = {
	"서울특별시": "Seoul",
	"부산광역시": "Busan",
	"대구광역시": "Daegu",
	"인천광역시": "Incheon",
	"광주광역시": "Gwangju",
	"대전광역시": "Daejeon",
	"울산광역시": "Ulsan",
}

df_electricity = (
	pd.read_csv('data/cleaned/electricity/electricity_consumption.csv', encoding='euc-kr')
	# Remove the two source columns that are not carried forward
	.drop(columns=['시군구', '가구당 평균 전기요금(원)'])
	# Relabel the five remaining columns by position
	.set_axis(
		[
			"Year",
			"Month",
			"Region",
			"Number of Households",
			"Avg Electricity Consumption per Household (kWh)",
		],
		axis=1,
	)
	.assign(Region=lambda frame: frame['Region'].map(kor_to_eng))
)

# Write the cleaned table as UTF-8 without the row index, then summarise it
df_electricity.to_csv('data/cleaned/electricity/electricity_consumption_cleaned.csv', index=False, encoding='utf-8')
df_electricity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 5 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Year                                             840 non-null    int64  
 1   Month                                            840 non-null    int64  
 2   Region                                           840 non-null    object 
 3   Number of Households                             840 non-null    float64
 4   Avg Electricity Consumption per Household (kWh)  840 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 32.9+ KB


In [10]:
# Load the EUC-KR file and reshape it from one column per region to one row
# per (Year, Month, Region)
df_gas = (
	pd.read_csv("data/cleaned/gas/gas_supply.csv", encoding='euc-kr')
	# Label the eleven source columns by position
	.set_axis(
		[
			'Date',
			'Seoul',
			'Incheon',
			'Gyeonggi',
			'Gangwon',
			'Chungcheong',
			'Jeonbuk',
			'Gwangju',
			'Daegu',
			'Busan',
			'Jeju',
		],
		axis=1,
	)
	# Parse Date, then derive Year / Month from the parsed values
	# (assign evaluates its keywords in order)
	.assign(
		Date=lambda frame: pd.to_datetime(frame['Date']),
		Year=lambda frame: frame['Date'].dt.year,
		Month=lambda frame: frame['Date'].dt.month,
	)
	# Date has served its purpose; the four provinces are not kept
	.drop(columns=['Date', 'Gyeonggi', 'Gangwon', 'Jeonbuk', 'Jeju'])
	# The Chungcheong figures are carried forward under the name Daejeon
	.rename(columns={'Chungcheong': 'Daejeon'})
	.melt(
		id_vars=['Year', 'Month'],
		value_vars=['Seoul', 'Incheon', 'Gwangju', 'Daegu', 'Busan', 'Daejeon'],
		var_name='Region',
		value_name='Total Gas Supply (ton)',
	)
)

# Population-based scaling: cast the supply to float and keep 31% of the
# value on the Daejeon rows, leaving every other region untouched
gas_supply = df_gas['Total Gas Supply (ton)'].astype(float)
df_gas['Total Gas Supply (ton)'] = gas_supply.where(df_gas['Region'] != 'Daejeon', gas_supply * 0.31)

# Ulsan has no column in the source: use the per-month mean of the six
# regions above (computed after the Daejeon scaling) as its estimate
monthly_avg_ulsan = (
	df_gas.groupby(['Year', 'Month'])['Total Gas Supply (ton)']
	.mean()
	.reset_index()
	.assign(Region='Ulsan')
)
df_ulsan = monthly_avg_ulsan.loc[:, ['Year', 'Month', 'Region', 'Total Gas Supply (ton)']]
df_gas = pd.concat([df_gas, df_ulsan], ignore_index=True)

# Write the cleaned table as UTF-8 without the row index, then summarise it
df_gas.to_csv('data/cleaned/gas/gas_supply_cleaned.csv', index=False, encoding='utf-8')
df_gas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    756 non-null    int32  
 1   Month                   756 non-null    int32  
 2   Region                  756 non-null    object 
 3   Total Gas Supply (ton)  756 non-null    float64
dtypes: float64(1), int32(2), object(1)
memory usage: 17.8+ KB


In [11]:
# Inner-join the four cleaned tables on Region / Year / Month, so only
# key combinations present in every table survive
df = (
	df_electricity
	.merge(df_gas, on=['Region', 'Year', 'Month'], how='inner')
	.merge(df_extreme_weather, on=['Region', 'Year', 'Month'], how='inner')
	.merge(df_weather_forecast, on=['Region', 'Year', 'Month'], how='inner')
)

# Per-household gas supply: total supply times a region factor (0.173 for
# Ulsan, 0.437 elsewhere), divided by the household count
# NOTE(review): the factors are presumably the share of supply attributed to
# households — confirm their source
region_factor = np.where(df['Region'] == 'Ulsan', 0.173, 0.437)
df['Avg Gas Supply per Household (ton)'] = df['Total Gas Supply (ton)'] * region_factor / df['Number of Households']

# Drop the total and forward-fill the remaining gaps
# NOTE(review): the fill follows the merged row order and is not grouped by
# Region, so a gap can take its value from a row of another region — confirm
# this is intended
df = df.drop(columns=['Total Gas Supply (ton)']).ffill()

# Write the merged table as UTF-8 without the row index, then summarise it
df.to_csv('data/cleaned/energy_consumption.csv', index=False, encoding='utf-8')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 25 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   Year                                             756 non-null    int64  
 1   Month                                            756 non-null    int64  
 2   Region                                           756 non-null    object 
 3   Number of Households                             756 non-null    float64
 4   Avg Electricity Consumption per Household (kWh)  756 non-null    float64
 5   Number of Tropical Nights                        756 non-null    float64
 6   Number of Heat Wave Days                         756 non-null    float64
 7   Number of Cold Wave Days                         756 non-null    float64
 8   Avg Temperature (Celsius)                        756 non-null    float64
 9   Avg Max Temperature (Celsius)   