# 기상데이터 모델링 분석

사용할 분석 모델: Logistic Regression, Random Forest, Support Vector Machine (SVM)

## 라이브러리 불러오기

In [11]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, recall_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
import joblib
import numpy as np
from sklearn.inspection import permutation_importance
from scipy.spatial import cKDTree
import os

In [None]:
# ! python -m pip install --upgrade pandas


## 데이터 불러오기

### 기상 정보 데이터

In [3]:
# Load the weather observations with the wildfire label column (CSV is CP949-encoded).
weather_data = pd.read_csv("./data/weather_fire_label.csv", encoding='cp949')

In [16]:
# Display the loaded weather DataFrame for a quick visual check.
weather_data

Unnamed: 0,지점,지점명,일시,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),실효습도,산불
0,12,안면도(감),2022-01-01 00:00:00,-4.5,34.0,2.6,0.0,59.0,17.70,0
1,12,안면도(감),2022-01-01 01:00:00,-4.6,37.8,2.4,0.0,58.0,29.79,0
2,12,안면도(감),2022-01-01 02:00:00,-5.0,62.6,2.7,0.0,65.0,31.68,0
3,12,안면도(감),2022-01-01 03:00:00,-5.0,68.4,2.9,0.0,54.0,29.85,0
4,12,안면도(감),2022-01-01 04:00:00,-4.9,65.6,2.7,0.0,51.0,26.64,0
...,...,...,...,...,...,...,...,...,...,...
16418967,996,화동,2024-12-31 19:00:00,-0.5,268.2,3.1,0.0,38.0,18.96,0
16418968,996,화동,2024-12-31 20:00:00,-0.8,266.5,3.8,0.0,38.0,19.38,0
16418969,996,화동,2024-12-31 21:00:00,-1.6,262.5,5.2,0.0,46.0,21.78,0
16418970,996,화동,2024-12-31 22:00:00,-2.4,263.6,3.9,0.0,55.0,26.16,0


### 기상 관측 지점 데이터

In [17]:
# Load the weather-station metadata (station id, name, latitude, longitude); CSV is CP949-encoded.
loc_data = pd.read_csv("./data/meta-loc-edited.csv", encoding='cp949')

In [18]:
# Display the station metadata table.
loc_data

Unnamed: 0,지점,시작일,종료일,지점명,위도,경도
0,12,2007-11-30,,안면도(감),36.5333,126.3167
1,96,2020-10-23,,독도,37.2395,131.8698
2,116,2023-10-21,,관악(레),37.4453,126.9640
3,144,1993-03-16,2011-12-09,군산(레),36.0123,126.7834
4,160,2020-10-23,,부산(레),35.1188,129.0000
...,...,...,...,...,...,...
567,992,2021-11-10,,하빈,35.9062,128.4464
568,993,2024-04-06,,제주금악,33.3423,126.3099
569,994,2022-05-02,,심포,35.8545,126.6421
570,995,2023-12-19,,오천,35.9286,129.3822


In [19]:
# Print column dtypes and non-null counts of the station metadata.
loc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 572 entries, 0 to 571
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   지점      572 non-null    int64  
 1   시작일     572 non-null    object 
 2   종료일     26 non-null     object 
 3   지점명     572 non-null    object 
 4   위도      572 non-null    float64
 5   경도      572 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 26.9+ KB


In [20]:
# Build a GeoDataFrame of stations: point geometry uses longitude ('경도') as x and
# latitude ('위도') as y, tagged with the WGS84 geographic CRS (EPSG:4326).
loc_gdf = gpd.GeoDataFrame(
    loc_data, geometry=gpd.points_from_xy(loc_data['경도'], loc_data['위도']), crs="EPSG:4326"
)

In [21]:
# Display the station GeoDataFrame, including the new geometry column.
loc_gdf

Unnamed: 0,지점,시작일,종료일,지점명,위도,경도,geometry
0,12,2007-11-30,,안면도(감),36.5333,126.3167,POINT (126.3167 36.5333)
1,96,2020-10-23,,독도,37.2395,131.8698,POINT (131.8698 37.2395)
2,116,2023-10-21,,관악(레),37.4453,126.9640,POINT (126.964 37.4453)
3,144,1993-03-16,2011-12-09,군산(레),36.0123,126.7834,POINT (126.7834 36.0123)
4,160,2020-10-23,,부산(레),35.1188,129.0000,POINT (129 35.1188)
...,...,...,...,...,...,...,...
567,992,2021-11-10,,하빈,35.9062,128.4464,POINT (128.4464 35.9062)
568,993,2024-04-06,,제주금악,33.3423,126.3099,POINT (126.3099 33.3423)
569,994,2022-05-02,,심포,35.8545,126.6421,POINT (126.6421 35.8545)
570,995,2023-12-19,,오천,35.9286,129.3822,POINT (129.3822 35.9286)


## 데이터 전처리


### 테스트 데이터 분리

In [5]:
# Predictor columns: temperature, wind direction, wind speed, precipitation,
# humidity and effective humidity. The target is the wildfire label column '산불'.
features = [
    '기온(°C)',
    '풍향(deg)',
    '풍속(m/s)',
    '강수량(mm)',
    '습도(%)',
    '실효습도',
]
X, y = weather_data[features], weather_data['산불']

In [6]:
# Split into training / test sets: 20% held out for testing, stratified on the label
# so both splits keep the same class ratio; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Logistic Regression

In [7]:
# Feature scaling: the standardization statistics are learned from the training
# split only and then applied unchanged to both the training and test splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### 랜덤 언더 샘플링 적용

In [8]:
# Randomly under-sample the scaled training split (seeded for reproducibility);
# only the training data is resampled, the test split is left untouched.
rus = RandomUnderSampler(random_state=42)
X_res_under, y_res_under = rus.fit_resample(X_train_scaled, y_train)

In [9]:
# Report the shape of the under-sampled training set and the per-class row counts.
print(
    f"언더샘플링 학습 데이터 사이즈 : {X_res_under.shape}",
    f"언더샘플링 산불 발생 데이터 (1) 개수 : {y_res_under.sum()}개",
    f"언더샘플링 산불 미발생 데이터 (0) 개수 : {(y_res_under == 0).sum()}개",
    sep="\n",
)

언더샘플링 학습 데이터 사이즈 : (14910, 6)
언더샘플링 산불 발생 데이터 (1) 개수 : 7455개
언더샘플링 산불 미발생 데이터 (0) 개수 : 7455개


In [10]:
# Train a logistic regression (liblinear solver) on the under-sampled training set.
model_under = LogisticRegression(solver='liblinear', random_state=42)
model_under.fit(X_res_under, y_res_under)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


In [11]:
# Predicted probability of the positive class (column 1) for every test row.
proba_under = model_under.predict_proba(X_test_scaled)[:, 1]

In [12]:
# Hard class predictions for the scaled test split.
y_pred_under = model_under.predict(X_test_scaled)

In [13]:
# Model evaluation on the test split: confusion matrix, per-class report,
# ROC AUC (from probabilities) and recall of the positive class.
print(
    confusion_matrix(y_test, y_pred_under),
    classification_report(y_test, y_pred_under),
    f"AUC-ROC (Under-sampling): {roc_auc_score(y_test, proba_under):.4f}",
    f"Recall (Under-sampling): {recall_score(y_test, y_pred_under):.4f}",
    sep="\n",
)

[[2400911  881020]
 [    491    1373]]
              precision    recall  f1-score   support

           0       1.00      0.73      0.84   3281931
           1       0.00      0.74      0.00      1864

    accuracy                           0.73   3283795
   macro avg       0.50      0.73      0.42   3283795
weighted avg       1.00      0.73      0.84   3283795

AUC-ROC (Under-sampling): 0.8054
Recall (Under-sampling): 0.7366


### SMOTE 적용

In [14]:
# Over-sample the scaled training split with SMOTE (seeded); the test split is not resampled.
sm = SMOTE(random_state=42)
X_res_smote, y_res_smote = sm.fit_resample(X_train_scaled, y_train)

In [15]:
# Report the shape of the SMOTE-resampled training set and the per-class row counts.
print(
    f"SMOTE 학습 데이터 사이즈 : {X_res_smote.shape}",
    f"SMOTE 산불 발생 데이터 (1) 개수 : {y_res_smote.sum()}개",
    f"SMOTE 산불 미발생 데이터 (0) 개수 : {(y_res_smote == 0).sum()}개",
    sep="\n",
)

SMOTE 학습 데이터 사이즈 : (26255444, 6)
SMOTE 산불 발생 데이터 (1) 개수 : 13127722개
SMOTE 산불 미발생 데이터 (0) 개수 : 13127722개


In [16]:
# Train a logistic regression (liblinear solver) on the SMOTE-resampled training set.
model_smote = LogisticRegression(solver='liblinear', random_state=42)
model_smote.fit(X_res_smote, y_res_smote)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,100


In [17]:
# Predicted probability of the positive class (column 1) for every test row.
proba_smote = model_smote.predict_proba(X_test_scaled)[:, 1]

In [18]:
# Hard class predictions for the scaled test split.
y_pred_smote = model_smote.predict(X_test_scaled)

In [19]:
# Evaluate the SMOTE-trained logistic regression on the test split.
print(
    confusion_matrix(y_test, y_pred_smote),
    classification_report(y_test, y_pred_smote),
    f"AUC-ROC (SMOTE): {roc_auc_score(y_test, proba_smote):.4f}",
    f"Recall (SMOTE): {recall_score(y_test, y_pred_smote):.4f}",
    sep="\n",
)

[[2392340  889591]
 [    486    1378]]
              precision    recall  f1-score   support

           0       1.00      0.73      0.84   3281931
           1       0.00      0.74      0.00      1864

    accuracy                           0.73   3283795
   macro avg       0.50      0.73      0.42   3283795
weighted avg       1.00      0.73      0.84   3283795

AUC-ROC (SMOTE): 0.8051
Recall (SMOTE): 0.7393


## Random Forest

### 언더샘플링 적용

In [20]:
# Rebuild the under-sampled training set for the random forest section
# (same sampler settings and seed, applied to the scaled training split).
rus = RandomUnderSampler(random_state=42)
X_res_under, y_res_under = rus.fit_resample(X_train_scaled, y_train)

In [21]:
# Report the shape of the under-sampled training set and the per-class row counts.
print(
    f"언더샘플링 학습 데이터 사이즈 : {X_res_under.shape}",
    f"언더샘플링 산불 발생 데이터 (1) 개수 : {y_res_under.sum()}개",
    f"언더샘플링 산불 미발생 데이터 (0) 개수 : {(y_res_under == 0).sum()}개",
    sep="\n",
)

언더샘플링 학습 데이터 사이즈 : (14910, 6)
언더샘플링 산불 발생 데이터 (1) 개수 : 7455개
언더샘플링 산불 미발생 데이터 (0) 개수 : 7455개


In [22]:
# Train a 100-tree random forest on the under-sampled training set.
# NOTE(review): class_weight='balanced' is applied on top of data that was already
# resampled — likely redundant here; confirm it is intentional.
model_under = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model_under.fit(X_res_under, y_res_under)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [23]:
# Predicted probability of the positive class (column 1) for every test row.
proba_under = model_under.predict_proba(X_test_scaled)[:, 1]

In [24]:
# Hard class predictions for the scaled test split.
y_pred_under = model_under.predict(X_test_scaled)

In [25]:
# Evaluate the under-sampled random forest on the test split.
print(
    confusion_matrix(y_test, y_pred_under),
    classification_report(y_test, y_pred_under),
    f"AUC-ROC (Under-sampling): {roc_auc_score(y_test, proba_under):.4f}",
    f"Recall (Under-sampling): {recall_score(y_test, y_pred_under):.4f}",
    sep="\n",
)

[[2533759  748172]
 [    348    1516]]
              precision    recall  f1-score   support

           0       1.00      0.77      0.87   3281931
           1       0.00      0.81      0.00      1864

    accuracy                           0.77   3283795
   macro avg       0.50      0.79      0.44   3283795
weighted avg       1.00      0.77      0.87   3283795

AUC-ROC (Under-sampling): 0.8777
Recall (Under-sampling): 0.8133


### SMOTE 적용


In [26]:
# Rebuild the SMOTE-resampled training set for the random forest section
# (same seed, applied to the scaled training split).
sm = SMOTE(random_state=42)
X_res_smote, y_res_smote = sm.fit_resample(X_train_scaled, y_train)

In [27]:
# Report the shape of the SMOTE-resampled training set and the per-class row counts.
print(
    f"SMOTE 학습 데이터 사이즈 : {X_res_smote.shape}",
    f"SMOTE 산불 발생 데이터 (1) 개수 : {y_res_smote.sum()}개",
    f"SMOTE 산불 미발생 데이터 (0) 개수 : {(y_res_smote == 0).sum()}개",
    sep="\n",
)

SMOTE 학습 데이터 사이즈 : (26255444, 6)
SMOTE 산불 발생 데이터 (1) 개수 : 13127722개
SMOTE 산불 미발생 데이터 (0) 개수 : 13127722개


In [None]:
# Train the random forest on the SMOTE-resampled training set (takes a considerable amount of time).
# NOTE(review): class_weight='balanced' is applied on top of data that was already
# resampled — likely redundant here; confirm it is intentional.
model_smote = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model_smote.fit(X_res_smote, y_res_smote)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
# Positive-class probabilities (column 1) and hard class predictions for the test split.
proba_smote = model_smote.predict_proba(X_test_scaled)[:, 1]
y_pred_smote = model_smote.predict(X_test_scaled)

In [39]:
# Evaluate the SMOTE-trained random forest on the test split.
print(
    confusion_matrix(y_test, y_pred_smote),
    classification_report(y_test, y_pred_smote),
    f"AUC-ROC (SMOTE): {roc_auc_score(y_test, proba_smote):.4f}",
    f"Recall (SMOTE): {recall_score(y_test, y_pred_smote):.4f}",
    sep="\n",
)

[[3281545     386]
 [   1267     597]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3281931
           1       0.61      0.32      0.42      1864

    accuracy                           1.00   3283795
   macro avg       0.80      0.66      0.71   3283795
weighted avg       1.00      1.00      1.00   3283795

AUC-ROC (SMOTE): 0.7752
Recall (SMOTE): 0.3203


In [None]:
# Inspect the random forest's impurity-based feature importances, labelled with the
# training column order (`features`) and sorted from most to least important.
importance_smote = pd.Series(model_smote.feature_importances_, index=features).sort_values(ascending=False)
print(importance_smote)

습도(%)      0.291539
실효습도       0.233199
풍속(m/s)    0.192306
기온(°C)     0.162299
풍향(deg)    0.110642
강수량(mm)    0.010016
dtype: float64


In [29]:
# Output path (relative to the working directory) for the persisted SMOTE random forest.
model_smote_filename = 'random_forest_smote_model.joblib'

In [None]:
# Persist the random forest trained on the SMOTE-resampled data.
# NOTE(review): the model was fitted on standardized features, but the fitted
# `scaler` is not saved here — inference elsewhere would need the same scaler.
joblib.dump(model_smote, model_smote_filename)

['random_forest_smote_model.joblib']

In [30]:
# Round-trip check: reload the saved model and predict on the scaled test split.
# joblib.load unpickles the file, so only load model files from a trusted source.
load_model = joblib.load(model_smote_filename)
predictions = load_model.predict(X_test_scaled)
probabilities = load_model.predict_proba(X_test_scaled)[:, 1]

In [31]:
# Evaluate the reloaded random forest on the test split.
print(
    confusion_matrix(y_test, predictions),
    classification_report(y_test, predictions),
    f"AUC-ROC (SMOTE): {roc_auc_score(y_test, probabilities):.4f}",
    f"Recall (SMOTE): {recall_score(y_test, predictions):.4f}",
    sep="\n",
)

[[3281545     386]
 [   1267     597]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   3281931
           1       0.61      0.32      0.42      1864

    accuracy                           1.00   3283795
   macro avg       0.80      0.66      0.71   3283795
weighted avg       1.00      1.00      1.00   3283795

AUC-ROC (SMOTE): 0.7752
Recall (SMOTE): 0.3203


## Support Vector Machine

### 언더샘플링

In [32]:
# Rebuild the under-sampled training set for the SVM section
# (same sampler settings and seed, applied to the scaled training split).
rus = RandomUnderSampler(random_state=42)
X_res_under, y_res_under = rus.fit_resample(X_train_scaled, y_train)

In [33]:
# Report the shape of the under-sampled training set and the per-class row counts.
print(
    f"언더샘플링 학습 데이터 사이즈 : {X_res_under.shape}",
    f"언더샘플링 산불 발생 데이터 (1) 개수 : {y_res_under.sum()}개",
    f"언더샘플링 산불 미발생 데이터 (0) 개수 : {(y_res_under == 0).sum()}개",
    sep="\n",
)

언더샘플링 학습 데이터 사이즈 : (14910, 6)
언더샘플링 산불 발생 데이터 (1) 개수 : 7455개
언더샘플링 산불 미발생 데이터 (0) 개수 : 7455개


In [34]:
# Train an RBF-kernel SVC on the under-sampled training set.
# probability=True is required because predict_proba is called on this model afterwards.
model_under = SVC(kernel='rbf', probability=True, random_state=42, class_weight='balanced')
model_under.fit(X_res_under, y_res_under)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [35]:
# Positive-class probabilities (column 1) and hard class predictions for the test split.
proba_under = model_under.predict_proba(X_test_scaled)[:, 1]
y_pred_under = model_under.predict(X_test_scaled)

In [36]:
# Evaluate the under-sampled SVC on the test split.
print(
    confusion_matrix(y_test, y_pred_under),
    classification_report(y_test, y_pred_under),
    f"AUC-ROC (Under-sampling): {roc_auc_score(y_test, proba_under):.4f}",
    f"Recall (Under-sampling): {recall_score(y_test, y_pred_under):.4f}",
    sep="\n",
)

[[2449837  832094]
 [    466    1398]]
              precision    recall  f1-score   support

           0       1.00      0.75      0.85   3281931
           1       0.00      0.75      0.00      1864

    accuracy                           0.75   3283795
   macro avg       0.50      0.75      0.43   3283795
weighted avg       1.00      0.75      0.85   3283795

AUC-ROC (Under-sampling): 0.8191
Recall (Under-sampling): 0.7500


In [None]:
# Permutation importance of the under-sampled SVC, measured on the scaled test split
# (each feature is shuffled n_repeats times and the mean score drop is recorded).
# NOTE(review): with this class imbalance, accuracy is dominated by the majority
# (no-fire) class; scoring='roc_auc' or 'recall' would match the metrics reported
# elsewhere in this notebook — confirm which is intended.
result = permutation_importance(model_under, X_test_scaled, y_test, n_repeats=10, random_state=42, scoring='accuracy')
importance_means = result.importances_mean
# importances_mean follows the column order of X_test_scaled, i.e. the order of
# `features` used to build X. The previous hard-coded list was in the importance-sorted
# order printed for the random forest, which attached each score to the wrong feature.
feature_names = list(features)
feature_importance = pd.Series(importance_means, index=feature_names).sort_values(ascending=False)

In [None]:
# Show the permutation importances, sorted from most to least important.
print(feature_importance)

### SMOTE

In [9]:
# Rebuild the SMOTE-resampled training set for the SVM section
# (same seed, applied to the scaled training split).
sm = SMOTE(random_state=42)
X_res_smote, y_res_smote = sm.fit_resample(X_train_scaled, y_train)

In [10]:
# Report the shape of the SMOTE-resampled training set and the per-class row counts.
print(
    f"SMOTE 학습 데이터 사이즈 : {X_res_smote.shape}",
    f"SMOTE 산불 발생 데이터 (1) 개수 : {y_res_smote.sum()}개",
    f"SMOTE 산불 미발생 데이터 (0) 개수 : {(y_res_smote == 0).sum()}개",
    sep="\n",
)

SMOTE 학습 데이터 사이즈 : (26255444, 6)
SMOTE 산불 발생 데이터 (1) 개수 : 13127722개
SMOTE 산불 미발생 데이터 (0) 개수 : 13127722개


In [None]:
# Train an RBF-kernel SVC on the SMOTE-resampled training set.
# NOTE(review): this cell has no recorded output; the downstream SVC/SMOTE cells
# (predict, evaluate, save, reload) depend on it having been run.
model_smote = SVC(kernel='rbf', probability=True, random_state=42, class_weight='balanced')
model_smote.fit(X_res_smote, y_res_smote) # execution deferred: training takes too long

In [None]:
# Positive-class probabilities (column 1) and hard class predictions for the test split.
proba_smote = model_smote.predict_proba(X_test_scaled)[:, 1]
y_pred_smote = model_smote.predict(X_test_scaled)

In [None]:
# Evaluate the SMOTE-trained SVC on the test split.
print(
    confusion_matrix(y_test, y_pred_smote),
    classification_report(y_test, y_pred_smote),
    f"AUC-ROC (SMOTE): {roc_auc_score(y_test, proba_smote):.4f}",
    f"Recall (SMOTE): {recall_score(y_test, y_pred_smote):.4f}",
    sep="\n",
)

In [None]:
# Persist the SVC trained on the SMOTE-resampled data.
# NOTE(review): the model was fitted on standardized features, but the fitted
# `scaler` is not saved here — inference elsewhere would need the same scaler.
model_smote_svc_filename = 'SVC_smote_model.joblib'
joblib.dump(model_smote, model_smote_svc_filename)

In [None]:
# Round-trip check: reload the saved SVC, predict on the scaled test split and
# print the same evaluation metrics as for the in-memory model.
# joblib.load unpickles the file, so only load model files from a trusted source.
load_model = joblib.load(model_smote_svc_filename)
predictions = load_model.predict(X_test_scaled)
probabilities = load_model.predict_proba(X_test_scaled)[:, 1]
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(f"AUC-ROC (SMOTE): {roc_auc_score(y_test, probabilities):.4f}")
print(f"Recall (SMOTE): {recall_score(y_test, predictions):.4f}")