데이터 전처리

In [5]:
# Colab-only step: mount Google Drive at /content/drive so that the files
# stored under it can be read and written by the rest of this cell.
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Folder on the mounted Drive that holds the CSV files to combine.
data_folder = '/content/drive/MyDrive/1month_data'
# os.listdir() returns entries in arbitrary order; sort them so the files are
# always concatenated in the same order and the resulting row order (and any
# seeded split made from it later) is reproducible between runs.
all_files = sorted(os.listdir(data_folder))
# Keep only the CSV files, as full paths.
csv_files = [os.path.join(data_folder, f) for f in all_files if f.endswith('.csv')]

def read_csv_file(file_path, encodings=('utf-8', 'cp949', 'euc-kr')):
    """Read a CSV file, trying several text encodings in turn.

    Args:
        file_path: Path of the CSV file to load.
        encodings: Encodings to try, in order. Defaults to UTF-8 followed by
            the Korean encodings CP949 and EUC-KR.

    Returns:
        The DataFrame produced by the first encoding that parses the file.

    Raises:
        ValueError: If no encoding yields a readable file. The exception from
            the last attempt is attached as ``__cause__`` so the real reason
            (undecodable bytes, empty file, malformed rows) is not lost.
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            # Remember why this attempt failed, then fall through to the next encoding.
            last_error = exc
    raise ValueError(f"Could not read file {file_path} with any encoding.") from last_error

# Load every discovered CSV, keeping only files that are readable and have rows.
valid_dataframes = []
for file in csv_files:
    try:
        df = read_csv_file(file)
    except ValueError as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(e)
        continue
    if not df.empty:
        valid_dataframes.append(df)

# Stop here with a clear message when nothing was loaded. Continuing with an
# empty DataFrame would only fail later, in the column drop, with a KeyError
# that hides the real problem.
if not valid_dataframes:
    raise ValueError(f"No readable, non-empty CSV files found in {data_folder}.")

# Stack all files into one frame with a fresh 0..n-1 index.
combined_data = pd.concat(valid_dataframes, ignore_index=True)

# Columns removed before any further processing. DataFrame.drop raises
# KeyError if one of these names is missing, so the spelling must match the
# CSV headers exactly.
columns_to_drop = [
    "생성일",
    "소비전류",
    "진동센서1",
    "진동센서2",
    "운전시간",
    "정상 운전 확률",
    "송풍기 고장 확률",
    "AIR 댐퍼 고장 확률",
    "GAS 앰퍼 고장 확률",
    "확률 업데이트 시간",
    "순간 스팀량",
    "입출력법 효율",
    "열 손실법 효율",
    "효율(입출력법-스팀)",
    "배기 재 순환 온도",
    "버너온도",
]
combined_data = combined_data.drop(labels=columns_to_drop, axis="columns")

# --- 1. Standardize numeric columns ------------------------------------------
# NOTE(review): every float64/int64 column is standardized here, which
# presumably includes '효율(순간)', the column the modelling cell uses as its
# regression target. Confirm this is intended: error metrics computed on that
# column are then in z-score units rather than efficiency units.
scaler = StandardScaler()
numeric_columns = combined_data.select_dtypes(include=['float64', 'int64']).columns
combined_data[numeric_columns] = scaler.fit_transform(combined_data[numeric_columns])

# --- 2. Impute numeric gaps with the column mean -----------------------------
# Scaling does not change which columns are numeric, so the same selection is
# reused; the separate name is kept for any later cell that refers to it.
numerical_columns = numeric_columns
combined_data[numerical_columns] = combined_data[numerical_columns].fillna(
    combined_data[numerical_columns].mean()
)

# --- 3. Impute categorical gaps with the most frequent value -----------------
categorical_columns = combined_data.select_dtypes(include=['object']).columns
cat_imputer = SimpleImputer(strategy='most_frequent')
# SimpleImputer rejects input with zero columns, so only run it when the data
# actually has categorical columns.
if not categorical_columns.empty:
    combined_data[categorical_columns] = cat_imputer.fit_transform(
        combined_data[categorical_columns]
    )

# --- 4. Label-encode categorical columns -------------------------------------
# One encoder per column, kept in a dict, so every column's integer codes can
# be mapped back to the original labels later. A single shared encoder would
# only remember the mapping of the last column it was fitted on.
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    combined_data[column] = label_encoder.fit_transform(combined_data[column])
    label_encoders[column] = label_encoder

# --- 5. Persist the result ----------------------------------------------------
combined_data.to_csv('/content/drive/MyDrive/preprocessed_boiler_data.csv', index=False, encoding='utf-8')

# --- 6. Sanity checks ---------------------------------------------------------
print(combined_data.isna().sum())  # Count of NaNs in each column
print(combined_data.isna().sum().sum())  # Total count of missing values
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
combined_data.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
부하율             0
설정 압력           0
보일러 압력          0
송풍기 인버터 출력      0
송풍기 입력          0
급수 펌프           0
급수펌프 입력         0
가스 댐퍼           0
가스 댐퍼 입력        0
Air 댐퍼          0
Air 댐퍼 입력       0
재순환 댐퍼          0
재순환 외기 댐퍼       0
재순환 댐퍼 입력       0
재순환 외기 댐퍼 입력    0
급수 수위           0
보일러 온도          0
배기가스온도1         0
배기가스온도2         0
배기가스온도3         0
에코 온도1          0
에코 온도2          0
배기가스 NOx        0
배기가스 O2         0
재순환 O2          0
재순환 NOx         0
급수량(적산유량)       0
급수량(순간유량)       0
연료량(적산유량)       0
연료량(순간유량)       0
효율(순간)          0
dtype: int64
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1884101 entries, 0 to 1884100
Data columns (total 31 columns):
 #   Column        Dtype  
---  ------        -----  
 0   부하율           float64
 1   설정 압력         float64
 2   보일러 압력        float64
 3   송풍기 인버터 출력    float64
 4   송풍기 입력        floa

In [6]:

# 1. 필요한 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import randint

# 2. Use the preprocessed data. This assumes the preprocessing cell has
#    already run, so `combined_data` is available in the session.

# 3. Split the frame into features (X) and the regression target (y).
TARGET_COLUMN = '효율(순간)'
X = combined_data.drop(columns=[TARGET_COLUMN])  # everything except the target
y = combined_data[TARGET_COLUMN]

# 4. Hold out 20% of the rows for testing; the fixed seed makes the split
#    repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Random forest regressor tuned with RandomizedSearchCV.
rf = RandomForestRegressor(random_state=42)

# Hyperparameter search space
param_distributions = {
    'n_estimators': [50, 100, 200],              # Number of trees
    'max_depth': [None, 10, 15, 20, 25],         # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],             # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],               # Minimum samples required at a leaf node
    'max_features': ['sqrt', 'log2'],            # Number of features considered at each split
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2]  # Minimum weighted fraction of samples required at a leaf node
}

# 50 sampled candidates, each scored with 5-fold cross-validation on the
# training split. No `scoring` is given, so candidates are ranked by the
# estimator's default score (R^2 for regressors).
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                                   n_iter=50, cv=5, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Best parameters
print("Best hyperparameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# 6. Evaluate the refitted best model on the held-out test split.
y_pred = random_search.predict(X_test)

# NOTE(review): the preprocessing cell standardizes every float64/int64 column
# of combined_data, presumably including the target. Confirm; if so, these
# metrics are in z-score units and MAPE is not a meaningful percentage because
# the true values are centred on zero.
#
# Use scikit-learn's metric helpers instead of hand-rolled formulas.
# mean_absolute_percentage_error returns a fraction (hence the * 100) and
# clamps |y_true| away from zero, so an exact-zero target value can no longer
# turn the result into inf/NaN.
mape = mean_absolute_percentage_error(y_test, y_pred) * 100
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Print the evaluation results
print(f"MAPE: {mape:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best hyperparameters found by RandomizedSearchCV:
{'n_estimators': 50, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25}
MAPE: 7.5247
MAE: 0.0190
RMSE: 0.0463
MSE: 0.0021
