<a href="https://colab.research.google.com/github/qkrjuyeol/boiler-efficiency-checker/blob/main/boiler-efficiency-checker/AI/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RF

In [2]:
# Mount Google Drive into the Colab runtime; the paths used later in this
# cell live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

# Folder on the mounted Drive that is scanned for CSV inputs.
data_folder = '/content/drive/MyDrive/2months_data'
# os.listdir returns entries in arbitrary order; sort them so the row order of
# the combined frame (and every later random_state-seeded split) is the same
# on every run.
all_files = sorted(os.listdir(data_folder))
csv_files = [os.path.join(data_folder, f) for f in all_files if f.endswith('.csv')]

def read_csv_file(file_path, encodings=('utf-8', 'cp949', 'euc-kr')):
    """Read a CSV file, trying several text encodings in turn.

    Args:
        file_path: Path of the CSV file to load.
        encodings: Encodings to try, in order. Defaults to UTF-8 followed by
            the two Korean code pages the original implementation used.

    Returns:
        The DataFrame produced by the first encoding that parses the file.

    Raises:
        ValueError: If no encoding yields a parseable file. The last
            underlying decode/parse error is attached as ``__cause__`` so the
            real reason (bad bytes, empty file, malformed rows) is not lost.
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except (UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
            # Remember why this attempt failed and fall through to the next encoding.
            last_error = exc
    raise ValueError(f"Could not read file {file_path} with any encoding.") from last_error

# Load every CSV that can be decoded; unreadable files are reported and
# skipped, and files that parse to an empty frame are ignored.
valid_dataframes = []
for file in csv_files:
    try:
        df = read_csv_file(file)
    except ValueError as e:
        print(e)
        continue
    if not df.empty:
        valid_dataframes.append(df)

# Without any data the column assignment below would fail with an opaque
# "Length mismatch" error, so stop here with an explicit message instead.
if not valid_dataframes:
    raise ValueError(f"No readable, non-empty CSV files found in {data_folder}.")

combined_data = pd.concat(valid_dataframes, ignore_index=True)

# English column names, assigned by position.
# NOTE(review): this assumes every input file has the same 47 columns in this
# order — confirm against the raw exports.
column_names = ["Creation date", "load factor", "Set Pressure", "Boiler Pressure",
                "Blower Inverter Output", "Blower Input", "Water Supply Pump",
                "Water Supply Pump Input", "Gas Damper", "Gas Damper Input",
                "Air Damper", "Air Damper Input", "Recirculation Damper",
                "Recirculation External Damper", "Recirculation Damper Input",
                "Recirculation External Damper Input", "Water Supply Level",
                "Boiler Temperature", "Exhaust Gas Temperature 1",
                "Exhaust Gas Temperature 2", "Exhaust Gas Temperature 3",
                "Exhaust Recirculation Temperature", "Economizer Temperature 1",
                "Economizer Temperature 2", "Burner Temperature", "Exhaust Gas NOx",
                "Exhaust Gas O2", "Recirculation O2", "Recirculation NOx",
                "Water Supply Amount (Cumulative Flow)",
                "Water Supply Amount (Instantaneous Flow)",
                "Fuel Amount (Cumulative Flow)", "Fuel Amount (Instantaneous Flow)",
                "Efficiency (Instantaneous)", "Power Consumption", "Vibration Sensor 1",
                "Vibration Sensor 2", "Operating Time", "Normal Operation Probability",
                "Blower Failure Probability", "Air Damper Failure Probability",
                "Gas Damper Failure Probability", "Probability Update Time",
                "Instantaneous Steam Amount", "Input-Output Efficiency",
                "Heat Loss Efficiency", "Efficiency (input/output method-steam)"]
if combined_data.shape[1] != len(column_names):
    raise ValueError(
        f"Expected {len(column_names)} columns but the combined data has "
        f"{combined_data.shape[1]}; the input files do not share the expected layout."
    )
combined_data.columns = column_names

# Remove the columns that are not used by the models trained below.
combined_data = combined_data.drop(columns=[
    "Creation date","Power Consumption","Vibration Sensor 1","Vibration Sensor 2","Operating Time",
    "Normal Operation Probability","Blower Failure Probability","Air Damper Failure Probability",
    "Gas Damper Failure Probability","Probability Update Time","Instantaneous Steam Amount",
    "Input-Output Efficiency","Heat Loss Efficiency","Efficiency (input/output method-steam)",
    "Exhaust Recirculation Temperature","Burner Temperature"
])

# Standardize every float64/int64 column.
# NOTE(review): if 'Efficiency (Instantaneous)' is numeric it is standardized
# here too, and the scaler is fitted on the full data before any train/test
# split. Error metrics computed later (MAPE in particular) are then in
# standardized units — confirm this is intended.
scaler = StandardScaler()
numeric_columns = combined_data.select_dtypes(include=['float64', 'int64']).columns
combined_data[numeric_columns] = scaler.fit_transform(combined_data[numeric_columns])

# The scaled columns are the same set of numeric columns, so reuse the
# selection instead of querying the dtypes a second time.
numerical_columns = numeric_columns

# Fill missing values in numerical columns with the column mean.
combined_data[numerical_columns] = combined_data[numerical_columns].fillna(
    combined_data[numerical_columns].mean()
)

# For categorical (object) columns, fill missing values with the most
# frequent value, then label-encode. Both steps are skipped when there are no
# such columns because scikit-learn rejects input with zero features.
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_columns = combined_data.select_dtypes(include=['object']).columns
label_encoders = {}  # one fitted encoder per column so codes can be mapped back
label_encoder = LabelEncoder()
if len(categorical_columns) > 0:
    combined_data[categorical_columns] = cat_imputer.fit_transform(combined_data[categorical_columns])
    for column in categorical_columns:
        label_encoder = LabelEncoder()
        combined_data[column] = label_encoder.fit_transform(combined_data[column])
        label_encoders[column] = label_encoder

# Persist the preprocessed frame to Drive.
combined_data.to_csv('/content/drive/MyDrive/preprocessed_boiler_data.csv', index=False, encoding='utf-8')

Mounted at /content/drive


In [None]:
# Display the first rows of combined_data as a quick sanity check.
combined_data.head()

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import randint

# combined_data is the preprocessed DataFrame (the preprocessing cell is
# assumed to have been run already).

# Features (X) are every column except the target; the target (y) is the
# instantaneous efficiency.
X = combined_data.drop(columns=['Efficiency (Instantaneous)'])
y = combined_data['Efficiency (Instantaneous)']

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random forest tuned with RandomizedSearchCV.
rf = RandomForestRegressor(random_state=42)

# Hyperparameter search space.
param_distributions = {
    'n_estimators': [50, 100, 200],      # Number of trees
    'max_depth': [None, 10, 15, 20, 25],           # Maximum depth of the tree
    'min_samples_split': [2,5,10],   # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1,2,4],    # Minimum number of samples required to be at a leaf node
    'max_features':['sqrt', 'log2'],  # Number of features to consider at each split
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2]  # Minimum weighted fraction of samples at a leaf
}

# 50 random candidates, each scored with 5-fold cross-validation on the
# training data (the estimator's default score is used).
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                                   n_iter=50, cv=5, verbose=2, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

# Best parameters
print("Best hyperparameters found by RandomizedSearchCV:")
print(random_search.best_params_)

# Model evaluation using test data (predict uses the refitted best estimator)
y_pred = random_search.predict(X_test)

# Performance metrics via the scikit-learn functions imported above. Unlike a
# hand-rolled |error / y| these guard against division by zero, and they are
# the same metric implementation the GA cells use. MAPE is reported in percent.
# NOTE(review): if the target was standardized during preprocessing these
# values are in standardized units — confirm.
mape = mean_absolute_percentage_error(y_test, y_pred) * 100
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAPE: {mape:.4f}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MSE: {mse:.4f}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best hyperparameters found by RandomizedSearchCV:
{'n_estimators': 50, 'min_weight_fraction_leaf': 0.0, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 25}
MAPE: 7.8843
MAE: 0.0207
RMSE: 0.0475
MSE: 0.0023


# 시각화

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib.font_manager as fm
import shutil

# Apply the seaborn white-grid style.
sns.set(style="whitegrid")

# Importances of the best estimator found by the search (random_search must
# already be fitted), labelled with the feature column names.
features = X.columns
feature_importances = random_search.best_estimator_.feature_importances_

# Wrap the importances in a Series and order them from most to least important.
unsorted_importances = pd.Series(feature_importances, index=features)
feature_importance_series = unsorted_importances.sort_values(ascending=False)

plt.figure(figsize=(10, 6))
ax = sns.barplot(x=feature_importance_series, y=feature_importance_series.index)

# Write each importance value next to the end of its bar.
for row, (feature_name, score) in enumerate(feature_importance_series.items()):
    ax.text(score, row, f'{score:.4f}', va='center', ha='left', fontsize=10, color='black')

plt.title('Feature Importance from Random Forest Model (with values)', fontsize=14)
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.show()

In [None]:
# Mount Google Drive at /content/drive (same call as in the first code cell).
from google.colab import drive
drive.mount('/content/drive')

# GA - 주열

In [4]:
!pip install deap


Collecting deap
  Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading deap-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (135 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/135.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deap
Successfully installed deap-1.4.1


In [7]:
from deap import base, creator, tools, algorithms
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error as mape

# Data preparation: separate the target from the features and hold out 20%
# of the rows as a test set (fixed seed).
target_column = 'Efficiency (Instantaneous)'
y = combined_data[target_column]
X = combined_data.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters used to seed one individual of the GA population.
# NOTE(review): the RandomizedSearchCV output recorded in this notebook reports
# n_estimators=50, max_depth=25, min_samples_split=5, min_samples_leaf=1 —
# confirm which run these values come from.
best_params_from_random_search = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
)

# Fitness function for the GA
def evaluate(individual):
    """Score one GA individual by the validation MAPE of a random forest.

    Args:
        individual: Sequence of four genes, in order ``(n_estimators,
            max_depth, min_samples_split, min_samples_leaf)``. Genes may be
            floats (blend crossover produces them); each is truncated to int
            and clamped to the smallest value scikit-learn accepts.

    Returns:
        A one-element tuple ``(mape,)``, matching the single-weight fitness
        this function is registered with.
    """
    # Blend crossover can push any gene below its valid minimum, so clamp all
    # four, not only the two sample-count parameters.
    n_estimators = max(1, int(individual[0]))
    max_depth = max(1, int(individual[1]))
    min_samples_split = max(2, int(individual[2]))
    min_samples_leaf = max(1, int(individual[3]))

    # Score on a validation split carved out of the training data. Scoring on
    # X_test here would tune the hyperparameters on the very data used for the
    # final report and make that report optimistic. The fixed seed gives every
    # individual the same split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42
    )

    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,  # build trees in parallel; results are unchanged for a fixed seed
    )
    rf.fit(X_fit, y_fit)
    return (mape(y_val, rf.predict(X_val)),)

# GA setup
# Guard the creator calls: DEAP warns and overwrites the classes when a cell
# that creates them is run a second time.
if not hasattr(creator, "FitnessMin"):
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # minimize
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMin)

# Per-gene search bounds, in the order
# (n_estimators, max_depth, min_samples_split, min_samples_leaf).
gene_low = [50, 5, 2, 1]
gene_up = [200, 50, 20, 20]

toolbox = base.Toolbox()
toolbox.register("n_estimators", np.random.randint, 50, 200)
toolbox.register("max_depth", np.random.randint, 5, 50)
toolbox.register("min_samples_split", np.random.randint, 2, 20)  # at least 2
toolbox.register("min_samples_leaf", np.random.randint, 1, 20)  # at least 1

toolbox.register(
    "individual",
    tools.initCycle,
    creator.Individual,
    (toolbox.n_estimators, toolbox.max_depth, toolbox.min_samples_split, toolbox.min_samples_leaf),
    n=1
)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the fitness function
toolbox.register("evaluate", evaluate)

# Register crossover, mutation and selection
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutPolynomialBounded, low=gene_low, up=gene_up, eta=1.0, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)


def clamp_genes(low, up):
    """Return a decorator that clips every gene of the produced offspring to [low, up]."""
    def decorator(operator):
        def wrapper(*args, **kwargs):
            offspring = operator(*args, **kwargs)
            for child in offspring:
                for i, (lo, hi) in enumerate(zip(low, up)):
                    child[i] = min(max(child[i], lo), hi)
            return offspring
        return wrapper
    return decorator


# cxBlend with alpha=0.5 can place a child outside its parents' range (e.g. a
# negative max_depth), which scikit-learn rejects; keep offspring inside the
# search bounds.
toolbox.decorate("mate", clamp_genes(gene_low, gene_up))
toolbox.decorate("mutate", clamp_genes(gene_low, gene_up))

# Initial population: random individuals plus one seeded with the
# previously found hyperparameters.
population_size = 20
population = toolbox.population(n=population_size - 1)
population.append(creator.Individual([
    best_params_from_random_search['n_estimators'],
    best_params_from_random_search['max_depth'],
    best_params_from_random_search['min_samples_split'],
    best_params_from_random_search['min_samples_leaf']
]))

# GA run parameters
ngen = 40  # number of generations
cxpb = 0.5  # crossover probability
mutpb = 0.2  # mutation probability

# eaSimple has no elitism, so the best individual can disappear from the final
# population; the hall of fame keeps a copy of the best one ever evaluated.
hall_of_fame = tools.HallOfFame(1)
result_population, logbook = algorithms.eaSimple(
    population, toolbox, cxpb, mutpb, ngen, halloffame=hall_of_fame, verbose=True
)

# Best hyperparameters found; its stored fitness is reused instead of
# training another forest just to print the score.
best_individual = hall_of_fame[0]
print("Best individual is: ", best_individual)
print("With MAPE: ", best_individual.fitness.values[0])

# Retrain on the full training set with the best parameters
optimal_rf = RandomForestRegressor(
    n_estimators=int(best_individual[0]),
    max_depth=int(best_individual[1]),
    min_samples_split=max(2, int(best_individual[2])),  # at least 2
    min_samples_leaf=max(1, int(best_individual[3])),  # at least 1
    random_state=42
)
optimal_rf.fit(X_train, y_train)

# Evaluate the retrained model on the test set
y_pred = optimal_rf.predict(X_test)
final_mape = mape(y_test, y_pred)
print(f"Final MAPE with optimal parameters: {final_mape:.4f}")


gen	nevals
0  	20    


KeyboardInterrupt: 

# GA - Jennie

In [None]:
import random
import numpy as np
from deap import base, creator, tools, algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.model_selection import train_test_split

# combined_data is assumed to be preprocessed already: separate the target
# from the features and hold out 20% of the rows as a test set (fixed seed).
target_column = 'Efficiency (Instantaneous)'
y = combined_data[target_column]
X = combined_data.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters used to seed one individual of the GA population.
# NOTE(review): the RandomizedSearchCV output recorded in this notebook reports
# n_estimators=50, max_depth=25, min_samples_split=5, min_samples_leaf=1 —
# confirm which run these values come from.
best_params_from_random_search = dict(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
)

# Define the objective function for GA to minimize (MAPE)
def objective_function(individual):
    """Score one GA individual by the validation MAPE of a random forest.

    Args:
        individual: Sequence of four genes, in order ``(n_estimators,
            max_depth, min_samples_split, min_samples_leaf)``. Each gene is
            converted to int and clamped to the smallest value scikit-learn
            accepts.

    Returns:
        A one-element tuple ``(mape,)``, matching the single-weight fitness
        this function is registered with.
    """
    n_estimators = max(1, int(individual[0]))
    max_depth = max(1, int(individual[1]))
    min_samples_split = max(2, int(individual[2]))
    min_samples_leaf = max(1, int(individual[3]))

    # Score on a validation split carved out of the training data. Scoring on
    # X_test here would tune the hyperparameters on the very data used for the
    # final report and make that report optimistic. The fixed seed gives every
    # individual the same split.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42
    )

    # Create the RandomForest model
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,  # build trees in parallel; results are unchanged for a fixed seed
    )

    # Train on the fitting part, then score on the validation part
    rf.fit(X_fit, y_fit)
    return (mape(y_val, rf.predict(X_val)),)

# GA setup
# Guard the creator calls: DEAP warns and overwrites the classes when a cell
# that creates them is run a second time.
if not hasattr(creator, "FitnessMin"):
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # We are minimizing MAPE
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMin)

# Register hyperparameters to be optimized
toolbox = base.Toolbox()
toolbox.register("attr_n_estimators", random.randint, 50, 200)
toolbox.register("attr_max_depth", random.randint, 5, 30)
toolbox.register("attr_min_samples_split", random.randint, 2, 10)
toolbox.register("attr_min_samples_leaf", random.randint, 1, 5)

# Define an individual and a population
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_n_estimators, toolbox.attr_max_depth,
                  toolbox.attr_min_samples_split, toolbox.attr_min_samples_leaf), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Include the best params from RandomizedSearchCV in the initial population
population_size = 30
population = toolbox.population(n=population_size - 1)
population.append(creator.Individual([
    best_params_from_random_search['n_estimators'],
    best_params_from_random_search['max_depth'],
    best_params_from_random_search['min_samples_split'],
    best_params_from_random_search['min_samples_leaf']
]))

# Register Genetic Algorithm components
toolbox.register("mate", tools.cxTwoPoint)  # Crossover
toolbox.register("mutate", tools.mutUniformInt, low=[50, 5, 2, 1], up=[200, 30, 10, 5], indpb=0.2)  # Mutation
toolbox.register("select", tools.selTournament, tournsize=3)  # Selection
toolbox.register("evaluate", objective_function)  # Fitness evaluation

# Early stopping criteria (optional)
early_stop_threshold = 5
no_improvement_counter = 0

# Define number of generations and probability parameters
ngen = 15  # Fewer generations
cxpb = 0.6  # Crossover probability
mutpb = 0.2  # Mutation probability

# The loop below replaces the whole population each generation (no elitism),
# so keep a copy of the best individual ever evaluated. This guarantees the
# reported hyperparameters are the ones that achieved the reported MAPE.
hall_of_fame = tools.HallOfFame(1)

# Evaluate the initial population before the first selection; a tournament
# over individuals without fitness values cannot prefer the better ones.
for ind, fit in zip(population, map(toolbox.evaluate, population)):
    ind.fitness.values = fit
hall_of_fame.update(population)
best_mape = hall_of_fame[0].fitness.values[0]

# Run the GA optimization
for gen in range(ngen):
    offspring = toolbox.select(population, len(population))
    offspring = list(map(toolbox.clone, offspring))

    # Apply crossover and mutation; modified individuals lose their fitness
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < cxpb:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:
        if random.random() < mutpb:
            toolbox.mutate(mutant)
            del mutant.fitness.values

    # Evaluate only the offspring whose fitness was invalidated
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit

    # Replace the old population with the new
    population[:] = offspring
    hall_of_fame.update(population)

    # Track the best solution using its stored fitness (no extra model
    # training) and implement early stopping
    current_mape = hall_of_fame[0].fitness.values[0]

    if current_mape < best_mape:
        best_mape = current_mape
        no_improvement_counter = 0
    else:
        no_improvement_counter += 1

    # Early stopping if no improvement for 5 generations
    if no_improvement_counter >= early_stop_threshold:
        print("Early stopping triggered.")
        break

# Print the best individual and its MAPE
best_individual = hall_of_fame[0]
print(f'Best hyperparameters: n_estimators={best_individual[0]}, max_depth={best_individual[1]}, '
      f'min_samples_split={best_individual[2]}, min_samples_leaf={best_individual[3]}')
print(f'Best MAPE: {best_mape}')

# Retrain the model with the best hyperparameters
optimal_rf = RandomForestRegressor(
    n_estimators=int(best_individual[0]),
    max_depth=int(best_individual[1]),
    min_samples_split=int(best_individual[2]),
    min_samples_leaf=int(best_individual[3]),
    random_state=42
)
optimal_rf.fit(X_train, y_train)

# Final evaluation on test set
y_pred = optimal_rf.predict(X_test)
final_mape = mape(y_test, y_pred)
print(f'Final MAPE with optimal parameters: {final_mape:.4f}')
