PREDICT THE BURNED AREA OF FOREST FIRES WITH NEURAL NETWORKS


In [12]:

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


# Seed Python, NumPy and TensorFlow in one call so weight initialisation and
# batch shuffling are repeatable under Restart Kernel -> Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# ---- Load data -------------------------------------------------------------
data = pd.read_csv('forestfires.csv')

# Every column except the regression target `area` is treated as a feature.
X = data.drop(['area'], axis=1)
y = data['area']

# ---- Feature groups --------------------------------------------------------
# NOTE(review): 'size_category' may be derived from `area` (the target); if so,
# using it as a feature is label leakage -- TODO confirm against the data source.
categorical_features = ['month', 'day', 'size_category']
# Keep the DataFrame's own column order. The previous set-difference produced
# an arbitrary order that could change between kernel sessions.
numeric_features = [col for col in X.columns if col not in categorical_features]

# ---- Preprocessing ---------------------------------------------------------
numeric_transformer = StandardScaler()
# handle_unknown='ignore': a category that appears only in the test split is
# encoded as all zeros instead of raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# sparse_threshold=0 forces a dense ndarray, which is what Keras expects.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0)

# Pipeline wrapper around the preprocessing step.
preprocessor_model = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# ---- Train/test split ------------------------------------------------------
# Split BEFORE fitting the preprocessor so scaling statistics and the category
# vocabulary are learned from the training rows only (no test-set leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED)

X_train = preprocessor_model.fit_transform(X_train_raw)
X_test = preprocessor_model.transform(X_test_raw)

# Full transformed matrix, kept under its original name in case a later cell
# inspects it; it is not used for training or evaluation here.
X_transformed = preprocessor_model.transform(X)

# ---- Neural network architecture -------------------------------------------
# Two hidden ReLU layers and a single linear output unit for regression.
model = Sequential([
    Dense(64, input_dim=X_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])

# ---- Model compilation -----------------------------------------------------
model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mean_absolute_error'])

# ---- Model training --------------------------------------------------------
# validation_split holds out the last 20% of the training rows for monitoring.
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.2, verbose=2)

# ---- Model evaluation ------------------------------------------------------
# predict() returns shape (n, 1); flatten so it lines up with the 1-D target.
y_pred = model.predict(X_test).ravel()
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')


Epoch 1/50


21/21 - 1s - loss: 2565.8564 - mean_absolute_error: 12.1332 - val_loss: 608.5529 - val_mean_absolute_error: 7.2869 - 1s/epoch - 58ms/step
Epoch 2/50
21/21 - 0s - loss: 2538.2546 - mean_absolute_error: 12.2191 - val_loss: 597.0404 - val_mean_absolute_error: 7.5468 - 93ms/epoch - 4ms/step
Epoch 3/50
21/21 - 0s - loss: 2506.8606 - mean_absolute_error: 12.4063 - val_loss: 584.6758 - val_mean_absolute_error: 8.0335 - 85ms/epoch - 4ms/step
Epoch 4/50
21/21 - 0s - loss: 2466.7727 - mean_absolute_error: 12.6537 - val_loss: 574.7407 - val_mean_absolute_error: 8.5493 - 88ms/epoch - 4ms/step
Epoch 5/50
21/21 - 0s - loss: 2426.9075 - mean_absolute_error: 13.1494 - val_loss: 565.5430 - val_mean_absolute_error: 9.5493 - 96ms/epoch - 5ms/step
Epoch 6/50
21/21 - 0s - loss: 2378.9214 - mean_absolute_error: 13.7299 - val_loss: 560.0966 - val_mean_absolute_error: 10.3144 - 102ms/epoch - 5ms/step
Epoch 7/50
21/21 - 0s - loss: 2326.6284 - mean_absolute_error: 14.0837 - val_loss: 557.2957 - val

The dataset contains 36733 instances of 11 sensor measurements from a gas turbine, each aggregated over one hour (by average or sum). (The `gas_turbines.csv` file loaded below has 15039 rows.)
The dataset includes gas turbine parameters (such as turbine inlet temperature and compressor discharge pressure) in addition to the ambient variables.



Problem statement: predicting turbine energy yield (TEY) using ambient variables as features.



Attribute Information:

The explanations of sensor measurements and their brief statistics are given below.

Variable (Abbr.) Unit Min Max Mean
Ambient temperature (AT) C -6.23 37.10 17.71
Ambient pressure (AP) mbar 985.85 1036.56 1013.07
Ambient humidity (AH) (%) 24.08 100.20 77.87
Air filter difference pressure (AFDP) mbar 2.09 7.61 3.93
Gas turbine exhaust pressure (GTEP) mbar 17.70 40.72 25.56
Turbine inlet temperature (TIT) C 1000.85 1100.89 1081.43
Turbine after temperature (TAT) C 511.04 550.61 546.16
Compressor discharge pressure (CDP) mbar 9.85 15.16 12.06
Turbine energy yield (TEY) MWH 100.02 179.50 133.51
Carbon monoxide (CO) mg/m3 0.00 44.10 2.37
Nitrogen oxides (NOx) mg/m3 25.90 119.91 65.29

In [13]:

# All imports for this cell, grouped at the top.
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# ---- Load data -------------------------------------------------------------
data = pd.read_csv('gas_turbines.csv')

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
data.info()
print(data.describe())

# ---- Features / target -----------------------------------------------------
# NOTE(review): the problem statement says TEY is predicted from the *ambient*
# variables, but every non-TEY column is used as a feature here -- confirm
# which feature set is intended.
X = data.drop(['TEY'], axis=1)  # Features
y = data['TEY']  # Target variable

# ---- Train/test split ------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---- RandomForestRegressor model -------------------------------------------
# random_state pins the bootstrap sampling and feature selection so the
# reported metrics are reproducible across runs.
model = RandomForestRegressor(random_state=42)

# Training the model
model.fit(X_train, y_train)

# ---- Evaluation on the held-out 20% ----------------------------------------
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15039 entries, 0 to 15038
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      15039 non-null  float64
 1   AP      15039 non-null  float64
 2   AH      15039 non-null  float64
 3   AFDP    15039 non-null  float64
 4   GTEP    15039 non-null  float64
 5   TIT     15039 non-null  float64
 6   TAT     15039 non-null  float64
 7   TEY     15039 non-null  float64
 8   CDP     15039 non-null  float64
 9   CO      15039 non-null  float64
 10  NOX     15039 non-null  float64
dtypes: float64(11)
memory usage: 1.3 MB
None
                 AT           AP            AH          AFDP          GTEP  \
count  15039.000000  15039.00000  15039.000000  15039.000000  15039.000000   
mean      17.764381   1013.19924     79.124174      4.200294     25.419061   
std        7.574323      6.41076     13.793439      0.760197      4.173916   
min        0.522300    985.85000     30.344000      