<a target="_blank" href="https://colab.research.google.com/github/BuczynskiRafal/stormwater-analysis/blob/main/stormwater_analysis/data/catchment_classification_model/first_approach_classification.ipynb">

<a target="_blank" href="https://colab.research.google.com/github/BuczynskiRafal/stormwater-analysis/blob/main/stormwater_analysis/data/catchment_classification_model/first_approach_classification.ipynb">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Training a model for catchment classification. 
* kategoryzacja zlewni oparta o większość dostępnych ceech zlewni z plików inp i rpt

# Imports

In [20]:
import swmmio
import pyswmm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Normalization, Input
from tensorflow.keras.callbacks import EarlyStopping
# from tensorflow.keras.layers.experimental import preprocessing


desired_width = 500
pd.set_option("display.width", desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option("display.max_columns", 30)

In [3]:
classes = pd.DataFrame(
    data={
        "classes": [
            "compact_urban_development",
            "urban",
            "loose_urban_development",
            "wooded_area",
            "grassy",
            "loose_soil",
            "steep_area",
        ]
    }
)
classes

Unnamed: 0,classes
0,compact_urban_development
1,urban
2,loose_urban_development
3,wooded_area
4,grassy
5,loose_soil
6,steep_area


# Get files

In [4]:
INP_FILE_01 = "dataset/subcatchment_dataset_01.inp"
INP_FILE_02 = "dataset/subcatchment_dataset_02.inp"
INP_FILE_03 = "dataset/subcatchment_dataset_03.inp"
INP_FILE_04 = "dataset/subcatchment_dataset_04.inp"
INP_FILE_05 = "dataset/subcatchment_dataset_05.inp"
INP_FILE_06 = "dataset/subcatchment_dataset_06.inp"
INP_FILE_07 = "dataset/subcatchment_dataset_07.inp"
INP_FILE_08 = "dataset/subcatchment_dataset_08.inp"
INP_FILE_09 = "dataset/subcatchment_dataset_09.inp"
INP_FILE_10 = "dataset/subcatchment_dataset_10.inp"
files = [INP_FILE_01, INP_FILE_02, INP_FILE_03, INP_FILE_04, INP_FILE_05, INP_FILE_06, INP_FILE_07, INP_FILE_08, INP_FILE_09, INP_FILE_10]

# Run simulation

In [5]:
for f in files:
    with pyswmm.Simulation(f) as sim:
        for step in sim:
            pass

# Read inp and rpt file as swmmio model object

In [6]:
model_01 = swmmio.Model(INP_FILE_01)
model_02 = swmmio.Model(INP_FILE_02)
model_03 = swmmio.Model(INP_FILE_03)
model_04 = swmmio.Model(INP_FILE_04)
model_05 = swmmio.Model(INP_FILE_05)
model_06 = swmmio.Model(INP_FILE_06)
model_07 = swmmio.Model(INP_FILE_07)
model_08 = swmmio.Model(INP_FILE_08)
model_09 = swmmio.Model(INP_FILE_09)
model_10 = swmmio.Model(INP_FILE_10)

models = [model_01, model_02, model_03, model_04, model_05, model_06, model_07, model_08, model_09, model_10]

## Get subcatchments data from the model

In [7]:
raw_subcatchments_01 = model_01.subcatchments.dataframe
raw_subcatchments_02 = model_02.subcatchments.dataframe
raw_subcatchments_03 = model_03.subcatchments.dataframe
raw_subcatchments_04 = model_04.subcatchments.dataframe
raw_subcatchments_05 = model_05.subcatchments.dataframe
raw_subcatchments_06 = model_06.subcatchments.dataframe
raw_subcatchments_07 = model_07.subcatchments.dataframe
raw_subcatchments_08 = model_08.subcatchments.dataframe
raw_subcatchments_09 = model_09.subcatchments.dataframe
raw_subcatchments_10 = model_10.subcatchments.dataframe

subcatchments_01 = raw_subcatchments_01.copy()
subcatchments_02 = raw_subcatchments_02.copy()
subcatchments_03 = raw_subcatchments_03.copy()
subcatchments_04 = raw_subcatchments_04.copy()
subcatchments_05 = raw_subcatchments_05.copy()
subcatchments_06 = raw_subcatchments_06.copy()
subcatchments_07 = raw_subcatchments_07.copy()
subcatchments_08 = raw_subcatchments_08.copy()
subcatchments_09 = raw_subcatchments_09.copy()
subcatchments_10 = raw_subcatchments_10.copy()

subcatchments = [subcatchments_01, subcatchments_02, subcatchments_03, subcatchments_04, subcatchments_05, subcatchments_06, subcatchments_07, subcatchments_08, subcatchments_09, subcatchments_10]

### Drop unused columns

In [8]:
for frame in subcatchments:
    frame.drop(['N-Imperv', 'N-Perv', 'S-Imperv',	'S-Perv', 'coords', 'RouteTo', 'CurbLength', 'Outlet', 'Raingage', 'TotalRunon', 'TotalEvap', 'TotalInfil', 'ImpervRunoff', 'PervRunoff', 'TotalRunoffIn'], axis=1, inplace=True)

In [9]:
subcatchments_01.iloc[:5, :]

Unnamed: 0_level_0,Area,PercImperv,Width,PercSlope,PctZero,TotalPrecip,TotalRunoffMG,PeakRunoff,RunoffCoeff
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
S1,0.5,20.0,300.0,5.0,70,10.1,0.02,0.02,0.407
S10,1.87,45.0,136.75,15.0,90,10.1,0.11,0.09,0.587
S100,1.71,45.0,130.77,10.0,90,10.1,0.1,0.08,0.581
S1000,0.68,10.0,82.46,5.09,10,10.1,0.02,0.01,0.224
S1001,0.26,83.33,50.99,45.0,80,10.1,0.02,0.02,0.879


## Get categories

In [10]:
categories_01 = model_01.inp.tags
categories_02 = model_02.inp.tags
categories_03 = model_03.inp.tags
categories_04 = model_04.inp.tags
categories_05 = model_05.inp.tags
categories_06 = model_06.inp.tags
categories_07 = model_07.inp.tags
categories_08 = model_08.inp.tags
categories_09 = model_09.inp.tags
categories_10 = model_10.inp.tags
categories_01

Unnamed: 0_level_0,Name,Tag
ElementType,Unnamed: 1_level_1,Unnamed: 2_level_1
Subcatch,S1,loose_soil
Subcatch,S2,compact_urban_development
Subcatch,S3,loose_soil
Subcatch,S4,wooded_area
Subcatch,S5,loose_urban_development
...,...,...
Subcatch,S997,grassy
Subcatch,S998,steep_area
Subcatch,S999,urban
Subcatch,S1000,grassy


### Add categories column to subcatchments DataFrame

In [11]:
def merge_tag(subcatchment, model):
    subcatchment.reset_index(inplace=True)
    merged = subcatchment.merge(model.inp.tags, left_on="Name", right_on="Name", how="left")
    merged.rename(columns={"Tag": "categories"}, inplace=True)
    merged.set_index("Name", inplace=True)
    return merged

merged_df_01 = merge_tag(subcatchments_01, model_01)
merged_df_02 = merge_tag(subcatchments_02, model_02)
merged_df_03 = merge_tag(subcatchments_03, model_03)
merged_df_04 = merge_tag(subcatchments_04, model_04)
merged_df_05 = merge_tag(subcatchments_05, model_05)
merged_df_06 = merge_tag(subcatchments_06, model_06)
merged_df_07 = merge_tag(subcatchments_07, model_07)
merged_df_08 = merge_tag(subcatchments_08, model_08)
merged_df_09 = merge_tag(subcatchments_09, model_09)
merged_df_10 = merge_tag(subcatchments_10, model_10)

merged = [merged_df_01, merged_df_02, merged_df_03, merged_df_04, merged_df_05, merged_df_06, merged_df_07, merged_df_08, merged_df_09, merged_df_10]

df = pd.concat(merged)
df = df.reset_index(drop=True)

In [13]:
df.head()

Unnamed: 0,Area,PercImperv,Width,PercSlope,PctZero,TotalPrecip,TotalRunoffMG,PeakRunoff,RunoffCoeff,categories
0,0.5,20.0,300.0,5.0,70,10.1,0.02,0.02,0.407,loose_soil
1,1.87,45.0,136.75,15.0,90,10.1,0.11,0.09,0.587,compact_urban_development
2,1.71,45.0,130.77,10.0,90,10.1,0.1,0.08,0.581,compact_urban_development
3,0.68,10.0,82.46,5.09,10,10.1,0.02,0.01,0.224,grassy
4,0.26,83.33,50.99,45.0,80,10.1,0.02,0.02,0.879,steep_area


### Split data into features and target

In [14]:
X = df.drop('categories', axis=1)
y = df['categories']

X["TotalPrecip"] = pd.to_numeric(X["TotalPrecip"])
# X["TotalRunon"] = pd.to_numeric(X["TotalRunon"])
# X["TotalEvap"] = pd.to_numeric(X["TotalEvap"])
# X["TotalInfil"] = pd.to_numeric(X["TotalInfil"])
# X["PervRunoff"] = pd.to_numeric(X["PervRunoff"])
# X["ImpervRunoff"] = pd.to_numeric(X["ImpervRunoff"])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(8008, 9) (8008, 7)
(2002, 9) (2002, 7)


In [15]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8008 entries, 2270 to 7270
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Area           8008 non-null   float64
 1   PercImperv     8008 non-null   float64
 2   Width          8008 non-null   float64
 3   PercSlope      8008 non-null   float64
 4   PctZero        8008 non-null   int64  
 5   TotalPrecip    8008 non-null   float64
 6   TotalRunoffMG  8008 non-null   float64
 7   PeakRunoff     8008 non-null   float64
 8   RunoffCoeff    8008 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 625.6 KB


In [16]:
X_train[:50]

Unnamed: 0,Area,PercImperv,Width,PercSlope,PctZero,TotalPrecip,TotalRunoffMG,PeakRunoff,RunoffCoeff
2270,0.02,10.0,14.14,20.0,10,38.33,0.01,0.01,0.825
8668,1.87,10.0,136.75,20.0,10,12.67,0.06,0.01,0.256
8293,0.81,10.0,90.0,20.0,10,12.67,0.03,0.01,0.28
7868,1.46,10.0,120.83,5.09,10,9.5,0.01,0.0,0.101
1095,1.61,5.09,126.89,15.0,10,20.0,0.19,0.14,0.59
3131,0.58,15.0,76.16,30.0,5,15.0,0.02,0.01,0.259
8116,1.96,10.0,140.0,5.09,10,12.67,0.05,0.01,0.214
1561,0.89,5.09,94.34,15.0,10,20.0,0.11,0.09,0.606
1670,1.52,65.0,123.29,61.67,80,20.0,0.27,0.21,0.879
9951,1.68,45.0,129.61,5.09,90,0.32,0.0,0.0,0.403


In [21]:
normalizer = Normalization()
normalizer.adapt(np.array(X_train))


# Build the model

In [22]:
# model = Sequential()
# model.add(normalizer)
# model.add(Dense(units=X_train.shape[1], activation='relu', input_dim=X_train.shape[1]))
# model.add(Dropout(0.2))
# model.add(Dense(18, activation='relu'))
# model.add(Dropout(0.2))
# model.add(Dense(units=y_train.shape[1], activation='softmax'))
# model.summary()


model = Sequential([
    Input(shape=(X_train.shape[1],)),  # Dodanie Input Layer zamiast input_dim w Dense
    normalizer,
    Dense(units=X_train.shape[1], activation='relu'),
    Dropout(0.2),
    Dense(18, activation='relu'),
    Dropout(0.2),
    Dense(units=y_train.shape[1], activation='softmax')
])

model.summary()

In [23]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    validation_split=0.2,
    verbose=1,
    batch_size=16,
    callbacks=[EarlyStopping(monitor='val_loss', patience=10)]
)

Epoch 1/100
[1m501/501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.3532 - loss: 1.7535 - val_accuracy: 0.6708 - val_loss: 0.9539
Epoch 2/100
[1m501/501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5907 - loss: 1.0527 - val_accuracy: 0.7348 - val_loss: 0.6809
Epoch 3/100
[1m501/501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6653 - loss: 0.8309 - val_accuracy: 0.8671 - val_loss: 0.5327
Epoch 4/100
[1m501/501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7289 - loss: 0.6997 - val_accuracy: 0.9161 - val_loss: 0.4087
Epoch 5/100
[1m501/501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7770 - loss: 0.5953 - val_accuracy: 0.9231 - val_loss: 0.3052
Epoch 6/100
[1m501/501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7938 - loss: 0.5162 - val_accuracy: 0.9261 - val_loss: 0.2320
Epoch 7/100
[1m501/50

In [24]:
# Evaluate ANN model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test set accuracy: {accuracy:.3f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0044
Test set accuracy: 1.000


In [25]:
# model.predict(X_test)
y_pred = model.predict(X_test)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [26]:
metrics = pd.DataFrame(history.history)
metrics['epoch'] = history.epoch
metrics

Unnamed: 0,accuracy,loss,val_accuracy,val_loss,epoch
0,0.468781,1.497201,0.670829,0.953897,0
1,0.625125,0.963627,0.734765,0.680919,1
2,0.692932,0.78874,0.867133,0.53265,2
3,0.745504,0.673396,0.916084,0.408687,3
4,0.781344,0.573008,0.923077,0.305236,4
5,0.802448,0.492889,0.926074,0.231999,5
6,0.834915,0.415817,0.926573,0.165817,6
7,0.871379,0.347341,0.987013,0.118346,7
8,0.899351,0.282165,0.996503,0.086296,8
9,0.907343,0.255308,0.993506,0.066328,9


# Model evaluate

In [27]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=1, cols=2)
fig.add_trace(go.Scatter(x=metrics['epoch'], y=metrics['accuracy'], name='accuracy'), row=1, col=1)
fig.add_trace(go.Scatter(x=metrics['epoch'], y=metrics['loss'], name='loss'), row=1, col=2)
fig.add_trace(go.Scatter(x=metrics['epoch'], y=metrics['val_accuracy'], name='val_accuracy'), row=1, col=1)
fig.add_trace(go.Scatter(x=metrics['epoch'], y=metrics['val_loss'], name='val_loss'), row=1, col=2)

fig.update_xaxes(title_text='epochs')
fig.update_yaxes(title_text='accuracy', row=1, col=1)
fig.update_yaxes(title_text='loss', row=1, col=2)
fig.update_layout(width=1000, title='Accuracy and Loss')
fig.show()

In [28]:
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print(test_acc)

1.0


Predykcja na podstawie modelu.
* model.evaluate(y_true, y_pred) - pozwala obliczyć metryki modelu
* model.predict_classes() - pozwala zwrócić odpowiednio przewidziane klasy
* model.predict_proba(), model.predict() - pozwala zwrócić prawdopodobieństwo danej klasy

In [29]:
predictions = model.predict(X_test)
predictions

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


array([[4.7946610e-34, 1.0000000e+00, 4.7604715e-11, ..., 6.1301779e-25, 3.4358207e-17, 8.6972562e-27],
       [3.6285966e-36, 1.0000000e+00, 1.1931296e-09, ..., 5.2703477e-26, 3.6148277e-18, 5.4745868e-27],
       [6.6952460e-05, 1.4827287e-16, 3.9370196e-23, ..., 6.9322764e-10, 9.9993300e-01, 4.6980023e-20],
       ...,
       [2.5773288e-13, 1.0000000e+00, 9.0302478e-13, ..., 2.2426110e-10, 1.0944027e-10, 6.7731287e-09],
       [7.8852318e-06, 0.0000000e+00, 0.0000000e+00, ..., 9.9999213e-01, 0.0000000e+00, 5.8488611e-22],
       [8.8950037e-07, 5.9983533e-05, 0.0000000e+00, ..., 1.2915916e-04, 3.4836165e-37, 9.9981004e-01]], dtype=float32)

In [30]:
predictions_df = pd.DataFrame(predictions)
predictions_df

Unnamed: 0,0,1,2,3,4,5,6
0,4.794661e-34,1.000000e+00,4.760472e-11,1.587735e-12,6.130178e-25,3.435821e-17,8.697256e-27
1,3.628597e-36,1.000000e+00,1.193130e-09,3.409753e-13,5.270348e-26,3.614828e-18,5.474587e-27
2,6.695246e-05,1.482729e-16,3.937020e-23,5.892462e-11,6.932276e-10,9.999330e-01,4.698002e-20
3,1.868925e-03,2.392493e-04,0.000000e+00,1.445893e-31,5.974065e-03,2.642885e-19,9.919178e-01
4,1.554817e-05,3.907212e-21,2.905704e-33,2.212286e-13,4.195121e-14,9.999845e-01,1.468950e-31
...,...,...,...,...,...,...,...
1997,3.287582e-13,0.000000e+00,0.000000e+00,0.000000e+00,1.000000e+00,0.000000e+00,1.744623e-36
1998,1.198635e-05,0.000000e+00,0.000000e+00,0.000000e+00,9.999880e-01,8.759131e-38,1.069375e-21
1999,2.577329e-13,1.000000e+00,9.030248e-13,1.605759e-11,2.242611e-10,1.094403e-10,6.773129e-09
2000,7.885232e-06,0.000000e+00,0.000000e+00,0.000000e+00,9.999921e-01,0.000000e+00,5.848861e-22


In [31]:
predictions_cls = predictions.argmax(axis=-1)
predictions_cls

array([1, 1, 5, ..., 1, 4, 6], dtype=int64)

In [36]:
!pip show tensorflow


Name: tensorflow
Version: 2.16.1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: c:\users\dell\documents\git\stormwater-analysis\venv\lib\site-packages
Requires: tensorflow-intel
Required-by: 




# Zapisanie modelu

In [39]:
# model.save(r'C:\Users\Dell\Documents\Git\stormwater-analysis\stormwater_analysis\data\catchment_classification_model\catchemnt_classifier')
model.save(R'C:\Users\Dell\Documents\Git\stormwater-analysis\stormwater_analysis\data\catchment_classification_model\catchemnt_classifier\model.keras')


# Załadowanie modelu

In [40]:
from tensorflow import keras
model = keras.models.load_model(R'C:\Users\Dell\Documents\Git\stormwater-analysis\stormwater_analysis\data\catchment_classification_model\catchemnt_classifier\model.keras')

In [41]:
X_test

Unnamed: 0,Area,PercImperv,Width,PercSlope,PctZero,TotalPrecip,TotalRunoffMG,PeakRunoff,RunoffCoeff
7438,1.43,5.09,119.58,10.00,10,9.50,0.01,0.00,0.063
9058,1.00,5.09,100.00,10.00,10,0.32,0.00,0.00,0.005
568,0.77,30.00,87.75,15.00,80,10.10,0.04,0.03,0.462
9344,0.60,15.00,77.46,30.00,5,0.32,0.00,0.00,0.007
2236,1.39,30.00,117.90,5.09,80,38.33,0.44,0.30,0.820
...,...,...,...,...,...,...,...,...,...
533,0.19,83.33,43.59,45.00,80,10.10,0.02,0.01,0.880
132,1.36,65.00,116.62,61.67,80,10.10,0.10,0.09,0.761
7329,0.30,10.00,54.77,20.00,10,9.50,0.00,0.00,0.127
8466,1.25,65.00,111.80,61.67,80,12.67,0.12,0.04,0.772


In [42]:
! pip install pydot






In [43]:
from tensorflow.keras.utils import plot_model
plot_model(model)

You must install graphviz (see instructions at https://graphviz.gitlab.io/download/) for `plot_model` to work.


In [44]:
pred = model.predict(X_test)
pred

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


array([[4.7946610e-34, 1.0000000e+00, 4.7604715e-11, ..., 6.1301779e-25, 3.4358207e-17, 8.6972562e-27],
       [3.6285966e-36, 1.0000000e+00, 1.1931296e-09, ..., 5.2703477e-26, 3.6148277e-18, 5.4745868e-27],
       [6.6952460e-05, 1.4827287e-16, 3.9370196e-23, ..., 6.9322764e-10, 9.9993300e-01, 4.6980023e-20],
       ...,
       [2.5773288e-13, 1.0000000e+00, 9.0302478e-13, ..., 2.2426110e-10, 1.0944027e-10, 6.7731287e-09],
       [7.8852318e-06, 0.0000000e+00, 0.0000000e+00, ..., 9.9999213e-01, 0.0000000e+00, 5.8488611e-22],
       [8.8950037e-07, 5.9983533e-05, 0.0000000e+00, ..., 1.2915916e-04, 3.4836165e-37, 9.9981004e-01]], dtype=float32)

In [45]:
tensorflow.version

NameError: name 'tensorflow' is not defined