###### Source

This dataset has been obtained from Kaggle

[Flood Prediction Dataset](https://www.kaggle.com/datasets/naiyakhalid/flood-prediction-dataset/)

###### Citations:
- Tellman, B., J.A. Sullivan, C. Kuhn, A.J. Kettner, C.S. Doyle, G.R. Brakenridge, T. Erickson, D.A. Slayback. (Accepted.) Satellites observe increasing proportion of population exposed to floods. Nature. doi:10.1038/s41586-021-03695-w

imports

In [145]:
import pandas as pd
import tensorflow as tf

from keras.layers import Dense, Dropout, Input
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import Huber
from tensorflow.keras.models import Sequential

read and analyse data

In [146]:
df = pd.read_csv("flood.csv")

In [147]:
df.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,3,8,6,6,4,4,6,2,3,2,...,10,7,4,2,3,4,3,2,6,0.45
1,8,4,5,7,7,9,1,5,5,4,...,9,2,6,2,1,1,9,1,3,0.475
2,3,10,4,1,7,5,4,7,4,9,...,7,4,4,8,6,1,8,3,6,0.515
3,4,4,2,7,3,4,1,4,6,4,...,4,2,6,6,8,8,6,6,10,0.52
4,3,7,5,2,5,8,5,2,7,5,...,7,6,5,3,3,4,4,3,4,0.475


In [148]:
df.columns

Index(['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability'],
      dtype='object')

In [149]:
json_data = df.to_json(orient="records", indent=4)

In [150]:
with open("flood.json", "w") as f:
    f.write(json_data)

In [151]:
df = pd.read_json("flood.json", orient="records")

In [152]:
df.head()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,3,8,6,6,4,4,6,2,3,2,...,10,7,4,2,3,4,3,2,6,0.45
1,8,4,5,7,7,9,1,5,5,4,...,9,2,6,2,1,1,9,1,3,0.475
2,3,10,4,1,7,5,4,7,4,9,...,7,4,4,8,6,1,8,3,6,0.515
3,4,4,2,7,3,4,1,4,6,4,...,4,2,6,6,8,8,6,6,10,0.52
4,3,7,5,2,5,8,5,2,7,5,...,7,6,5,3,3,4,4,3,4,0.475


In [153]:
df.columns

Index(['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'FloodProbability'],
      dtype='object')

In [154]:
print(df.isnull().sum())

MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
FloodProbability                   0
dtype: int64


- There are no missing data

In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   MonsoonIntensity                 50000 non-null  int64  
 1   TopographyDrainage               50000 non-null  int64  
 2   RiverManagement                  50000 non-null  int64  
 3   Deforestation                    50000 non-null  int64  
 4   Urbanization                     50000 non-null  int64  
 5   ClimateChange                    50000 non-null  int64  
 6   DamsQuality                      50000 non-null  int64  
 7   Siltation                        50000 non-null  int64  
 8   AgriculturalPractices            50000 non-null  int64  
 9   Encroachments                    50000 non-null  int64  
 10  IneffectiveDisasterPreparedness  50000 non-null  int64  
 11  DrainageSystems                  50000 non-null  int64  
 12  CoastalVulnerabili

In [156]:
df.describe()

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,4.99148,4.9841,5.01594,5.00848,4.98906,4.98834,5.01536,4.9886,5.00612,5.00638,...,5.00606,4.99992,4.98422,4.97982,4.9882,4.98498,5.00512,4.99436,4.99052,0.49966
std,2.236834,2.246488,2.23131,2.222743,2.243159,2.226761,2.245,2.232642,2.234588,2.241633,...,2.238107,2.247101,2.227741,2.23219,2.231134,2.238279,2.23176,2.230011,2.246075,0.050034
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285
25%,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0.465
50%,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.5
75%,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.535
max,16.0,18.0,16.0,17.0,17.0,17.0,16.0,16.0,16.0,18.0,...,17.0,17.0,16.0,16.0,17.0,19.0,22.0,16.0,16.0,0.725


separate independent and dependent variables

In [157]:
X = df.drop(columns=['FloodProbability'])
y = df['FloodProbability']

transforming feature values in the range [0, 1]

In [158]:
scaler = MinMaxScaler()
X_n = scaler.fit_transform(X)

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X_n, y, test_size=0.2, random_state=42)

checking the shape to help determine the number of neurons per layer

In [160]:
X_train.shape

(40000, 20)

set seed for consistency

In [161]:
tf.random.set_seed(42)

### designing model

#### features

- wide to narrow
- starting number of neurons 32, because input shape is 20
- minimum number of neurons 16, because using 20 / 2 = 10 as the lower limit

In [162]:
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation="relu"),
    Dense(16, activation="swish"),
    Dense(1, activation="linear")
])

### compiling model

#### features

- using adam optimizer as it works in most cases
- using huber loss instead of mean squared error, as huber loss is less sensitive to outliers
- using mean absolute error, and mean absolute percentage error as metrics (just for learning purpose)

In [163]:
model.compile(optimizer="adam", loss=Huber(delta=1.0), metrics=["mae", "mape"])

### designing early stopping

#### features

- monitoring the val_loss
- will wait for 2 epoch
- will restore best weights

In [164]:
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

### training model

#### features

- setting epoch number to 100, as dataset is small
- setting batch size to 32 to avoid overfitting
- using 20% data for validation
- allowing extra verbose outputs category 1
- applying early stopping as callback

In [165]:
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=1, callbacks=[early_stopping])

Epoch 1/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0032 - mae: 0.0426 - mape: 8.5969 - val_loss: 3.2988e-05 - val_mae: 0.0060 - val_mape: 1.1996
Epoch 2/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 2.5021e-05 - mae: 0.0052 - mape: 1.0435 - val_loss: 1.2960e-05 - val_mae: 0.0036 - val_mape: 0.7286
Epoch 3/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 1.0944e-05 - mae: 0.0034 - mape: 0.6850 - val_loss: 7.0342e-06 - val_mae: 0.0029 - val_mape: 0.5793
Epoch 4/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 6.7993e-06 - mae: 0.0027 - mape: 0.5489 - val_loss: 5.8990e-06 - val_mae: 0.0028 - val_mape: 0.5629
Epoch 5/100
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 4.8746e-06 - mae: 0.0023 - mape: 0.4721 - val_loss: 2.9037e-06 - val_mae: 0.0018 - val_mape: 0.3551
Epoch 6/100
[1m100

In [166]:
y_pred = model.predict(X_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [167]:
huber_loss = Huber(delta=1.0)
huber_loss(y_test, y_pred).numpy()

1.3133548e-06

In [168]:
mean_absolute_error(y_test, y_pred)

0.0012487128499746264

In [169]:
mean_absolute_percentage_error(y_test, y_pred)

0.0025033856091152243