<a href="https://colab.research.google.com/github/shreyassathyamangalam/colab_notebooks/blob/main/keras_tuner_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries ##

In [1]:
!pip install keras_tuner

Collecting keras_tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.7 kt-legacy-1.0.5


In [2]:
# tensorflow
import tensorflow as tf
from tensorflow.keras import backend as K
import keras_tuner as kt

# holy grail
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score


# boosting models
import lightgbm as lgb
from xgboost import XGBRegressor

# warnings
import warnings
warnings.filterwarnings(action="ignore", category=Warning)

## Import Data ##

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
!pwd

/content


In [5]:
!ls

drive  sample_data


In [6]:
!ls /content/drive/MyDrive

'Colab Notebooks'   test.csv   train.csv


In [7]:
df_train = pd.read_csv("/content/drive/MyDrive/train.csv")

In [8]:
df_train.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,...,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,...,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,...,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,...,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,...,2,2,6,6,4,1,2,3,5,0.415


In [9]:
df_test = pd.read_csv("/content/drive/MyDrive/test.csv")

In [10]:
df_test.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,...,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,...,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,...,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,...,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,...,4,3,2,6,4,6,8,4,5,5


In [11]:
df_test.columns.to_list()

['id',
 'MonsoonIntensity',
 'TopographyDrainage',
 'RiverManagement',
 'Deforestation',
 'Urbanization',
 'ClimateChange',
 'DamsQuality',
 'Siltation',
 'AgriculturalPractices',
 'Encroachments',
 'IneffectiveDisasterPreparedness',
 'DrainageSystems',
 'CoastalVulnerability',
 'Landslides',
 'Watersheds',
 'DeterioratingInfrastructure',
 'PopulationScore',
 'WetlandLoss',
 'InadequatePlanning',
 'PoliticalFactors']

In [12]:
num_cols=['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']

unique_vals = []
for df in [df_train, df_test]:
    for col in num_cols:
        unique_vals += list(df[col].unique())

unique_vals = list(set(unique_vals))
#
def getFeats(df):

    scaler = StandardScaler()

    df['ClimateAnthropogenicInteraction'] = (df['MonsoonIntensity'] + df['ClimateChange']) * (df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments'])
    df['InfrastructurePreventionInteraction'] = (df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']) * (df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning'])

    df['sum'] = df[num_cols].sum(axis=1)
    df['std']  = df[num_cols].std(axis=1)
    df['mean'] = df[num_cols].mean(axis=1)
    df['max']  = df[num_cols].max(axis=1)
    df['min']  = df[num_cols].min(axis=1)
    df['mode'] = df[num_cols].mode(axis=1)[0]
    df['median'] = df[num_cols].median(axis=1)
    df['q_25th'] = df[num_cols].quantile(0.25, axis=1)
    df['q_75th'] = df[num_cols].quantile(0.75, axis=1)
    df['skew'] = df[num_cols].skew(axis=1)
    df['kurt'] = df[num_cols].kurt(axis=1)
    df['sum_72_76'] = df['sum'].isin(np.arange(72, 76))
    for i in range(10,100,10):
        df[f'{i}th'] = df[num_cols].quantile(i/100, axis=1)
    df['harmonic'] = len(num_cols) / df[num_cols].apply(lambda x: (1/x).mean(), axis=1)
    df['geometric'] = df[num_cols].apply(lambda x: x.prod()**(1/len(x)), axis=1)
    df['zscore'] = df[num_cols].apply(lambda x: (x - x.mean()) / x.std(), axis=1).mean(axis=1)
    df['cv'] = df['std'] / df['mean']
    df['Skewness_75'] = (df[num_cols].quantile(0.75, axis=1) - df[num_cols].mean(axis=1)) / df[num_cols].std(axis=1)
    df['Skewness_25'] = (df[num_cols].quantile(0.25, axis=1) - df[num_cols].mean(axis=1)) / df[num_cols].std(axis=1)
    df['2ndMoment'] = df[num_cols].apply(lambda x: (x**2).mean(), axis=1)
    df['3rdMoment'] = df[num_cols].apply(lambda x: (x**3).mean(), axis=1)
    df['entropy'] = df[num_cols].apply(lambda x: -1*(x*np.log(x)).sum(), axis=1)
    df['rng'] = df['max'] - df['min']

    for v in unique_vals:
        if v<16:
            df['cnt_{}'.format(v)] = (df[num_cols] == v).sum(axis=1)

    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df

In [13]:
df_train['typ'] = 0
df_test['typ'] = 1

df_all = pd.concat(objs=[df_train, df_test], axis=0)
df_all = getFeats(df_all)
df_all.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,cnt_6,cnt_7,cnt_8,cnt_9,cnt_10,cnt_11,cnt_12,cnt_13,cnt_14,cnt_15
0,0,0.039335,1.467011,0.020654,1.489348,0.508518,-0.453698,-0.458827,-0.932697,-0.938761,...,1,2,2,0,0,0,0,0,0,0
1,1,0.525633,0.98945,-0.461971,-0.459918,1.468781,1.489614,-0.938314,0.034964,-0.455784,...,2,2,2,1,0,0,0,0,0,0
2,2,0.525633,0.034328,0.50328,1.002031,-0.931878,1.003786,-1.897287,0.034964,-0.455784,...,4,4,1,0,0,0,0,0,0,0
3,3,-0.933262,-0.443233,0.50328,0.027398,-0.451746,1.489614,-0.458827,1.002626,0.510172,...,3,3,2,0,0,0,0,0,0,0
4,4,0.039335,-0.920793,-1.427222,0.514715,-0.451746,-0.453698,-0.938314,-0.932697,-0.938761,...,3,0,0,0,0,0,0,0,0,0


In [14]:
df_train = df_all[df_all['typ']==0]
df_test = df_all[df_all['typ']==1]

cleaned_train_df = df_train.drop(columns=['id', 'FloodProbability', 'typ'], axis=1)
cleaned_target_df = df_train[['FloodProbability']]

feats = list(cleaned_train_df.columns)

## Neural Network Tuning ##

In [15]:
def r2_score_metric(y_true, y_pred):
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - SS_res/(SS_tot + K.epsilon())

In [16]:
# split the data into traning and test sets

X_train, X_test, y_train, y_test = train_test_split(cleaned_train_df, cleaned_target_df,
                                                    test_size=0.3, random_state=42)

In [17]:
# standardize the features

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
def build_model(hp):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)))

    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(tf.keras.layers.Dense(
            units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32),
            activation='relu',
            kernel_regularizer=tf.keras.regularizers.l1_l2(
                l1=hp.Float('l1_' + str(i), 1e-5, 1e-2, sampling='LOG'),
                l2=hp.Float('l2_' + str(i), 1e-5, 1e-2, sampling='LOG')
            )
        ))
        model.add(tf.keras.layers.Dropout(hp.Float('dropout_' + str(i), 0.0, 0.5, step=0.1)))

    model.add(tf.keras.layers.Dense(1))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='LOG')
        ),
        loss='mean_squared_error',
        metrics=[r2_score_metric]
    )
    return model

In [19]:
# create a tuner
tuner = kt.RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=2,
    directory='my_dir',
    project_name='intro_to_kt'
)

# search the best hyperparameters
tuner.search(X_train_scaled, y_train, epochs=11, validation_split=0.2)

# get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# evaluate the best model on the test set
y_pred = best_model.predict(X_test_scaled)
r2 = r2_score(y_test, y_pred)
print(f"R^2 score of the best model: {r2}")

Trial 10 Complete [00h 22m 34s]
val_loss: 0.001132278237491846

Best val_loss So Far: 0.0009984005591832101
Total elapsed time: 05h 53m 27s
R^2 score of the best model: 0.8403666355826824


In [20]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters:")
for key, value in best_hps.values.items():
    print(f"{key}: {value}")

Best hyperparameters:
num_layers: 1
units_0: 416
l1_0: 3.54764526934597e-05
l2_0: 2.7442736591932518e-05
dropout_0: 0.1
learning_rate: 0.0035576180732357536
units_1: 416
l1_1: 0.0001537789381718759
l2_1: 2.1917930117674487e-05
dropout_1: 0.1
units_2: 64
l1_2: 1.0787507332043223e-05
l2_2: 0.009471180177237368
dropout_2: 0.1


In [22]:
# Best hyperparameters:
# num_layers: 1
# units_0: 416
# l1_0: 3.54764526934597e-05
# l2_0: 2.7442736591932518e-05
# dropout_0: 0.1
# learning_rate: 0.0035576180732357536
# units_1: 416
# l1_1: 0.0001537789381718759
# l2_1: 2.1917930117674487e-05
# dropout_1: 0.1
# units_2: 64
# l1_2: 1.0787507332043223e-05
# l2_2: 0.009471180177237368
# dropout_2: 0.1

## Evaluate the model ##

In [23]:
def build_fixed_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)))
    model.add(tf.keras.layers.Dense(242, activation='relu'))
    model.add(tf.keras.layers.Dense(12, activation='relu'))
    model.add(tf.keras.layers.Dense(1))

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mean_squared_error',
        metrics=[r2_score_metric]
    )
    return model

# Build and compile the model
fixed_model = build_fixed_model()

# Print the summary of the model
fixed_model.summary()

# Train the model
fixed_model.fit(X_train_scaled, y_train, epochs=5, validation_split=0.2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 242)               16940     
                                                                 
 dense_3 (Dense)             (None, 12)                2916      
                                                                 
 dense_4 (Dense)             (None, 1)                 13        
                                                                 
Total params: 19869 (77.61 KB)
Trainable params: 19869 (77.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x785eee226ef0>

In [24]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Evaluate the model on the test set
y_pred = fixed_model.predict(X_test_scaled)

# Calculate and print detailed metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("R² Score of the fixed model:", r2)
print("Mean Squared Error of the fixed model:", mse)
print("Mean Absolute Error of the fixed model:", mae)

R² Score of the fixed model: 0.8622455633519024
Mean Squared Error of the fixed model: 0.00035813541027602386
Mean Absolute Error of the fixed model: 0.014830387113466032


In [25]:
df_test = pd.read_csv("/content/drive/MyDrive/test.csv")

In [27]:
df_test = getFeats(df_test)
df_test.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,cnt_6,cnt_7,cnt_8,cnt_9,cnt_10,cnt_11,cnt_12,cnt_13,cnt_14,cnt_15
0,1117957,-0.445272,0.510818,-0.946087,0.026267,0.509928,1.003513,1.455758,1.00205,1.473275,...,4,3,3,0,0,0,0,0,0,0
1,1117958,-0.445272,-0.444239,-1.428778,1.975015,0.029578,0.032282,-0.458748,1.00205,0.026378,...,0,3,0,1,0,0,0,0,0,0
2,1117959,-1.904208,-0.921767,0.501985,0.026267,0.990279,-1.424565,-0.458748,0.518517,-0.455921,...,3,2,1,1,0,0,0,0,0,0
3,1117960,-1.417896,-0.444239,-0.463396,0.513454,-0.450772,0.032282,-0.458748,-0.932084,-0.455921,...,3,2,1,0,0,0,0,0,0,0
4,1117961,0.527351,-0.921767,-1.428778,-0.46092,0.509928,-0.453334,0.019878,0.034983,-0.93822,...,4,1,1,0,0,0,0,0,0,0


In [28]:
df_test = df_test.drop(columns=['id'], axis=1)

In [29]:
X_test = scaler.transform(df_test)

In [30]:
predictions = fixed_model.predict(X_test)

